In [53]:
#CSVs in Python
#write as function
import unicodecsv

def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

{'is_canceled': 'True', 'account_key': '448', 'join_date': '2014-11-10', 'days_to_cancel': '65', 'is_udacity': 'True', 'cancel_date': '2015-01-14', 'status': 'canceled'}
{'projects_completed': '0.0', 'num_courses_visited': '1.0', 'acct': '0', 'utc_date': '2015-01-09', 'total_minutes_visited': '11.6793745', 'lessons_completed': '0.0'}
{'creation_date': '2015-01-14', 'lesson_key': '3176718735', 'account_key': '256', 'processing_state': 'EVALUATED', 'completion_date': '2015-01-16', 'assigned_rating': 'UNGRADED'}


In [54]:
#Fixing data types
#taken from L1_Starter_code
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

In [55]:
# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

{'account_key': '448',
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_canceled': True,
 'is_udacity': True,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

In [56]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

{'acct': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [57]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

__Quiz: Questions about Student...__
Questions to answer during analysis
1. Does time of day for enrollment affect count or frequency of project submissions?
2. Is there a point of daily engagement which noticeably increases or decreases count of project submissions?
3. What is the average lessons completed per project submission?
4. Of students who have submitted at least 1 project, what is the average number of project submissions within 6 months of enrolling?
5. Looking only at students enrolled for at least 3 months, how many have submitted at least 1 project?
6. On average, how long does it take to complete 6 projects?

In [58]:
#Quiz: Investigating the data: counts unique
def unique_acc(table,col):
    accounts = []
    for account in table:
        if account[col] not in accounts:
            accounts.append(account[col])
    return len(accounts)

enrollment_num_rows = len(enrollments)            
enrollment_num_unique_students = unique_acc(enrollments,'account_key')  

engagement_num_rows = len(daily_engagement)             
engagement_num_unique_students = unique_acc(daily_engagement,'acct')  

submission_num_rows = len(project_submissions)             
submission_num_unique_students = unique_acc(project_submissions,'account_key')  

print('Enrollment count: {0}, projects count: {1}, engagement count: {2}'.format(
enrollment_num_rows,engagement_num_rows,submission_num_rows))

print('Unique enrollments: {0}, Users with project submissions: {1}, \
users with some daily activity: {2}'.format(
enrollment_num_rows,engagement_num_rows,submission_num_rows))


Enrollment count: 1640, projects count: 136240, engagement count: 3642
Unique enrollments: 1640, Users with project submissions: 136240, users with some daily activity: 3642


In [59]:
##Quiz: Investigating the data: add unique tables
def unique_table(table,variable):
    tmp_table = set()
    for record in table:
        tmp_table.add(record[variable])
    return tmp_table

unique_enrolled_students = unique_table(enrollments,'account_key')
unique_engagement_students = unique_table(daily_engagement,'acct')
unique_project_submitters = unique_table(project_submissions,'account_key')

print('unique students:',len(unique_enrolled_students))
print('unique engaged students:',len(unique_engagement_students))
print('unique project submitters:',len(unique_project_submitters))

unique students: 1302
unique engaged students: 1237
unique project submitters: 743


In [63]:
#Quiz: Problems in the data
#Unifying name for account key in all tables (dictionaries)

print(daily_engagement[3]['acct'])

for rec in daily_engagement:
    rec['account_key'] = rec['acct']
    del[rec['acct']]

print(daily_engagement[3]['account_key'])

0
0


In [73]:
#Quiz: Missing engagement reports: Looking at some missing records

missing_enrolled = []

for missing in unique_enrolled_students:
    if missing not in unique_engagement_students:
        missing_enrolled.append(missing)
        
print(missing_enrolled[0:10])

for missing in missing_enrolled[0:5]:
    for match in enrollments:
        if match['account_key'] == missing:
            print(match)

['802', '1171', '870', '733', '1086', '997', '1044', '1241', '1101', '727']
{'is_canceled': True, 'account_key': '802', 'join_date': datetime.datetime(2015, 1, 8, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 8, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1171', 'join_date': datetime.datetime(2015, 1, 7, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 7, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '870', 'join_date': datetime.datetime(2015, 5, 12, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 5, 12, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '733', 'join_date': datetime.datetime(2015, 1, 12, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 12, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1086', 'join_date': datetime.datetime(201

In [74]:
#Quiz: Missing engagement reports: Looking further at cancelations

#hypothesis: cancelation on day 0 is not counted as daily engagement
#Starting by checking count of missing not canceled 
#then checking count of canceled not canceled on day 0

not_canceled = []
days_to_cancel_zero = []
canceled_later = []

for missing in missing_enrolled:
    for match in enrollments:
        if match['account_key'] == missing:
            if match['is_canceled'] == False:
                not_canceled.append(match)
            elif match['days_to_cancel'] == 0:
                days_to_cancel_zero.append(match)
            else:
                canceled_later.append(match)

print('Not canceled:',len(not_canceled))
print('Canceled first day:',len(days_to_cancel_zero))
print('Canceled later:',len(canceled_later))

Not canceled: 1
Canceled first day: 68
Canceled later: 2


In [75]:
#Quiz: Missing engagement reports: Looking at non-canceled and later-canceled

print(not_canceled)
print(canceled_later)

[{'is_canceled': False, 'account_key': '1101', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'status': 'current'}]
[{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled'}, {'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled'}]


In [94]:
#Quiz: Missing engagement reports: is_udacity count


#checking count of is_udacity missing engagement records
enrolled_is_udacity = []

for missing in missing_enrolled:
    for enrolled in enrollments:
        if enrolled['account_key'] == missing:
            if enrolled['is_udacity'] == True:
                enrolled_is_udacity.append(missing)
                
print('udacity testers missing from engagement:',len(enrolled_is_udacity))

udacity testers missing from engagement: 4


In [95]:
#Quiz: Missing engagement reports: is_udacity records

print(enrolled_is_udacity)


['1101', '1069', '1304', '1304']


In [100]:
for record in enrollments:
    if record['account_key'] in ('1101', '1069', '1304', '1304'):
        print(record)

{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1069', 'join_date': datetime.datetime(2015, 6, 1, 0, 0), 'days_to_cancel': 0, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 1, 0, 0), 'status': 'canceled'}
{'is_canceled': False, 'account_key': '1101', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'status': 'current'}


In [106]:
#Quiz: checking for more problems:
#finding number of suspicious records not canceled on day 1 (answer from video)

num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students and 
            enrollment['join_date'] != enrollment['cancel_date']):
        print(enrollment)
        num_problem_students += 1
        
print(num_problem_students)
#Suspecting it's the same as Udacity records, I am printing those as well
for record in enrollments:
    if record['account_key'] in ('1101', '1069', '1304', '1304'):
        if record['days_to_cancel'] != 0:
            print(record)

{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled'}
{'is_canceled': False, 'account_key': '1101', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'status': 'current'}
3
{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015,

In [107]:
#Tracking down remainining problems
#Removing udacity accounts from data (inspired by video)
udacity_test_accounts = set()

for tester in enrollments:
    if tester['is_udacity'] == True:
        udacity_test_accounts.add(tester['account_key'])
        
print(len(udacity_test_accounts))

#function to remove testers
def remove_testers(table):
    non_testers = list()
    for record in table:
        if record['account_key'] not in udacity_test_accounts:
            non_testers.append(record) 
    return non_testers



6


In [108]:
#calling function to remove testers

non_udacity_enrollments = remove_testers(enrollments)
non_udacity_engagement = remove_testers(daily_engagement)
non_udacity_submissions = remove_testers(project_submissions)

print('enrollments before:',len(enrollments),'after:',len(non_udacity_enrollments))
print('engagement before:',len(daily_engagement),'after:',len(non_udacity_engagement))
print('submissions before:',len(project_submissions),'after:',len(non_udacity_submissions))


enrollments before: 1640 after: 1622
engagement before: 136240 after: 135656
submissions before: 3642 after: 3634


In [115]:
#Quiz: refining the question
'''
Exploration question: How do numbers in the daily engagement table differ for students who pass their first project?
Revision after identifying Q problem areas: Only look at engagement from first week,
and exclude students who cancel within a week
'''

#Creating table (dictionary). Conditions: have not canceled or have stayed enrolled more than 7 days.
#Keys account_key, values enrollment_date
paid_students = {}

for paid in non_udacity_enrollments:
    if paid['cancel_date'] == None or paid['days_to_cancel'] > 7:
        account_key = paid['account_key']
        enrollment_date = paid['join_date']
        #if statement to ensure we're keeping the latest enrollment_date
        if account_key not in paid_students or paid_students[account_key] < enrollment_date:
            paid_students[account_key] = enrollment_date

print(len(paid_students))

count = 0
for rec in paid_students:
    count += 1 
    print(rec,paid_students[rec])
    if count == 5:
        break
        

995
852 2015-05-12 00:00:00
628 2015-06-09 00:00:00
586 2015-03-10 00:00:00
665 2015-05-12 00:00:00
126 2014-11-14 00:00:00


In [119]:
#Quiz: getting data from first week, remove free trials
#half own solution, half video


def remove_free_trials(table):
    no_trials = list()
    for row in table:
        if row['account_key'] in paid_students:
            no_trials.append(row)
    return no_trials
            
paid_enrollments = remove_free_trials(non_udacity_enrollments)
paid_engagement = remove_free_trials(non_udacity_engagement)
paid_submissions = remove_free_trials(non_udacity_submissions)

print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))


1293
134549
3618


In [126]:
import timeit

start = timeit.default_timer()

#Quiz: getting data from first week: first week engagement
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 #and time_delta.days >= 0

engagement_first_week = list()

for row in paid_enrollments:
    for active in paid_engagement:
        if within_one_week(row['join_date'],active['utc_date']):
            engagement_first_week.append(active)
            
stop = timeit.default_timer()

print(stop-start)

#too slow, abandoning

130.36402567400364


In [130]:
print(len(engagement_first_week))
print(len(paid_engagement))

55887489
134549


In [145]:
type(udacity_test_accounts)

set

In [150]:
#function to print 5 rows
def five_rows(table):
    count = 0
    
    if type(table) == dict:
        for row in table:
            count += 1 
            print(row,table[row])
            if count == 5:
                break
    else:
        for row in table:
            count += 1 
            print(row)
            if count == 5:
                break

In [143]:
#Quiz: getting data from first week: first week engagement
#inspired by quick look at lesson answer

def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 #and time_delta.days >= 0

paid_engagement_first_week = list()

for engagement in paid_engagement:
    account_key = engagement['account_key']
    join_date = paid_students[account_key]
    engagement_date = engagement['utc_date']
    
    if within_one_week(join_date,engagement_date):
        paid_engagement_first_week.append(engagement)
        
print(len(paid_engagement_first_week))
five_rows(paid_engagement_first_week)

21508
{'account_key': '0', 'projects_completed': 0, 'num_courses_visited': 1, 'utc_date': datetime.datetime(2015, 1, 9, 0, 0), 'total_minutes_visited': 11.6793745, 'lessons_completed': 0}
{'account_key': '0', 'projects_completed': 0, 'num_courses_visited': 2, 'utc_date': datetime.datetime(2015, 1, 10, 0, 0), 'total_minutes_visited': 37.2848873333, 'lessons_completed': 0}
{'account_key': '0', 'projects_completed': 0, 'num_courses_visited': 2, 'utc_date': datetime.datetime(2015, 1, 11, 0, 0), 'total_minutes_visited': 53.6337463333, 'lessons_completed': 0}
{'account_key': '0', 'projects_completed': 0, 'num_courses_visited': 1, 'utc_date': datetime.datetime(2015, 1, 12, 0, 0), 'total_minutes_visited': 33.4892696667, 'lessons_completed': 0}
{'account_key': '0', 'projects_completed': 0, 'num_courses_visited': 1, 'utc_date': datetime.datetime(2015, 1, 13, 0, 0), 'total_minutes_visited': 64.7796776667, 'lessons_completed': 0}


In [152]:
#Exploring student engagement, code from video
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

NameError: name 'paid_engagement_in_first_week' is not defined

In [153]:
#Exploring student engagement, code from video


# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes

#Exploring student engagement, code from video
#My edit: made total_minutes into list, a necessity in Python 3

import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = list(total_minutes_by_account.values())
print('Mean:', np.mean(total_minutes))
print('Standard deviation:', np.std(total_minutes))
print('Minimum:', np.min(total_minutes))
print('Maximum:', np.max(total_minutes))m