In [53]:
#CSVs in Python
#write as function
import unicodecsv

def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

{'is_canceled': 'True', 'account_key': '448', 'join_date': '2014-11-10', 'days_to_cancel': '65', 'is_udacity': 'True', 'cancel_date': '2015-01-14', 'status': 'canceled'}
{'projects_completed': '0.0', 'num_courses_visited': '1.0', 'acct': '0', 'utc_date': '2015-01-09', 'total_minutes_visited': '11.6793745', 'lessons_completed': '0.0'}
{'creation_date': '2015-01-14', 'lesson_key': '3176718735', 'account_key': '256', 'processing_state': 'EVALUATED', 'completion_date': '2015-01-16', 'assigned_rating': 'UNGRADED'}


In [54]:
#Fixing data types
#taken from L1_Starter_code
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

In [55]:
# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

{'account_key': '448',
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_canceled': True,
 'is_udacity': True,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

In [56]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

{'acct': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [57]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

__Quiz: Questions about Student...__
Questions to answer during analysis
1. Does time of day for enrollment affect count or frequency of project submissions?
2. Is there a point of daily engagement which noticeably increases or decreases count of project submissions?
3. What is the average lessons completed per project submission?
4. Of students who have submitted at least 1 project, what is the average number of project submissions within 6 months of enrolling?
5. Looking only at students enrolled for at least 3 months, how many have submitted at least 1 project?
6. On average, how long does it take to complete 6 projects?

In [58]:
#Quiz: Investigating the data: counts unique
def unique_acc(table,col):
    accounts = []
    for account in table:
        if account[col] not in accounts:
            accounts.append(account[col])
    return len(accounts)

enrollment_num_rows = len(enrollments)            
enrollment_num_unique_students = unique_acc(enrollments,'account_key')  

engagement_num_rows = len(daily_engagement)             
engagement_num_unique_students = unique_acc(daily_engagement,'acct')  

submission_num_rows = len(project_submissions)             
submission_num_unique_students = unique_acc(project_submissions,'account_key')  

print('Enrollment count: {0}, projects count: {1}, engagement count: {2}'.format(
enrollment_num_rows,engagement_num_rows,submission_num_rows))

print('Unique enrollments: {0}, Users with project submissions: {1}, \
users with some daily activity: {2}'.format(
enrollment_num_rows,engagement_num_rows,submission_num_rows))


Enrollment count: 1640, projects count: 136240, engagement count: 3642
Unique enrollments: 1640, Users with project submissions: 136240, users with some daily activity: 3642


In [59]:
##Quiz: Investigating the data: add unique tables
def unique_table(table,variable):
    tmp_table = set()
    for record in table:
        tmp_table.add(record[variable])
    return tmp_table

unique_enrolled_students = unique_table(enrollments,'account_key')
unique_engagement_students = unique_table(daily_engagement,'acct')
unique_project_submitters = unique_table(project_submissions,'account_key')

print('unique students:',len(unique_enrolled_students))
print('unique engaged students:',len(unique_engagement_students))
print('unique project submitters:',len(unique_project_submitters))

unique students: 1302
unique engaged students: 1237
unique project submitters: 743


In [63]:
#Quiz: Problems in the data
#Unifying name for account key in all tables (dictionaries)

print(daily_engagement[3]['acct'])

for rec in daily_engagement:
    rec['account_key'] = rec['acct']
    del[rec['acct']]

print(daily_engagement[3]['account_key'])

0
0


In [73]:
#Quiz: Missing engagement reports: Looking at some missing records

missing_enrolled = []

for missing in unique_enrolled_students:
    if missing not in unique_engagement_students:
        missing_enrolled.append(missing)
        
print(missing_enrolled[0:10])

for missing in missing_enrolled[0:5]:
    for match in enrollments:
        if match['account_key'] == missing:
            print(match)

['802', '1171', '870', '733', '1086', '997', '1044', '1241', '1101', '727']
{'is_canceled': True, 'account_key': '802', 'join_date': datetime.datetime(2015, 1, 8, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 8, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1171', 'join_date': datetime.datetime(2015, 1, 7, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 7, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '870', 'join_date': datetime.datetime(2015, 5, 12, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 5, 12, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '733', 'join_date': datetime.datetime(2015, 1, 12, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 1, 12, 0, 0), 'status': 'canceled'}
{'is_canceled': True, 'account_key': '1086', 'join_date': datetime.datetime(201

In [74]:
#Quiz: Missing engagement reports: Looking further at cancelations

#hypothesis: cancelation on day 0 is not counted as daily engagement
#Starting by checking count of missing not canceled 
#then checking count of canceled not canceled on day 0

not_canceled = []
days_to_cancel_zero = []
canceled_later = []

for missing in missing_enrolled:
    for match in enrollments:
        if match['account_key'] == missing:
            if match['is_canceled'] == False:
                not_canceled.append(match)
            elif match['days_to_cancel'] == 0:
                days_to_cancel_zero.append(match)
            else:
                canceled_later.append(match)

print('Not canceled:',len(not_canceled))
print('Canceled first day:',len(days_to_cancel_zero))
print('Canceled later:',len(canceled_later))

Not canceled: 1
Canceled first day: 68
Canceled later: 2


In [75]:
#Quiz: Missing engagement reports: Looking at non-canceled and later-canceled

print(not_canceled)
print(canceled_later)

[{'is_canceled': False, 'account_key': '1101', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'status': 'current'}]
[{'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled'}, {'is_canceled': True, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled'}]


In [94]:
#Quiz: Missing engagement reports: is_udacity and multiples


#checking count of is_udacity missing engagement records
enrolled_is_udacity = []

for missing in missing_enrolled:
    for enrolled in enrollments:
        if enrolled['account_key'] == missing:
            if enrolled['is_udacity'] == True:
                enrolled_is_udacity.append(missing)
                
print('udacity testers missing from engagement:',len(enrolled_is_udacity))

udacity testers missing from engagement: 4
