# Intro to Data Analysis

In [8]:
import unicodecsv

enrollments_filename = './resources/enrollments.csv'

## Longer version of code (replaced with shorter, equivalent version below)

# enrollments = []
# f = open(enrollments_filename, 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()

with open(enrollments_filename, 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)
    
### Write code similar to the above to load the engagement
### and submission data. The data is stored in files with
### the given filenames. Then print the first row of each
### table to make sure that your code works. You can use the
### "Test Run" button to see the output of your code.

engagement_filename = './resources/daily_engagement.csv'
submissions_filename = './resources/project_submissions.csv'
    
with open(engagement_filename, 'rb') as f:
    reader = unicodecsv.DictReader(f)
    daily_engagement = list(reader)

with open(submissions_filename, 'rb') as f:
    reader = unicodecsv.DictReader(f)
    project_submissions = list(reader)

In [62]:
def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)
        
def count_unique(values, key):
    return len(set([rec[key] for rec in values]))

enrollments = read_csv('./resources/enrollments.csv')
daily_engagement = read_csv('./resources/daily_engagement.csv')
project_submissions = read_csv('./resources/project_submissions.csv')
    
### For each of these three tables, find the number of rows in the table and
### the number of unique students in the table. To find the number of unique
### students, you might want to create a set of the account keys in each table.

enrollment_num_rows = len(enrollments)
enrollment_num_unique_students = count_unique(enrollments, 'account_key')

engagement_num_rows = len(daily_engagement)
engagement_num_unique_students = count_unique(daily_engagement, 'acct')

submission_num_rows = len(project_submissions)
submission_num_unique_students = count_unique(project_submissions, 'account_key')

In [63]:
from datetime import datetime

def set_record_type(data, column, data_type):
    for record in data:
        if record[column]:
            if data_type == 'float':
                record[column] = float(record[column])
            elif data_type == 'integer':
                record[column] = int(float(record[column]))
            elif data_type == 'date':
                record[column] = datetime.strptime(record[column],'%Y-%m-%d')
            elif data_type == 'boolean':
                record[column] = True if record[column] == 'True' else False
        else:
            record[column] = None
    return data

enrollments = set_record_type(enrollments, 'join_date', 'date')
enrollments = set_record_type(enrollments, 'cancel_date', 'date')
enrollments = set_record_type(enrollments, 'days_to_cancel', 'integer')
enrollments = set_record_type(enrollments, 'is_udacity', 'boolean')
enrollments = set_record_type(enrollments, 'is_canceled', 'boolean')

daily_engagement = set_record_type(daily_engagement, 'utc_date', 'date')
daily_engagement = set_record_type(daily_engagement, 'num_courses_visited', 'integer')
daily_engagement = set_record_type(daily_engagement, 'total_minutes_visited', 'float')
daily_engagement = set_record_type(daily_engagement, 'lessons_completed', 'integer')
daily_engagement = set_record_type(daily_engagement, 'projects_completed', 'integer')

project_submissions = set_record_type(project_submissions, 'creation_date', 'date')
project_submissions = set_record_type(project_submissions, 'completion_date', 'date')

In [12]:
enrollment_num_unique_students

1302

In [13]:
engagement_num_unique_students

1237

In [14]:
submission_num_unique_students

743

In [70]:
for rec in daily_engagement:
    rec['account_key'] = rec['acct']
    del[rec['acct']]

In [71]:
def get_unique_students(data):
    return set([rec['account_key'] for rec in data])

In [72]:
enrollment_num_rows = len(enrollments)
enrollment_unique_students = get_unique_students(enrollments)

engagement_num_rows = len(daily_engagement)
engagement_unique_students = get_unique_students(daily_engagement)

submission_num_rows = len(project_submissions)
submission_unique_students = get_unique_students(project_submissions)

In [73]:
daily_engagement[0]

{'account_key': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [74]:
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in engagement_unique_students:
        print(enrollment)
        break

{'is_udacity': False, 'days_to_cancel': 0, 'account_key': '1219', 'join_date': datetime.datetime(2014, 11, 12, 0, 0), 'status': 'canceled', 'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), 'is_canceled': True}


In [75]:
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in engagement_unique_students and enrollment['join_date'] != enrollment['cancel_date']:
        print(enrollment)

{'is_udacity': True, 'days_to_cancel': 59, 'account_key': '1304', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'status': 'canceled', 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'is_canceled': True}
{'is_udacity': True, 'days_to_cancel': 99, 'account_key': '1304', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled', 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'is_canceled': True}
{'is_udacity': True, 'days_to_cancel': None, 'account_key': '1101', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'status': 'current', 'cancel_date': None, 'is_canceled': False}


In [76]:
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity'] == True:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [77]:
def remove_udacity_account(data):
    return [rec for rec in data if rec['account_key'] not in udacity_test_accounts]

In [102]:
enrollments_non_udacity = remove_udacity_account(enrollments)
engagement_non_udacity = remove_udacity_account(daily_engagement)
submission_non_udacity = remove_udacity_account(project_submissions)

print(len(enrollments_non_udacity))
print(len(engagement_non_udacity))
print(len(submission_non_udacity))

1622
135656
3634


In [96]:
paid_students = {}
for enrollment in enrollments_non_udacity:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        student = enrollment['account_key']
        join_date = enrollment['join_date']
        if student not in paid_students or join_date > paid_students[student]:
            paid_students[student] = join_date
            
len(paid_students)

995

In [97]:
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7

In [99]:
def remove_free_trial(data):
    return [rec for rec in data if rec['account_key'] in paid_students]

enrollments_paid = remove_free_trial(enrollments_non_udacity)
engagement_paid = remove_free_trial(engagement_non_udacity)
submission_paid = remove_free_trial(submission_non_udacity)

paid_engagement_in_first_week = []
for rec in engagement_paid:
    student = rec['account_key']
    if within_one_week(paid_students[student], rec['utc_date']):
        paid_engagement_in_first_week.append(rec)
        
len(paid_engagement_in_first_week)

21508

In [104]:
from collections import defaultdict

engagement_by_account = defaultdict(list)
for rec in paid_engagement_in_first_week:
    account_key = rec['account_key']
    engagement_by_account[account_key].append(rec)
    
len(engagement_by_account)

995

In [107]:
total_minutes_by_accounts = {}

for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_accounts[account_key] = total_minutes

In [110]:
total_minutes = list(total_minutes_by_accounts.values())

import numpy as np
print('Mean:', np.mean(total_minutes))
print('Standard deviation:', np.std(total_minutes))
print('Minimum:', np.min(total_minutes))
print('Maximum:', np.max(total_minutes))

Mean: 647.590173826
Standard deviation: 1129.27121042
Minimum: 0.0
Maximum: 10568.1008673
