# Import CSV

In [1]:
import unicodecsv

def read_csv(filename):
    with open(filename, 'rb') as file:
        reader = unicodecsv.DictReader(file)
        return list(reader)
    

enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

print enrollments[0]
print daily_engagement[0]
print project_submissions[0]

{u'status': u'canceled', u'is_udacity': u'True', u'is_canceled': u'True', u'join_date': u'2014-11-10', u'account_key': u'448', u'cancel_date': u'2015-01-14', u'days_to_cancel': u'65'}
{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}
{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}


In [2]:
enrollments[0]

{u'account_key': u'448',
 u'cancel_date': u'2015-01-14',
 u'days_to_cancel': u'65',
 u'is_canceled': u'True',
 u'is_udacity': u'True',
 u'join_date': u'2014-11-10',
 u'status': u'canceled'}

In [3]:
daily_engagement[0]

{u'acct': u'0',
 u'lessons_completed': u'0.0',
 u'num_courses_visited': u'1.0',
 u'projects_completed': u'0.0',
 u'total_minutes_visited': u'11.6793745',
 u'utc_date': u'2015-01-09'}

In [4]:
project_submissions[0]

{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': u'2015-01-16',
 u'creation_date': u'2015-01-14',
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

In [5]:
enrollment_num_rows = len(enrollments)
print enrollment_num_rows

1640


# Alter data types

In [6]:
from datetime import datetime as dt

def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
def parse_int(num):
    if num == '':
        return None
    else:
        return int(num)

In [7]:
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])

In [8]:
for engagement in daily_engagement:
    engagement['lessons_completed'] = int(float(engagement['lessons_completed']))
    engagement['num_courses_visited'] = int(float(engagement['num_courses_visited']))
    engagement['projects_completed'] = int(float(engagement['projects_completed']))
    engagement['total_minutes_visited'] = int(float(engagement['total_minutes_visited']))
    engagement['utc_date'] = parse_date(engagement['utc_date'])

In [9]:
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

In [10]:
for engagement in daily_engagement:
    engagement['account_key'] = engagement['acct']
    del[engagement['acct']]

# Find Unique student keys

In [11]:
def find_unique_account_key(entry):
    unique_students = set()
    for item in entry:
        unique_students.add(item['account_key'])
    return unique_students

In [12]:
print len(enrollments)
print len(daily_engagement)
print len(project_submissions)

1640
136240
3642


In [13]:
unique_enrollment_students = find_unique_account_key(enrollments)
unique_engagement_students = find_unique_account_key(daily_engagement)
unique_submission_students = find_unique_account_key(project_submissions)

print len(find_unique_account_key(enrollments))
print len(find_unique_account_key(daily_engagement))
print len(find_unique_account_key(project_submissions))

1302
1237
743


In [14]:
count = 0

for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students and enrollment['join_date'] != enrollment['cancel_date'] :
        count += 1
        print enrollment

print count

{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 1, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), u'days_to_cancel': 59}
{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 3, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), u'days_to_cancel': 99}
{u'status': u'current', u'is_udacity': True, u'is_canceled': False, u'join_date': datetime.datetime(2015, 2, 25, 0, 0), u'account_key': u'1101', u'cancel_date': None, u'days_to_cancel': None}
3


In [15]:
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
print len(udacity_test_accounts)

6


# Remove udacity test accounts

In [16]:
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [17]:
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

len(non_udacity_enrollments)

1622

# Find paying enrollments

In [18]:
paid_students = {}

for enrollment in non_udacity_enrollments:
    if enrollment['days_to_cancel'] is None or enrollment['days_to_cancel'] > 7: 
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        
        if account_key not in paid_students or enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
    
len(paid_students)

995

In [19]:
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 and time_delta.days >= 0

In [20]:
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point['account_key'] in paid_students:
            new_data.append(data_point)
    return new_data

In [21]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

print len(paid_enrollments)
print len(paid_engagement)
print len(paid_submissions)

1293
134549
3618


# Find paid students who have engagement in the first week

In [22]:
paid_engagement_in_first_week = []

for engagement in paid_engagement:
    account_key = engagement['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement['utc_date']
    
    if within_one_week(join_date,engagement_record_date):
        paid_engagement_in_first_week.append(engagement)
        
len(paid_engagement_in_first_week)

6919

# Find engagement of these students, person wise and find average

In [23]:
from collections import defaultdict

engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

In [24]:
total_minutes_by_account = {}

for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes        

In [25]:
total_minutes = total_minutes_by_account.values()

import numpy as np

print 'Mean:', np.mean(total_minutes)
print 'Std dev:', np.std(total_minutes)
print 'Min:', np.min(total_minutes)
print 'Max:', np.max(total_minutes)

Mean: 305.285427136
Std dev: 412.107180939
Min: 0
Max: 3562


In [26]:
student_with_max_minutes = None
max_minutes = 0.0

for student, total_minutes in total_minutes_by_account.items():
    if total_minutes > max_minutes:
        max_minutes = total_minutes
        student_with_max_minutes = student
        
max_minutes

3562

In [27]:
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print engagement_record

{u'lessons_completed': 4, u'num_courses_visited': 4, u'total_minutes_visited': 850, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 9, 0, 0)}
{u'lessons_completed': 6, u'num_courses_visited': 6, u'total_minutes_visited': 872, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 10, 0, 0)}
{u'lessons_completed': 6, u'num_courses_visited': 2, u'total_minutes_visited': 777, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 11, 0, 0)}
{u'lessons_completed': 2, u'num_courses_visited': 1, u'total_minutes_visited': 294, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 12, 0, 0)}
{u'lessons_completed': 1, u'num_courses_visited': 3, u'total_minutes_visited': 471, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 13, 0, 0)}
{u'lessons_completed': 1, u'num_courses_visited': 2, u'total_minutes_visited': 29

# Lessons completed by account key

In [28]:
from collections import defaultdict

lessons_by_account = defaultdict(list)
for lessons_completed in paid_engagement_in_first_week:
    account_key = lessons_completed['account_key']
    lessons_by_account[account_key].append(lessons_completed)

In [29]:
total_lessons_by_account = {}

for account_key, lessons_for_student in lessons_by_account.items():
    total_lessons = 0
    for lessons_completed in lessons_for_student:
        total_lessons += lessons_completed['lessons_completed']
    total_lessons_by_account[account_key] = total_lessons        

In [30]:
total_lessons = total_lessons_by_account.values()

import numpy as np

print 'Mean:', np.mean(total_lessons)
print 'Std dev:', np.std(total_lessons)
print 'Min:', np.min(total_lessons)
print 'Max:', np.max(total_lessons)

Mean: 1.63618090452
Std dev: 3.00256129983
Min: 0
Max: 36


# Making it a function

In [31]:
from collections import defaultdict

def create_dict(table, key):
    new_dictionary = defaultdict(list)
    for item in table:
        set_key = item[key]
        new_dictionary[set_key].append(item)
    return new_dictionary
        
def add_countable_items(new_dictionary, adding_item):
    total_by_account = {}
    for key, value in new_dictionary.items():
        running_total = 0
        for item in value:
            running_total += item[adding_item]
        total_by_account[key] = running_total
    return total_by_account

x = create_dict(paid_engagement_in_first_week, 'account_key')
y = add_countable_items(x, 'lessons_completed')

z = y.values()

import numpy as np

print np.mean(z)

1.63618090452


In [32]:
num_courses_visited_by_account = create_dict(paid_engagement_in_first_week, 'account_key')

total_courses_by_account = {}

for account_key, num_courses in num_courses_visited_by_account.items():
    total_courses = 0
    for item in num_courses:
        if item['num_courses_visited'] > 0:
            total_courses += item['num_courses_visited']
    total_courses_by_account[account_key] = total_courses        

In [33]:
total_courses = total_courses_by_account.values()

import numpy as np

print 'Mean:', np.mean(total_courses)
print 'Std dev:', np.std(total_courses)
print 'Min:', np.min(total_courses)
print 'Max:', np.max(total_courses)

Mean: 3.97889447236
Std dev: 3.5912451871
Min: 0
Max: 25
