In [109]:
#CSVs in Python
# Representing a CSV as a list of rows

#Option 1: Each row is a list
csv = [['A1','A2','A3'],
      ['B2', 'B2', 'B3']]

#Option 2: 
csv = [{'name1': 'A1', 'name2': 'A2', 'name3': 'A3'},
      {'name1': 'B1', 'name2': 'B2', 'name3': 'B3'}]

#better to use libraries, for example unicodecsv

In [110]:
#CSVs in Python
import unicodecsv

enrollments = []
f = open('enrollments.csv', 'rb')
reader = unicodecsv.DictReader(f)

for row in reader:
    enrollments.append(row)
    
f.close()
enrollments[2]

{'account_key': '448',
 'cancel_date': '2015-01-27',
 'days_to_cancel': '0',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2015-01-27',
 'status': 'canceled'}

In [111]:
#CSVs in Python
#Shorter version of above, using with

import unicodecsv

with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)
    
enrollments[2]

{'account_key': '448',
 'cancel_date': '2015-01-27',
 'days_to_cancel': '0',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2015-01-27',
 'status': 'canceled'}

In [112]:
#CSVs in Python
#Reading in the other files, own code
import unicodecsv

with open('daily_engagement.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    daily_engagement = list(reader)
    
with open('project_submissions.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    project_submissions = list(reader)
    
print(daily_engagement[0])
print(project_submissions[0])

{'lessons_completed': '0.0', 'projects_completed': '0.0', 'total_minutes_visited': '11.6793745', 'utc_date': '2015-01-09', 'acct': '0', 'num_courses_visited': '1.0'}
{'processing_state': 'EVALUATED', 'completion_date': '2015-01-16', 'account_key': '256', 'assigned_rating': 'UNGRADED', 'creation_date': '2015-01-14', 'lesson_key': '3176718735'}


In [113]:
#CSVs in Python
#write as function

def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

{'status': 'canceled', 'join_date': '2014-11-10', 'account_key': '448', 'days_to_cancel': '65', 'is_udacity': 'True', 'cancel_date': '2015-01-14', 'is_canceled': 'True'}
{'lessons_completed': '0.0', 'projects_completed': '0.0', 'total_minutes_visited': '11.6793745', 'utc_date': '2015-01-09', 'acct': '0', 'num_courses_visited': '1.0'}
{'processing_state': 'EVALUATED', 'completion_date': '2015-01-16', 'account_key': '256', 'assigned_rating': 'UNGRADED', 'creation_date': '2015-01-14', 'lesson_key': '3176718735'}


In [114]:
#Fixing data types
#taken from L1_Starter_code
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [115]:
print(enrollment['cancel_date'])

None


__Quiz: Questions about Student...__
1. Does time of day for enrollment affect count or frequency of project submissions?
2. Is there a point of daily engagement which noticeably increases or decreases count of project submissions?
3. What is the average lessons completed per project submission?
4. Of students who have submitted at least 1 project, what is the average number of project submissions within 6 months of enrolling?
5. Looking only at students enrolled for at least 3 months, how many have submitted at least 1 project?
6. On average, how long does it take to complete 6 projects?

In [116]:
#Quiz: Investigating the data
enrollments_count = len(enrollments)
projects_count = len(project_submissions)
engagement_count = len(daily_engagement)

print('Enrollment count: {0}, projects count: {1}, engagement count: {2}'.format(enrollments_count,projects_count,engagement_count))

def unique_acc(table,col):
    accounts = []
    for account in table:
        if account[col] not in accounts:
            accounts.append(account[col])
    return len(accounts)

enrollments_unique = unique_acc(enrollments,'account_key')
projects_unique_users = unique_acc(project_submissions,'account_key')
engagement_unique = unique_acc(daily_engagement,'acct')

print('Unique enrollments: {0}, Users with project submissions: {1}, users with some daily activity: {2}'.format(enrollments_unique,projects_unique_users,engagement_unique))

Enrollment count: 1640, projects count: 3642, engagement count: 136240
Unique enrollments: 1302, Users with project submissions: 743, users with some daily activity: 1237


In [117]:
#Same as above, with variable names from course framework

import unicodecsv
import timeit

def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

#enrollments = read_csv('/datasets/ud170/udacity-students/enrollments.csv')
#daily_engagement = read_csv('/datasets/ud170/udacity-students/daily_engagement.csv')
#project_submissions = read_csv('/datasets/ud170/udacity-students/project_submissions.csv')
    
### For each of these three tables, find the number of rows in the table and
### the number of unique students in the table. To find the number of unique
### students, you might want to create a set of the account keys in each table.
start = timeit.default_timer()
def unique_acc(table,col):
    accounts = []
    for account in table:
        if account[col] not in accounts:
            accounts.append(account[col])
    return len(accounts)

enrollment_num_rows = len(enrollments)            # Replace this with your code
enrollment_num_unique_students = unique_acc(enrollments,'account_key')  # Replace this with your code

engagement_num_rows = len(daily_engagement)             # Replace this with your code
engagement_num_unique_students = unique_acc(daily_engagement,'acct')  # Replace this with your code

submission_num_rows = len(project_submissions)             # Replace this with your code
submission_num_unique_students = unique_acc(project_submissions,'account_key')  # Replace this with your code

stop = timeit.default_timer()

print(stop - start)

1.4733701959994505


In [118]:
#Quiz: Investigating the data: answer from course video
import timeit
start = timeit.default_timer()

len(enrollments)

unique_enrolled_students = set()
for enrollment in enrollments:
    unique_enrolled_students.add(enrollment['account_key'])
len(unique_enrolled_students)

len(daily_engagement)

unique_engagement_students = set()
for engagement_record in daily_engagement:
    unique_engagement_students.add(engagement_record['acct'])
len(unique_engagement_students)

len(project_submissions)

unique_project_submitters = set()
for submission in project_submissions:
    unique_project_submitters.add(submission['account_key'])
len(unique_project_submitters)

stop = timeit.default_timer()

print(stop - start)

0.051765757001703605


In [119]:
#Quiz: Problems in the data
#Unifying name for account key in all tables (dictionaries)

print(daily_engagement[3])#['acct'])

for rec in daily_engagement:
    rec['account_key'] = rec['acct']
    del[rec['acct']]



{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 33.4892696667, 'utc_date': datetime.datetime(2015, 1, 12, 0, 0), 'acct': '0', 'num_courses_visited': 1}


In [120]:
#Missing engagement reports
print(type(unique_enrolled_students))

for missing in unique_engagement_students:
    if missing not in unique_enrolled_students:
        print(missing + " from engagement")
        break

for missing in unique_enrolled_students:
    if missing not in unique_engagement_students:
        if missing not in ('799','926','870','1079'):
            print(missing + " from enrolled")
            break


<class 'set'>
766 from enrolled


In [121]:
for enrolled in enrollments:
    if enrolled['account_key'] in ('799','926','870','1079'):
        print(enrolled)
        
# def thebest():
#   entries = [enr['key2'] for d in list if d['key1']]
#   return len(entries), sum(entries)

canceled = int()
not_canceled = int()

for i in enrollments:
    if i['status'] == 'canceled':
        canceled += 1
    else:
        not_canceled += 1

print(canceled,not_canceled)

{'status': 'canceled', 'join_date': datetime.datetime(2015, 4, 6, 0, 0), 'account_key': '926', 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 4, 6, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 2, 6, 0, 0), 'account_key': '799', 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 2, 6, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 5, 12, 0, 0), 'account_key': '870', 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 5, 12, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 2, 9, 0, 0), 'account_key': '1079', 'days_to_cancel': 0, 'is_udacity': False, 'cancel_date': datetime.datetime(2015, 2, 9, 0, 0), 'is_canceled': True}
988 652


In [122]:
canceled = int()
not_canceled = int()

for i in enrollments:
    if i['is_canceled']:
        canceled += 1
    else:
        not_canceled += 1

print(canceled,not_canceled)

def counter(table,dic,val):
    count_true = int()
    count_false = int()
    for i in table:
        if i[dic] == val:
            count_true += 1
        else:
            count_false += 1
    print('Count of {0}:{1} is {2}. Count of other values for {0} is {3}'.format(dic,val,count_true,count_false))
    del count_true,count_false
    
counter(enrollments,'days_to_cancel',0)

988 652
Count of days_to_cancel:0 is 92. Count of other values for days_to_cancel is 1548


In [123]:
#quiz checking for more problems
missing_students = unique_enrolled_students - unique_engagement_students

print(len(missing_students))

count_true = int()
count_false = int()

for act in missing_students:
    for miss in enrollments:
        if act == miss['account_key']:
            if miss['days_to_cancel'] == 0:
                count_true += 1
            else:
                count_false += 1

print('Count of days_to_cancel:0 is {0}. Count of other values for days_to_cancel is {1}'.format(count_true,count_false))

del count_true,count_false
            
            
            

65
Count of days_to_cancel:0 is 68. Count of other values for days_to_cancel is 3


In [124]:
for act in missing_students:
    for miss in enrollments:
        if act == miss['account_key']:
            if miss['days_to_cancel'] != 0:
                print(miss)

{'status': 'current', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'account_key': '1101', 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'is_canceled': False}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'account_key': '1304', 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'account_key': '1304', 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'is_canceled': True}


In [125]:
#Quiz: answer from video
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students and 
            enrollment['join_date'] != enrollment['cancel_date']):
        print(enrollment)
        num_problem_students += 1

num_problem_students

{'status': 'canceled', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'account_key': '1304', 'days_to_cancel': 59, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'account_key': '1304', 'days_to_cancel': 99, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'is_canceled': True}
{'status': 'current', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'account_key': '1101', 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'is_canceled': False}


3

In [126]:
testers = list()

for tester in enrollments:
    if tester['is_udacity']:
        print(tester)
        testers.append(tester)

print(len(testers))

{'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 10, 0, 0), 'account_key': '448', 'days_to_cancel': 65, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 5, 0, 0), 'account_key': '448', 'days_to_cancel': 5, 'is_udacity': True, 'cancel_date': datetime.datetime(2014, 11, 10, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2015, 1, 27, 0, 0), 'account_key': '448', 'days_to_cancel': 0, 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 1, 27, 0, 0), 'is_canceled': True}
{'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 10, 0, 0), 'account_key': '448', 'days_to_cancel': 0, 'is_udacity': True, 'cancel_date': datetime.datetime(2014, 11, 10, 0, 0), 'is_canceled': True}
{'status': 'current', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'account_key': '448', 'days_to_cancel': None, 'is_udacity': True, 'cancel_date

In [127]:
#above we see there's test users in the dataset. Writing function to remove them.
test_accounts = set()

for i in testers:
    test_accounts.add(i['account_key'])

def remove_udacity(table):
    new = list()
    for row in table:
        if row['account_key'] not in test_accounts:
            new.append(row)
    return new
            
print(test_accounts)

{'1304', '1101', '818', '1069', '312', '448'}


In [128]:
#removing test users
enrollments_non_udacity = remove_udacity(enrollments)
project_submissions_non_udacity = remove_udacity(project_submissions)
daily_engagement_non_udacity = remove_udacity(daily_engagement)

print(len(enrollments))
print(len(enrollments_non_udacity))
print(enrollments[4])
print(enrollments_non_udacity[6])

print(len(project_submissions))
print(len(project_submissions_non_udacity))
print(project_submissions[4])
print(project_submissions_non_udacity[4])

print(len(daily_engagement))
print(len(daily_engagement_non_udacity))
print(daily_engagement[4])
print(daily_engagement_non_udacity[4])


1640
1622
{'status': 'current', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'account_key': '448', 'days_to_cancel': None, 'is_udacity': True, 'cancel_date': None, 'is_canceled': False}
{'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 10, 0, 0), 'account_key': '1300', 'days_to_cancel': 6, 'is_udacity': False, 'cancel_date': datetime.datetime(2014, 11, 16, 0, 0), 'is_canceled': True}
3642
3634
{'processing_state': 'EVALUATED', 'completion_date': datetime.datetime(2015, 3, 3, 0, 0), 'account_key': '434', 'assigned_rating': 'INCOMPLETE', 'creation_date': datetime.datetime(2015, 2, 17, 0, 0), 'lesson_key': '3176718735'}
{'processing_state': 'EVALUATED', 'completion_date': datetime.datetime(2015, 3, 3, 0, 0), 'account_key': '434', 'assigned_rating': 'INCOMPLETE', 'creation_date': datetime.datetime(2015, 2, 17, 0, 0), 'lesson_key': '3176718735'}
136240
135656
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 64.7796776667, 'account_key': '0', 'ut

In [129]:
#Quiz: refining the question
'''
Exploration question: How do numbers in the daily engagement table differ for students who pass their first project?
Revision after identifying Q problem areas: Only look at engagement from first week,
and exclude students who cancel within a week
'''

#Creating table (dictionary). Conditions: have not canceled or have stayed enrolled more than 7 days.
#Keys account_key, values enrollment_date
paid_students = {}

for paid in enrollments_non_udacity:
    if paid['cancel_date'] == None or paid['days_to_cancel'] > 7:
        account_key = paid['account_key']
        enrollment_date = paid['join_date']
        #if statement to ensure we're keeping the latest enrollment_date
        if account_key not in paid_students or paid_students[account_key] < enrollment_date:
            paid_students[account_key] = enrollment_date
        
print(len(paid_students))

count = 0
for rec in paid_students:
    count += 1 
    print(rec,paid_students[rec])
    if co == 5:
        break
        

995
175 2015-05-09 00:00:00
800 2015-03-04 00:00:00
696 2014-11-10 00:00:00
825 2015-05-09 00:00:00
1048 2015-03-11 00:00:00


In [152]:
paid_students[1]

KeyError: 1

In [130]:
#Quiz: getting data from first week
stop = timeit.default_timer()

def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 and time_delta.days >= 0

engagement_first_week = []

for row in paid_students:
    for day in daily_engagement_non_udacity:
        if row == day['account_key']:
            if within_one_week(paid_students[row],day['utc_date']):
                engagement_first_week.append(day)

print(len(daily_engagement_non_udacity))
print(len(engagement_first_week))
print(engagement_first_week[1:10])

stop = timeit.default_timer()

print(stop - start)

135656
6919
[{'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 265.835024167, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 10, 0, 0), 'num_courses_visited': 1}, {'lessons_completed': 2, 'projects_completed': 0, 'total_minutes_visited': 287.549442, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 11, 0, 0), 'num_courses_visited': 1}, {'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 550.245353667, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 12, 0, 0), 'num_courses_visited': 1}, {'lessons_completed': 2, 'projects_completed': 0, 'total_minutes_visited': 365.714609667, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 13, 0, 0), 'num_courses_visited': 2}, {'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 352.5950265, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 14, 0, 0), 'num_courses_visited': 2}, {'lessons_completed': 0, 'projects_completed': 

In [177]:
#Quiz: getting data from first week, answer from video. Written here to test performance vs own solution

start = timeit.default_timer()

def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point['account_key'] in paid_students:
            new_data.append(data_point)
    return new_data

paid_enrollments = remove_free_trial_cancels(enrollments_non_udacity)
paid_engagement = remove_free_trial_cancels(daily_engagement_non_udacity)
paid_submissions = remove_free_trial_cancels(project_submissions_non_udacity)

for rec in paid_engagement:
    if rec['lessons_completed'] > 0:
        rec['has_visited'] = 1
    elif rec['lessons_completed'] == 0:
        rec['has_visited'] = 0

print('paid_enrollments:',len(paid_enrollments))
print('paid_engagement:',len(paid_engagement))
print('paid_engagement:',len(paid_submissions))

paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    account_key = engagement_record['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record['utc_date']

    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)

print(len(daily_engagement_non_udacity))
print(len(engagement_first_week))
print(engagement_first_week[1:10])

stop = timeit.default_timer()

print(stop - start)

paid_enrollments: 1293
paid_engagement: 134549
paid_engagement: 3618
135656
6919
[{'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 265.835024167, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 10, 0, 0), 'has_visited': 1, 'num_courses_visited': 1}, {'lessons_completed': 2, 'projects_completed': 0, 'total_minutes_visited': 287.549442, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 11, 0, 0), 'has_visited': 1, 'num_courses_visited': 1}, {'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 550.245353667, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 12, 0, 0), 'has_visited': 1, 'num_courses_visited': 1}, {'lessons_completed': 2, 'projects_completed': 0, 'total_minutes_visited': 365.714609667, 'account_key': '175', 'utc_date': datetime.datetime(2015, 5, 13, 0, 0), 'has_visited': 1, 'num_courses_visited': 2}, {'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 352.5950265, 'acco

In [179]:
has_count = 0
has_not_count = 0
for rec in paid_engagement:
    if rec['has_visited'] == 1:
        has_count += 1
    elif rec['has_visited'] == 0:
        has_not_count += 1
        
print('has visited:',has_count)
print('has not visited:',has_not_count)
print(has_count+has_not_count)
print(len(paid_engagement))

has visited: 12854
has not visited: 121695
134549
134549


In [181]:
#Exploring student engagement, code from video
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

In [182]:
#Exploring student engagement, code from video


# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes

In [183]:
#Exploring student engagement, code from video
#My edit: made total_minutes into list, a necessity in Python 3

import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = list(total_minutes_by_account.values())
print('Mean:', np.mean(total_minutes))
print('Standard deviation:', np.std(total_minutes))
print('Minimum:', np.min(total_minutes))
print('Maximum:', np.max(total_minutes))

Mean: 306.708326753
Standard deviation: 412.996933409
Minimum: 0.0
Maximum: 3564.7332645


We here see that maximum is greater than total number of hours in the week. It's time for debugging.

In [184]:
import operator

sorted_total = sorted(total_minutes_by_account.items(), key=operator.itemgetter(1),reverse=True)

minutes_in_week = 60*24*7
print('Minutes in a week: {0}'.format(minutes_in_week))

for i in sorted_total[0:10]:
    print('ID: {0}    Minutes: {1}   fraction of week: {2}'.format(i[0],round(i[1],2),round((i[1]/minutes_in_week),2)))
    

Minutes in a week: 10080
ID: 163    Minutes: 3564.73   fraction of week: 0.35
ID: 317    Minutes: 2778.32   fraction of week: 0.28
ID: 303    Minutes: 2700.49   fraction of week: 0.27
ID: 359    Minutes: 2530.56   fraction of week: 0.25
ID: 218    Minutes: 2393.54   fraction of week: 0.24
ID: 175    Minutes: 2375.8   fraction of week: 0.24
ID: 140    Minutes: 2050.12   fraction of week: 0.2
ID: 530    Minutes: 1896.75   fraction of week: 0.19
ID: 604    Minutes: 1890.99   fraction of week: 0.19
ID: 171    Minutes: 1887.39   fraction of week: 0.19


In [185]:
sorted_total[1][1]/2

1389.1580723353352

In [186]:
#printing records of suspicious data point (minutes spent exceeds available minutes)
count = 0
for record in paid_engagement_in_first_week:
    if record['account_key'] == '108':
        count += 1
        print(record)
        
print(count)

{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 25.9137858334, 'account_key': '108', 'utc_date': datetime.datetime(2015, 4, 20, 0, 0), 'has_visited': 0, 'num_courses_visited': 2}
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 3.40682316667, 'account_key': '108', 'utc_date': datetime.datetime(2015, 4, 21, 0, 0), 'has_visited': 0, 'num_courses_visited': 1}
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 99.1186611667, 'account_key': '108', 'utc_date': datetime.datetime(2015, 4, 22, 0, 0), 'has_visited': 0, 'num_courses_visited': 2}
{'lessons_completed': 0, 'projects_completed': 2, 'total_minutes_visited': 35.8316206667, 'account_key': '108', 'utc_date': datetime.datetime(2015, 4, 23, 0, 0), 'has_visited': 0, 'num_courses_visited': 1}
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 0.0, 'account_key': '108', 'utc_date': datetime.datetime(2015, 4, 24, 0, 0), 'has_visited': 0, 'num_c

In [187]:
#Checking for duplicate entries of account key 108 in paid_engagements
for entry in paid_engagement:
    if entry['account_key'] == '108':
        print(entry)

{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 50.9938951667, 'account_key': '108', 'utc_date': datetime.datetime(2015, 1, 7, 0, 0), 'has_visited': 0, 'num_courses_visited': 1}
{'lessons_completed': 5, 'projects_completed': 0, 'total_minutes_visited': 688.3034385, 'account_key': '108', 'utc_date': datetime.datetime(2015, 1, 8, 0, 0), 'has_visited': 1, 'num_courses_visited': 2}
{'lessons_completed': 1, 'projects_completed': 0, 'total_minutes_visited': 427.691757667, 'account_key': '108', 'utc_date': datetime.datetime(2015, 1, 9, 0, 0), 'has_visited': 1, 'num_courses_visited': 2}
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 165.6270925, 'account_key': '108', 'utc_date': datetime.datetime(2015, 1, 10, 0, 0), 'has_visited': 0, 'num_courses_visited': 3}
{'lessons_completed': 0, 'projects_completed': 0, 'total_minutes_visited': 0.0, 'account_key': '108', 'utc_date': datetime.datetime(2015, 1, 11, 0, 0), 'has_visited': 0, 'num_courses_

Based on the findings, which is that the function adding records to paid_engagement_first_week is inaccurate, I will change that function.

In [188]:
#Quiz: Lessons completed in first week

#creating function to add total of engagement parameter per student (SQL: grouped by, count())
def total_activity(para):
    total_pairs = {}
    for account_key,amount in engagement_by_account.items():
        total_amount = 0
        for record in amount:
            total_amount += record[para]
        total_pairs[account_key] = total_amount        
    return total_pairs

total_lessons_by_account = total_activity('lessons_completed')    


In [189]:
print(len(total_lessons_by_account))

# Summarize the data about minutes spent in the classroom
#creating function to give descriptive statistics of dictionary
def desc_stats_of_dictionary(dictionary):
    total_list = list(dictionary.values())
    print('Mean:', np.mean(total_list))
    print('Standard deviation:', np.std(total_list))
    print('Minimum:', np.min(total_list))
    print('Maximum:', np.max(total_list))

desc_stats_of_dictionary(total_lessons_by_account)

995
Mean: 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36


In [190]:
#Quiz: Lessons completed in first week, answer from video

from collections import defaultdict

def group_data(data, key_name):
    grouped_data = defaultdict(list)
    for data_point in data:
        key = data_point[key_name]
        grouped_data[key].append(data_point)
    return grouped_data

engagement_by_account = group_data(paid_engagement_in_first_week,
                                   'account_key')

def sum_grouped_items(grouped_data, field_name):
    summed_data = {}
    for key, data_points in grouped_data.items():
        total = 0
        for data_point in data_points:
            total += data_point[field_name]
        summed_data[key] = total
    return summed_data

total_minutes_by_account = sum_grouped_items(engagement_by_account,
                                             'total_minutes_visited')

import numpy as np

def describe_data(data):
    print('Mean:', np.mean(data))
    print('Standard deviation:', np.std(data))
    print('Minimum:', np.min(data))
    print ('Maximum:', np.max(data))

describe_data(total_minutes_by_account.values())
#Then she called the functions she created to analyze the lessons completed in the first week as follows:

lessons_completed_by_account = sum_grouped_items(engagement_by_account,
                                                 'lessons_completed')
describe_data(lessons_completed_by_account.values())


SyntaxError: invalid syntax (<ipython-input-190-bc6aba49644f>, line 30)

In [192]:
count = 0
for rec in engagement_by_account:
    for day in engagement_by_account[rec]:
        if day['lessons_completed'] > 0:
            count += 1    

print(count)

1075


In [195]:
#def desc_stats_of_dictionary(dictionary):
#    total_list = list(dictionary.values())
#    print('Mean:', np.mean(total_list))
#    print('Standard deviation:', np.std(total_list))
#    print('Minimum:', np.min(total_list))
#    print('Maximum:', np.max(total_list))

desc_stats_of_dictionary(total_activity('has_visited'))

Mean: 1.08040201005
Standard deviation: 1.47596075148
Minimum: 0
Maximum: 7


In [148]:
def sum_grouped_items(grouped_data, field_name):
    summed_data = {}
    for key, data_points in grouped_data.items():
        total = 0
        for data_point in data_points:
            total += data_point[field_name]
        summed_data[key] = total
    return summed_data

In [196]:
days_with_lesson_completed = sum_grouped_items(engagement_by_account,'has_visited')

In [207]:
engagement_by_account['995']

[]

In [217]:
sum(1 for x in days_with_lesson_completed.values() if x > 0)

489

In [220]:
print(sum(1 for x in enrollments if x['is_udacity'] == True))
print(sum(1 for x in enrollments if x['is_udacity'] == False))

len(enrollments)

18
1622


1640

In [205]:
def print_10_rows(dictionary):
    count = 0
    for key in dictionary:
        print(key,':',dictionary[key])
        count += 1
        if count > 10:
            break

In [206]:
print_10_rows(days_with_lesson_completed)

995
175 : 6
800 : 1
277 : 2
694 : 1
825 : 0
1048 : 0
887 : 0
670 : 2
55 : 2
1001 : 0
125 : 5


In [198]:
desc_stats_of_dictionary(days_with_lesson_completed)

Mean: 1.08040201005
Standard deviation: 1.47596075148
Minimum: 0
Maximum: 7


In [216]:
summa = 0
for key in days_with_lesson_completed:
    summa += days_with_lesson_completed[key]
    
print(summa)

zero,one,two,three,four,five,six,seven = 0,0,0,0,0,0,0,0

for key in days_with_lesson_completed:
   if days_with_lesson_completed[key] == 0:
    zero += 1
   elif days_with_lesson_completed[key] == 1: 
    one += 1
   elif days_with_lesson_completed[key] == 2: 
    two += 1
   elif days_with_lesson_completed[key] == 3: 
    three += 1
   elif days_with_lesson_completed[key] == 4: 
    four += 1
   elif days_with_lesson_completed[key] == 5: 
    five += 1
   elif days_with_lesson_completed[key] == 6: 
    six += 1
   elif days_with_lesson_completed[key] == 7: 
    seven += 1
    
print(zero,one,two,three,four,five,six,seven)

1075/489

1075
506 217 110 65 59 24 11 3


2.198364008179959

In [147]:
engagement_by_account['743']

[{'account_key': '743',
  'has_visited': 0,
  'lessons_completed': 0,
  'num_courses_visited': 0,
  'projects_completed': 0,
  'total_minutes_visited': 0.0,
  'utc_date': datetime.datetime(2014, 12, 9, 0, 0)},
 {'account_key': '743',
  'has_visited': 0,
  'lessons_completed': 0,
  'num_courses_visited': 0,
  'projects_completed': 0,
  'total_minutes_visited': 0.0,
  'utc_date': datetime.datetime(2014, 12, 10, 0, 0)},
 {'account_key': '743',
  'has_visited': 0,
  'lessons_completed': 0,
  'num_courses_visited': 0,
  'projects_completed': 0,
  'total_minutes_visited': 0.0,
  'utc_date': datetime.datetime(2014, 12, 11, 0, 0)},
 {'account_key': '743',
  'has_visited': 0,
  'lessons_completed': 0,
  'num_courses_visited': 0,
  'projects_completed': 0,
  'total_minutes_visited': 0.0,
  'utc_date': datetime.datetime(2014, 12, 12, 0, 0)},
 {'account_key': '743',
  'has_visited': 0,
  'lessons_completed': 0,
  'num_courses_visited': 0,
  'projects_completed': 0,
  'total_minutes_visited': 0.0,
