## 1: Load data from a CSV file

In [1]:
import unicodecsv

# File names of the three input tables.
enrollments_filename = 'enrollments.csv'
engagement_filename = 'daily_engagement.csv'
submissions_filename = 'project_submissions.csv'

# Load each table the long way: open the file in binary mode, wrap it in a
# DictReader, and pull every row into a list before the file is closed.

with open(enrollments_filename, 'rb') as enrollments_file:
    enrollments = list(unicodecsv.DictReader(enrollments_file))

with open(engagement_filename, 'rb') as engagement_file:
    daily_engagement = list(unicodecsv.DictReader(engagement_file))

with open(submissions_filename, 'rb') as submissions_file:
    project_submissions = list(unicodecsv.DictReader(submissions_file))

# Show the first row of each table.
print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

{u'status': u'canceled', u'is_udacity': u'True', u'is_canceled': u'True', u'join_date': u'2014-11-10', u'account_key': u'448', u'cancel_date': u'2015-01-14', u'days_to_cancel': u'65'}
{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}
{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}


In [2]:
# Merging code above into a single function

def read_csv(filename):
    """Read the CSV file at `filename` and return its rows as a list.

    Each row is whatever unicodecsv.DictReader yields for it. The rows are
    collected while the file is still open, because leaving the `with`
    block closes the file.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

# Reload all three tables through the helper, in the same order as before.
enrollments, daily_engagement, project_submissions = map(
    read_csv,
    (enrollments_filename, engagement_filename, submissions_filename)
)

# Show the first row of each table.
print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

{u'status': u'canceled', u'is_udacity': u'True', u'is_canceled': u'True', u'join_date': u'2014-11-10', u'account_key': u'448', u'cancel_date': u'2015-01-14', u'days_to_cancel': u'65'}
{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}
{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}


## 2: Fix data types

In [3]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a Python datetime object.

    Returns None when `date` is the empty string, i.e. no date was given.
    """
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None

def parse_maybe_int(i):
    """Convert a string that is either empty or an integer literal.

    Returns None for the empty string and int(i) for anything else.
    """
    return None if i == '' else int(i)

In [4]:
# Convert each enrollment row's string fields to native Python types in place.
for enrollment in enrollments:
    # Date columns: parse_date turns '' into None.
    for date_column in ('cancel_date', 'join_date'):
        enrollment[date_column] = parse_date(enrollment[date_column])
    # Flag columns become booleans by comparing against the text 'True'.
    for flag_column in ('is_canceled', 'is_udacity'):
        enrollment[flag_column] = enrollment[flag_column] == 'True'
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])

enrollments[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [5]:
# Convert each engagement row's string fields to native Python types in place.
for engagement_record in daily_engagement:
    # The counts arrive as float-formatted text such as '1.0', so they go
    # through float() before being turned into int.
    for count_column in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[count_column] = int(float(engagement_record[count_column]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]

{u'acct': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [6]:
# Parse the two date columns of every submission row in place.
for submission in project_submissions:
    for date_column in ('completion_date', 'creation_date'):
        submission[date_column] = parse_date(submission[date_column])

project_submissions[0]

{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 u'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

## 3: Investigate the data

1. Find the total number of rows in each table
2. Find the total number of unique students (account keys) in each table

In [7]:
# Row count and number of distinct students in the enrollments table

print(len(enrollments)) # Total number of rows in enrollments table

# Collect every account_key into a set; a set keeps one copy of each value,
# so students who appear in several rows are counted once.
unique_students = {student['account_key'] for student in enrollments}

print(len(unique_students)) # Total number of unique students in enrollments table

1640
1302


In [8]:
# Row count and number of distinct students in the daily_engagement table

print(len(daily_engagement)) # Total number of rows in daily_engagement table

# This table stores the student identifier under 'acct'.
unique_student_engagement = set(entry['acct'] for entry in daily_engagement)

print(len(unique_student_engagement)) # Total number of unique students

136240
1237


In [9]:
# Row count and number of distinct students in the project_submissions table

print(len(project_submissions)) # Total number of rows in project_submissions table

unique_student_submissions = {entry['account_key'] for entry in project_submissions}

print(len(unique_student_submissions)) # Total number of unique students

3642
743


## 4: Rename keys in list of dicts

In [10]:
## Build a separate, cleaned copy of the engagement table whose student
## column is named 'account_key'; daily_engagement itself is left untouched
## so the two can be compared.

# Load csv
daily_engagement2 = read_csv(engagement_filename)

# Clean up the data types in daily_engagement2 and rename its 'acct' key in
# the same pass over the rows.
for entry in daily_engagement2:
    entry['lessons_completed'] = int(float(entry['lessons_completed'])) # Convert string to float then int
    entry['num_courses_visited'] = int(float(entry['num_courses_visited']))
    entry['projects_completed'] = int(float(entry['projects_completed']))
    entry['total_minutes_visited'] = float(entry['total_minutes_visited'])
    entry['utc_date'] = parse_date(entry['utc_date'])
    # pop() removes 'acct' and returns its value in one step, so the value
    # ends up stored under 'account_key' instead.
    entry['account_key'] = entry.pop('acct')

daily_engagement2[0]

{'account_key': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [11]:
# First row of the original table, which still uses the 'acct' key.
daily_engagement[0] # For comparison

{u'acct': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [13]:
# Look up the first row's student identifier under the renamed key.
daily_engagement2[0]['account_key']

u'0'

## 5: Rewrite into a single function

In [14]:
def get_unique_students(data_list):
    """Return the set of distinct 'account_key' values found in `data_list`.

    `data_list` is a list of dictionaries; every dictionary must have an
    'account_key' entry, otherwise a KeyError is raised.
    """
    return {row['account_key'] for row in data_list}

# Cross-check: for each table the helper's count is printed first and the
# count computed by hand in the earlier cells second; each pair should agree.
students_in_enrollments = get_unique_students(enrollments)
print(len(students_in_enrollments))
print(len(unique_students))

students_in_engagement = get_unique_students(daily_engagement2)
print(len(students_in_engagement))
print(len(unique_student_engagement))

students_in_submissions = get_unique_students(project_submissions)
print(len(students_in_submissions))
print(len(unique_student_submissions))

1302
1302
1237
1237
743
743
