## Load Data from CSVs

In [1]:
import unicodecsv

with open('enrollments.csv','rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader) 
    
        
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [2]:
type(enrollments)

list

In [3]:
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [4]:
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.

with open('daily_engagement.csv','rb') as f : 
    reader = unicodecsv.DictReader(f)
    daily_engagement = list(reader)

daily_engagement[0]


#project_submissions = 

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [5]:
with open('project_submissions.csv','rb') as f : 
    reader =unicodecsv.DictReader(f)
    project_submissions = list(reader)

project_submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [6]:
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [7]:
project_submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [8]:
#make reader function file to make a work easier
def read_csv(filename):
    with open(filename,'rb') as f : 
        reader =unicodecsv.DictReader(f)
        return list(reader)

    
    
enrollments = read_csv("enrollments.csv")



daily_engagement= read_csv("daily_engagement.csv")
project_submissions= read_csv("project_submissions.csv")

daily_engagement[0]
project_submissions[0]



OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Fixing Data Types

In [9]:
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [10]:
enrollments[4]

OrderedDict([('account_key', '448'),
             ('status', 'current'),
             ('join_date', datetime.datetime(2015, 3, 10, 0, 0)),
             ('cancel_date', None),
             ('days_to_cancel', None),
             ('is_udacity', True),
             ('is_canceled', False)])

In [11]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [12]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Investigating the Data

In [13]:
#####################################
#                 2                 #
#####################################

## Find the total number of rows and the number of unique students (account keys)
## in each table.

In [14]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [15]:
enrollments.head()  #This is not a dataframe so I can't use head fuction or info module

AttributeError: 'list' object has no attribute 'head'

In [16]:
#make dataframe function
def change2DF(data):
    return pd.DataFrame(data)

In [17]:
enrollmentsDF = change2DF(enrollments)
engagementDF   =change2DF(daily_engagement)
project_subDF  =change2DF(project_submissions)

In [18]:
enrollmentsDF.info()     # I can check total number by Rangeindex 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1640 entries, 0 to 1639
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   account_key     1640 non-null   object        
 1   status          1640 non-null   object        
 2   join_date       1640 non-null   datetime64[ns]
 3   cancel_date     988 non-null    datetime64[ns]
 4   days_to_cancel  988 non-null    float64       
 5   is_udacity      1640 non-null   bool          
 6   is_canceled     1640 non-null   bool          
dtypes: bool(2), datetime64[ns](2), float64(1), object(2)
memory usage: 67.4+ KB


In [19]:
enrollmentsDF

Unnamed: 0,account_key,status,join_date,cancel_date,days_to_cancel,is_udacity,is_canceled
0,448,canceled,2014-11-10,2015-01-14,65.0,True,True
1,448,canceled,2014-11-05,2014-11-10,5.0,True,True
2,448,canceled,2015-01-27,2015-01-27,0.0,True,True
3,448,canceled,2014-11-10,2014-11-10,0.0,True,True
4,448,current,2015-03-10,NaT,,True,False
...,...,...,...,...,...,...,...
1635,1176,current,2015-08-12,NaT,,False,False
1636,1110,current,2015-08-13,NaT,,False,False
1637,1116,canceled,2015-08-15,2015-08-18,3.0,False,True
1638,874,current,2015-08-22,NaT,,False,False


In [20]:
engagementDF.info()
project_subDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136240 entries, 0 to 136239
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   acct                   136240 non-null  object        
 1   utc_date               136240 non-null  datetime64[ns]
 2   num_courses_visited    136240 non-null  int64         
 3   total_minutes_visited  136240 non-null  float64       
 4   lessons_completed      136240 non-null  int64         
 5   projects_completed     136240 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   creation_date     3642 non-null   datetime64[ns]
 1   completion_date   3636 non-null   datetime64[ns]
 2   as

In [21]:
enrollmentsDF['account_key'].dtypes #object
type(enrollmentsDF['account_key'])   #series

pandas.core.series.Series

In [22]:
enrollmentsDF['account_key'].astype({'account_key':'int'}).dtypes

dtype('int64')

In [23]:
enrollmentsDF['account_key'].dtypes

dtype('O')

In [24]:
enrollmentsDF['account_key'].values

array(['448', '448', '448', ..., '1116', '874', '686'], dtype=object)

In [25]:
#count down unique keys
"""
unique_enrolled_students =set()
for enrollment in enrollmentsDF:
    unique_enrolled_students.add(int(enrollment['account_key']))
    
""" 

"\nunique_enrolled_students =set()\nfor enrollment in enrollmentsDF:\n    unique_enrolled_students.add(int(enrollment['account_key']))\n    \n"

In [26]:
#count down unique key
unique_enrolled_students =set()
for enrollment in enrollmentsDF["account_key"]:
    unique_enrolled_students.add(enrollment)
len(unique_enrolled_students)

1302

In [27]:
# unique_enrolled_students  setobj

In [28]:
#acct - daily engagement 
#account_key proj_submisssions

unique_engagement_students =set()
for engagement in engagementDF["acct"]:
    unique_engagement_students.add(engagement)

unique_proj_students =set()
for proj in project_subDF["account_key"]:
    unique_proj_students.add(proj)

In [29]:
print(len(unique_engagement_students))                   
print(len(unique_proj_students))

1237
743


## Problems in the Data

In [30]:
#####################################
#                 3                 #
#####################################

## Rename the "acct" column in the daily_engagement table to "account_key".

IN Dataframeway

In [31]:
## wrong : engagementDF["acct"] = engagementDF["ab"] to rename
##use : df.rename(columns = {'oldnm':'newnm'},index={'old':'new'},inplace = True)
engagementDF.rename(columns={"acct":"account_key"},inplace = True)
engagementDF.head()

Unnamed: 0,account_key,utc_date,num_courses_visited,total_minutes_visited,lessons_completed,projects_completed
0,0,2015-01-09,1,11.679374,0,0
1,0,2015-01-10,2,37.284887,0,0
2,0,2015-01-11,2,53.633746,0,0
3,0,2015-01-12,1,33.48927,0,0
4,0,2015-01-13,1,64.779678,0,0


### IN argument Tableway (change data in Dictionary type)
>it's impossible to rename keyvalue so we need to 'pop' out and putit new one

In [32]:
type(daily_engagement)

list

In [33]:
#daily_engagement['account_key'] = daily_engagement['acct']
#list integers must be a integers

In [34]:
for engagement_record in daily_engagement:
    engagement_record['account_key'] = engagement_record['acct']
    del [engagement_record['acct']]

In [35]:
def get_unique_students(data):
    unique_students = set()
    for data_point in data:
        unique_students.add(data_point['account_key'])
    return unique_students
print("len of enrollments : "+ str(len(enrollments)))
unique_enrolled_students = get_unique_students(enrollments)
print("len of unique enrolled_students:"+str(len(unique_enrolled_students)))
print("len of daily engagement: " + str(len(daily_engagement)))
unique_engagement_students = get_unique_students(daily_engagement)
print("unique engagement students: "+str(len(unique_engagement_students)))
print("len of project_submissions: "+str(len(project_submissions)))
unique_project_submitters = get_unique_students(project_submissions)
print("len of inique project submiitters: "+ str(len(unique_project_submitters)))

len of enrollments : 1640
len of unique enrolled_students:1302
len of daily engagement: 136240
unique engagement students: 1237
len of project_submissions: 3642
len of inique project submiitters: 743


## Missing Engagement Records

In [36]:
#####################################
#                 4                 #
#####################################

## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.
##using pyton breaak method

In [37]:
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [38]:
enrollments[0]['account_key']

'448'

In [39]:
#enrollments['account_key']
#error: list must be integers or slices, not str

In [40]:
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        print( enrollment)
        break


OrderedDict([('account_key', '1219'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 12, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 12, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)])


## Checking for More Problem Records

In [41]:
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.

In [42]:
count = 0 ;
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        count += 1
        
print(count)

71


## 71 is the number of originally surprising data points 
 that is, the number of enrollments with no corresponding engagement data. 
 How many of those also stayed enrolled at least 1 day?

In [43]:
sub1 = enrollments[0]['cancel_date'] - enrollments[0]['join_date']
type(sub1)

#wrong way to think we need a data of least 1day.

#  sub = enrollment['cancel_date']-enrollments['join_date']
#   if sub == 1:
#       count +=1 >>>>>>>>>>>> not propre think more easy focus on data meaning 

datetime.timedelta

In [44]:
count = 0 

for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students \
    and enrollment['join_date'] != enrollment['cancel_date']:
        count += 1

count

3

In [45]:
count = 0 

for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students \
    and enrollment['join_date'] != enrollment['cancel_date']:
        count += 1
        print(enrollment)

OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 1, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 3, 10, 0, 0)), ('days_to_cancel', 59), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 3, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 6, 17, 0, 0)), ('days_to_cancel', 99), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1101'), ('status', 'current'), ('join_date', datetime.datetime(2015, 2, 25, 0, 0)), ('cancel_date', None), ('days_to_cancel', None), ('is_udacity', True), ('is_canceled', False)])


In [46]:
print(enrollment)

OrderedDict([('account_key', '686'), ('status', 'current'), ('join_date', datetime.datetime(2015, 8, 23, 0, 0)), ('cancel_date', None), ('days_to_cancel', None), ('is_udacity', False), ('is_canceled', False)])


## Tracking Down the Remaining Problems

In [47]:
# Create a set of the account keys for all Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [48]:
udacity_test_accounts
#output show us acccountkey

{'1069', '1101', '1304', '312', '448', '818'}

#### udacit_test_accounts :: only have 'is_udacity' are true

In [49]:
# Given some data with an account_key field, removes any records corresponding to Udacity test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [50]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print(len(non_udacity_enrollments))
print(len(non_udacity_engagement))
print(len(non_udacity_submissions))

1622
135656
3634


## Refining the Question

In [51]:
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.


paid_students = {}
#make dict type
for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7 :
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        paid_students[account_key] = enrollment_date
    
len(paid_students)

995

#### Same student can enroll in multiple time

In [52]:
paid_students = {}

for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7 :
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        paid_students[account_key] = enrollment_date
        
        if account_key not in paid_students or \
                    enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
        
        
len(paid_students)

995

## Getting Data from First Week

In [53]:
# Takes a student's join date and the date of a specific engagement record,
# and returns True if that engagement record happened within one week
# of the student joining.
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7

In [54]:
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point['account_key'] in paid_students:
            new_data.append(data_point)
    return new_data

In [55]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement  = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

In [56]:
print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

1293
134549
3618


In [57]:
# within_one_week(paid_students['join_date'],paid_students['engagement_date'])
#wrong way to use error for ' joindate' key error in dict 

In [58]:
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.

paid_engagement_in_first_week = []

for engagement_record in paid_engagement: 
    account_key = engagement_record['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record['utc_date']
    
    if within_one_week(join_date,engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)

len(paid_engagement_in_first_week)

17210

In [59]:
print(paid_engagement_in_first_week[0])

OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)), ('num_courses_visited', 1), ('total_minutes_visited', 11.6793745), ('lessons_completed', 0), ('projects_completed', 0), ('account_key', '0')])


## Exploring Student Engagement

In [67]:
# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.

from collections import defaultdict

engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

In [66]:
# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes

In [69]:
total_minutes = total_minutes_by_account.values()

In [72]:
total_minutes = np.array(total_minutes)

In [74]:
#np.mean(total_minutes)
#unsupported operand because of  python3 

In [75]:
total_minutes = list(total_minutes_by_account.values())

In [78]:
np.mean(total_minutes)

551.3825094979703

In [76]:
np.std(total_minutes)

964.4304772897452

In [77]:
np.min(total_minutes)

0.0

In Py3, range and dict.key() require the same extra touch.
np.mean first tries to convert the input to an array, 
but with values() that isn't what we want. It makes a single item object array containing this whole object.

in stack overflow

## Debugging Data Analysis Code

In [None]:
#####################################
#                 8                 #
#####################################

## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.

## Lessons Completed in First Week

In [None]:
#####################################
#                 9                 #
#####################################

## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.

## Number of Visits in First Week

In [None]:
######################################
#                 10                 #
######################################

## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.

## Splitting out Passing Students

In [None]:
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.

subway_project_lesson_keys = ['746169184', '3176718735']

passing_engagement =
non_passing_engagement =

## Comparing the Two Student Groups

In [None]:
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).

## Making Histograms

In [None]:
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.

## Improving Plots and Sharing Findings

In [None]:
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.