In [46]:
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
# Load the page-view export into a DataFrame.
# Only the first 10,000 rows are read: testing against the full 6 million+ rows would be too intensive.
# NOTE(review): this is a machine-specific absolute path; it must be adjusted to run elsewhere.
csv_path = '/Users/ethanrowe/Dev/springboard_data/KSU Assessment Page Views Fall 2018.csv'
df = pd.read_csv(csv_path, nrows=10000)

In [48]:
# Eliminate outliers in 'duration' using the 1.5 * interquartile-range (IQR) rule.
# quantile() is called with a scalar so it returns a single number directly;
# calling float() on the one-element Series that quantile([q]) returns is
# deprecated in pandas. Each quartile is also computed only once.
q1 = df['duration'].quantile(0.25)
q3 = df['duration'].quantile(0.75)

IQR = float(q3 - q1)
upper_bound = float(q3 + 1.5 * IQR)
lower_bound = float(q1 - 1.5 * IQR)

# Keep only rows strictly inside the fences (both bounds are exclusive).
# Rows whose duration is NaN fail both comparisons and are therefore dropped too.
df_clean = df[(df['duration'] < upper_bound) & (df['duration'] > lower_bound)]

In [19]:
# Report the cleaned frame's (rows, columns) and whether any cell in it is null.
has_nulls = df_clean.isnull().values.any()
print(df_clean.shape)
print(f"Null Values: {has_nulls}")
#df_clean.to_csv('KSU Assessment Page Views Fall 2018 Clean.csv')

(8846, 13)
Null Values: False


In [52]:
# Determine whether the full dataset contains missing values.
# The file is streamed in 1,000-row chunks; every chunk holding at least one
# null cell is kept in `total` for later inspection.
chunk_reader = pd.read_csv('/Users/ethanrowe/Dev/springboard_data/KSU Assessment Page Views Fall 2018.csv', chunksize=1000)
total = []
for chunk in chunk_reader:
    chunk_has_nulls = chunk.isnull().values.any()
    if chunk_has_nulls:
        total.append(chunk)

# Author's findings from this scan:
#  - The dataset as a whole does contain missing values.
#  - Columns that needed checking: user_id, canvas_assignment_id, canvas_attempt_id, page_ids.
#  - Every column used in our calculations was checked individually and none of them
#    had missing values.

In [110]:
# Compare average durations at two granularities:
#   assignment_df - mean duration per assignment (index: canvas_assignment_id)
#   user_df       - mean duration per (user, assignment) pair
#                   (MultiIndex: user_param_external_user_id, canvas_assignment_id)
# Both frames have a single column named 'Average Duration'.
assignment_df = (
    df_clean.groupby('canvas_assignment_id')['duration']
    .mean()
    .to_frame('Average Duration')
)
print('\nGrouped by Assignment')
# Print the shape of assignment_df itself; the previous reference to `page_df`
# is a name that is never assigned in the visible notebook, so it raised
# NameError on a fresh kernel.
print(assignment_df.shape)

user_df = (
    df_clean.groupby(['user_param_external_user_id', 'canvas_assignment_id'])['duration']
    .mean()
    .to_frame('Average Duration')
)
print('\nGrouped by User')
print(user_df.shape)


#assignment_df.to_csv('KSU Assessment Assignment Duration Averages.csv')
#user_df.to_csv('KSU Assessment Student Views by User.csv')


Grouped by Assignment
(580, 1)

Grouped by User
(1865, 1)


In [111]:
# user_df is good for visualizing the data, but sub_df is easier to work with:
# it holds the same three columns with a plain numeric index, reordered so the
# assignment id comes first, and sorted by canvas_assignment_id so that it lines
# up with assignment_df's index. A stable sort keeps the within-assignment row
# order deterministic.
column_order = ['canvas_assignment_id', 'user_param_external_user_id', 'Average Duration']
sub_df = (
    user_df.reset_index()[column_order]
    .sort_values('canvas_assignment_id', kind='mergesort')
    .reset_index(drop=True)
)

# Split sub_df into a dictionary keyed by canvas_assignment_id. Each value is the
# block of per-user rows for that assignment, re-indexed from 0.
# groupby replaces the previous row-by-row DataFrame.append loop, which
#   * read `index` before it was ever assigned (NameError on a fresh kernel),
#   * rebound the global `df`, overwriting the raw data loaded earlier, and
#   * relied on DataFrame.append, which was removed in pandas 2.0.
# Unlike iterrows(), groupby also keeps the id columns' original dtypes instead
# of upcasting them to float.
dfs = {
    assignment_id: group.reset_index(drop=True)
    for assignment_id, group in sub_df.groupby('canvas_assignment_id', sort=False)
}

# assignment_df and dfs have one-to-one matching keys. For every assignment,
# keep the student rows whose own average duration is at or above (>=) the
# assignment-wide average.
assignment_means = assignment_df['Average Duration']
above_avg_chunks = [
    per_user[per_user['Average Duration'] >= assignment_means.loc[assignment_id]]
    for assignment_id, per_user in dfs.items()
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there is nothing to combine.
if above_avg_chunks:
    above_avg_df = pd.concat(above_avg_chunks)
else:
    above_avg_df = pd.DataFrame(columns=column_order)
#above_avg_df.to_csv('KSU Students with Above Average Assignment Duration.csv')

In [112]:
# Quick look at the result. DataFrame.size is the total number of cells
# (rows x columns), not the row count; it is followed by the first five rows.
print(above_avg_df.size, above_avg_df.head(), sep='\n')

3054
   canvas_assignment_id  user_param_external_user_id  Average Duration
0               40876.0                      37827.0      12831.500000
0               40878.0                      37949.0       9549.583333
3               40878.0                      37251.0      10307.500000
0               40879.0                      38399.0      11579.000000
5               40879.0                      40394.0      16064.857143
