In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
quiz_df = pd.read_csv('/content/drive/My Drive/SIADS 699: Capstone/Raw Data/k8_quizzes-2.csv')
prob_df = pd.read_csv('/content/drive/My Drive/SIADS 699: Capstone/Raw Data/k8_probs-2.csv')
mastery_df = pd.read_csv('/content/drive/My Drive/SIADS 699: Capstone/Raw Data/k8_mastery-3.csv')

In [44]:
'''
Cleaning Functions: converting timestamps to pandas datetime format, dropping invalid usernames, convert prob_name to same format as quiz_name, pull grade level from prob_grades_dict
'''
def clean_time_cols(df, time_col):
  df['timestamp'] = pd.to_datetime(df[time_col])
  df['date'] = df['timestamp'].dt.date
  df['time'] = df['timestamp'].dt.time
  return df

invalid_users = ['pmb1', 'gr1', 'gr2', 'gr3', 'gr5', 'gr4', 'gr7', 'gr6', 'gr8', 'jaydenb', 'Zoe', 'pmb112358', 'pmb', 'Wizzieweed', 'Nefoli', 'guest']
def clean_users(df, invalid_users):
  df = df[~df['username'].isin(invalid_users)]
  return df

quiz_probs_dict = {
    "COM1": ['com_gr1', 'com_gr1', 'com_gr1', 'com_gr1', 'com_gr1'],
    "COM2": ['com_gr2', 'com_gr2', 'com_gr2', 'com_gr2', 'com_gr2'],
    "COM3": ["com_w_gr3", "com_f_gr3", "com_w_gr3", "com_f_gr3"],
    "COM4": ["com_w_gr4", "com_f_gr4", "com d_gr4", "com_fd_gr4", "com_conv_fd"],
    "COM5": ["com_fp", "com_dp", "com_conv_fp", "com_conv_dp"],
    "AS": ['as_pv', 'as_rnd', 'as_add', 'as_sub'],
    "AS1": ["as_pv_2D", "as_add_2Dpus10s", "as_add_2D", "as_sub_2Dminus10s", "as_sub_2D"],
    "AS2": ["as_pv_3D", "as_add_3Dplus10s", "as_add_3D", "as_sub_3Dminus10s", "as_sub_3D"],
    "AS3": ["as_pv_4D", "as_rnd_4D", "as_add_4Dplus10s", "as_add_4D", "as_sub_4Dminus10s", "as_sub_4D"],
    "MD": ["md_2x2", "md_1x5", "md_0sx0s", "md_5by1", "md_5by0s", 'md_facts'],
    "MD2": ["md_mult_234510", "md_div_234510", "md_mult_234510", "md_div_234510"],
    "MD3": ["md_mult_6789", "md_div_6789", "md_mult_1D2D", "md_div_3Dby1D"],
    "MD5": ["md_mult_exp", "md_2x2", "md_1x5", "md_0sx0s", "md_5by1", "md_5by0s"],
    "ME": ["me_if", "me_fi", "me_ms", "me_sm", "me_add", "me_sub", "me_mult", "me_div"],
    "AP": ["ap_ra", "ap_rp", "ap_rs", "ap_2ra", "ap_2rp", "ap_2rs"],
    "AP5": ["ap_tri", "ap_tri", "ap_rpv", "ap_rps"],
    "LG3": ["lg_bar", "lg_bar", "lg_bar", "lg_bar", "lg_bar"],
    "LG": ["lg_tg", "lg_gt", "lg_tg", "lg_patt"],
    "LG5": ["lg_lin", "lg_lin_rate", "lg_lin", "lg_lin_rate", "lg_lin", "lg_lin_rate"],
    "FR": ["fr_r", "fr_equiv", "fr_sim", "fr_im", "fr_a", "fr_s", "fr_m", "fr_d"],
    "FR3": ["fr_name", "fr_as_gr3", "fr_name", "fr_as_gr3"],
    "FR5": ["fr_equiv_gr5", "fr_as_gr5", "fr_mult_gr5", "fr_recip", "fr_div_gr5"],
    "DEC": ["dec_r", "dec_a", "dec_s", "dec_m", "dec_d"],
    "DEC5": ["dec_rnd3dp", "dec_as_thou", "dec_as_diffdp", "dec_md_thou", "dec_md_pow10", "dec_md_2sigdig", "dec_pv"],
    "PER": ["per_per", "per_per", "per_per", "per_id", "per_id", "per_id"],
    "OTH5": ["oth_pf", "oth_exp_np", "oth_exp_p", "oth_pf", "oth_exp_np", "oth_exp_p"],
    "PL5": ['as_pv', 'as_rnd', 'as_add', 'as_sub', "me_sub", "me_div", "ap_2ra", "ap_2rp"],
    "COM6": ['com_nn', 'com_nn', 'com_nn', 'com_nn', 'com_nn'],
    "AS6": ['as_add_nn', 'as_sub_nn', 'as_add_nn', 'as_sub_nn', 'as_add_nn', 'as_sub_nn'],
    "AP8": ['ap_pt', 'ap_pt', 'ap_pt', 'ap_pt'],
    "MD6": ["md_mult_nn", "md_div_nn", "md_mult_nn", "md_div_nn"],
    "DEC6": ['dec_div_1sd', 'dec_div_1sd', 'dec_div_1sd', 'dec_div_1sd'],
    "FR6": ['fr_div_f', 'fr_div_f', 'fr_div_f', 'fr_div_f'],
    "PRO6": ['pro_pro', 'pro_pro', 'pro_pro', 'pro_pro'],
    "FOR6": ['for_a_par', 'for_a_trap', 'for_v_cube', 'for_sa_cube', 'for_v_rp', 'for_sa_rp'],
    "FOR7": ['for_speed', 'for_c_circle', 'for_a_circle', 'for_v_pr'],
    "FOR8": ['for_v_pyr', 'for_v_cyl', 'for_v_cone', 'for_sa_sph', 'for_v_sph'],
    "EXP6": ['exp_basic', 'exp_basic', 'exp_basic', 'exp_basic'],
    "EXP7": ['exp_sr_int', 'exp_sr_dec', 'exp_sr_int', 'exp_sr_dec'],
    "EXP8": ['exp_sr_irr', 'exp_simp', 'exp_pl', 'exp_ql', 'exp_nl', 'exp_zl', 'exp_pp'],
    "LE6": ['le_as6', 'e_md6', 'le_as6', 'le_md6', 'le_as6', 'le_md6'],
    "LE7": ['le_as7', 'le_md7', 'le_two7', 'le_three7', 'le_clt7'],
    "LE8": ['le_comp8', 'le_denom8', 'le_comp8', 'le_denom8', 'le_comp8', 'le_denom8'],
    "LI6": ['li_gr', 'li_is6', 'li_gr', 'li_is6'],
    "LI7": ['li_solgr', 'li_is7', 'li_solgr', 'li_is7'],
    "LF7": ['lf_AT1', 'lf_GT', 'lf_AG1', 'lf_TG5', 'lf_TS5', 'lf_GS', 'lf_TS2', 'lf_AS1', 'lf_ST', 'lf_SG'],
    "LF8": ['lf_ES', 'lf_ET', 'lf_EG', 'lf_AE1', 'lf_SE', 'lf_GE', 'lf_TE5', 'lf_TE2', 'lf_TG2', 'lf_TT'],
    "OMS6": ['oms_absval', 'oms_ratio', 'oms_expr', 'oms_expr'],
    "OMS7": ['oms_fact', 'oms_gcf', 'oms_sqfact', 'oms_mults', 'oms_lcm'],
    "GMS6": ['gms_tg', 'gms_gt', 'gms_dist1', 'gms_gt', 'gms_tg', 'gms_hist', 'gms_hist'],
    "GMS7": ['gms_nl', 'gms_nl_mult', 'gms_nl_mult', 'gms_nl_dec', 'gms_nl_fr', 'gms_nl_per'],
    "GMS8": ['gms_dist2', 'gms_scat', 'gms_dist2', 'gms_scat', 'gms_dist2', 'gms_scat'],
    "MF2": ['mf_as'],
    "MF3": ['mf_md']
}

def convert_prob_name(df):
  code_to_category = {code: category for category, codes in quiz_probs_dict.items() for code in codes}
  df["prob_name"] = df["code"].map(code_to_category)
  return df

prob_grades_dict = {
    "COM1": 1,
    "COM2": 2,
    "COM3": 3,
    "COM4": 4,
    "COM5": 5,
    "AS": 4,
    "AS1": 1,
    "AS2": 2,
    "AS3": 3,
    "MD": 4,
    "MD2": 2,
    "MD3": 3,
    "MD5": 5,
    "ME": 4,
    "AP": 4,
    "AP5": 5,
    "LG3": 3,
    "LG": 4,
    "LG5": 5,
    "FR": 4,
    "FR3": 3,
    "FR5": 5,
    "DEC": 4,
    "DEC5": 5,
    "PER": 5,
    "OTH5": 5,
    "PL5": 5,
    "COM6": 6,
    "AS6": 6,
    "AP8": 8,
    "MD6": 6,
    "DEC6": 6,
    "FR6": 6,
    "PRO6": 6,
    "FOR6": 6,
    "FOR7": 7,
    "FOR8": 8,
    "EXP6": 6,
    "EXP7": 7,
    "EXP8": 8,
    "LE6": 6,
    "LE7": 7,
    "LE8": 8,
    "LI6": 6,
    "LI7": 7,
    "LF7": 7,
    "LF8": 8,
    "OMS6": 6,
    "OMS7": 7,
    "GMS6": 6,
    "GMS7": 7,
    "GMS8": 8,
    "MF2": 2,
    "MF3": 3
}

def get_grade_level(df, grade_col, name_col):
  df.loc[df[grade_col].isna(), grade_col] = df[name_col].map(prob_grades_dict)
  return df

In [45]:
'''
Mastery Data Cleaning: clean_time_cols, clean_users, drop unneeded columns and rearrange
'''

mastery_df = clean_time_cols(mastery_df, 'quiz_time')
mastery_df = clean_users(mastery_df, invalid_users)
mastery_df = mastery_df[['username', 'quiz_name', 'category', 'grade', 'timestamp', 'date', 'time']]

In [46]:
'''
Quiz Data Cleaning: clean_time_cols, clean_users, create percent_correct column, drop unneeded columns and rearrange
Merging Quiz and Mastery Data: merge on username and quiz_name, rename columns, create and populate binary mastery column, drop unneeded columns and rearrange
Filling Missing Values: if grade is NaN, pull using get_grade_level
Correct Data Types: convert floats to integers in grade column
'''

quiz_df = clean_time_cols(quiz_df, 'quiz_time')
quiz_df = clean_users(quiz_df, invalid_users)

quiz_df['percent_correct'] = quiz_df['num_correct'] / quiz_df['num_total']

quiz_df = quiz_df[['username', 'quiz_name', 'timestamp', 'date', 'time', 'num_total', 'num_correct', 'percent_correct']]

quiz_df = quiz_df.merge(
    mastery_df,
    on=['username', 'quiz_name'],
    how='left',
    suffixes=('', '_mastery')
)

quiz_df.rename(columns={
    'timestamp_mastery': 'mastery_timestamp',
    'date_mastery': 'mastery_date',
    'time_mastery': 'mastery_time'
}, inplace=True)

quiz_df['mastery'] = (
    (quiz_df['mastery_timestamp'].notna()) &
    (quiz_df['timestamp'] >= quiz_df['mastery_timestamp'])
).astype(int)

quiz_df = quiz_df[['username', 'quiz_name', 'category', 'grade', 'timestamp', 'date', 'time', 'num_total', 'num_correct', 'percent_correct', 'mastery',
                   'mastery_timestamp', 'mastery_date', 'mastery_time']]

quiz_df = get_grade_level(quiz_df, 'grade', 'quiz_name')

quiz_df['grade'] = quiz_df['grade'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quiz_df['percent_correct'] = quiz_df['num_correct'] / quiz_df['num_total']


In [47]:
quiz_df.head()

Unnamed: 0,username,quiz_name,category,grade,timestamp,date,time,num_total,num_correct,percent_correct,mastery,mastery_timestamp,mastery_date,mastery_time
0,CharlotteC14,AS1,AS,1,2024-07-24 11:43:24,2024-07-24,11:43:24,5,5,1.0,0,2024-12-10 14:22:42,2024-12-10,14:22:42
1,CyrusC3,AS2,AS,2,2024-07-24 11:47:02,2024-07-24,11:47:02,5,5,1.0,0,2024-08-28 12:50:34,2024-08-28,12:50:34
2,CharlotteC14,MD3,MD,3,2024-07-24 11:49:29,2024-07-24,11:49:29,4,3,0.75,0,2025-01-28 15:06:53,2025-01-28,15:06:53
3,FoxF2,AS1,AS,1,2024-07-24 11:57:14,2024-07-24,11:57:14,5,5,1.0,0,2025-03-06 15:01:13,2025-03-06,15:01:13
4,KnoxK2,MD,,4,2024-07-24 12:09:47,2024-07-24,12:09:47,5,3,0.6,0,NaT,,


In [48]:
'''
Problem Data Cleaning: clean_time_cols, clean_users, convert_prob_name, create binary correct and practice columns, drop unneeded columns and rearrange
Merging Problem and Mastery Data: merge on username and prob_name, rename columns, create and populate binary mastery column, drop unneeded columns and rearrange
Filling Missing Values: pull grade level from prob
Correct Data Types: convert floats to integers in grade and practice columns
'''

prob_df = clean_time_cols(prob_df, 'prob_time')
prob_df['quiz_time'] = prob_df['quiz_time'].replace('0000-00-00 00:00:00.000000', pd.NaT)
prob_df["quiz_time"] = pd.to_datetime(prob_df["quiz_time"], errors="coerce")

prob_df = clean_users(prob_df, invalid_users)

prob_df = convert_prob_name(prob_df)

prob_df['correct'] = prob_df['correct'].map({'N': 0, 'Y': 1})

for _, row in prob_df.iterrows():
  if pd.isna(row['quiz_time']):
    prob_df.at[_, 'practice'] = int(1)
  else:
    prob_df.at[_, 'practice'] = int(0)

prob_df = prob_df[['username', 'timestamp', 'date', 'time', 'correct', 'prob_name', 'category', 'code', 'quiz_time', 'practice']]

prob_df = prob_df.merge(
    mastery_df,
    left_on=['username', 'prob_name'],
    right_on=['username', 'quiz_name'],
    how='left',
    suffixes=('', '_mastery')
)

prob_df.rename(columns={
    'timestamp_mastery': 'mastery_timestamp',
    'date_mastery': 'mastery_date',
    'time_mastery': 'mastery_time'
}, inplace=True)

prob_df['mastery'] = (
    (prob_df['mastery_timestamp'].notna()) &
    (prob_df['timestamp'] >= prob_df['mastery_timestamp'])
).astype(int)

prob_df = prob_df.drop(columns=['quiz_name'])
prob_df = prob_df[['username', 'prob_name', 'category', 'code', 'grade', 'timestamp', 'date', 'time', 'correct', 'practice', 'quiz_time',
                   'mastery', 'mastery_timestamp', 'mastery_date', 'mastery_time']]

prob_df = get_grade_level(prob_df, 'grade', 'prob_name')

prob_df['grade'] = prob_df['grade'].astype(int)
prob_df['practice'] = prob_df['practice'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["prob_name"] = df["code"].map(code_to_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prob_df['correct'] = prob_df['correct'].map({'N': 0, 'Y': 1})


In [49]:
'''
Problem Data Preprocessing: create probs_before_mastery column, create time_to_mastery column
'''

prob_df = prob_df.sort_values(by=['username', 'prob_name', 'timestamp'])
prob_df['probs_before_mastery'] = prob_df.groupby(['username', 'prob_name']).apply(
    lambda group: (group['timestamp'] < group['mastery_timestamp']).cumsum()
).reset_index(drop=True)
prob_df['probs_before_mastery'] = prob_df['probs_before_mastery'].fillna(0)

first_attempts = prob_df.groupby(['username', 'prob_name'])['timestamp'].transform('min')
prob_df['time_to_mastery'] = prob_df['mastery_timestamp'] - first_attempts
prob_df.loc[prob_df['mastery_timestamp'].isna(), 'time_to_mastery'] = pd.NaT
prob_df['time_to_mastery'] = prob_df['time_to_mastery'].fillna(pd.Timedelta(seconds=0))

  prob_df['probs_before_mastery'] = prob_df.groupby(['username', 'prob_name']).apply(


In [50]:
'''
Merging Problem and Quiz Data: merge on username and quiz_time (timestamp), rename duplicated columns, drop redundant columns and rearrange
Filling NaN Values: treat practice problems as single problem quizzes, fill missing grade levels
Correct Data Types: convert floats to integers in grade, practice, num_correct, and num_total columns
'''

merged_df = prob_df.merge(
    quiz_df,
    left_on=['username', 'quiz_time'],
    right_on=['username', 'timestamp'],
    how='left'
)

merged_df.rename(columns={
    'category_x': 'prob_category',
    'grade_x': 'prob_grade_level',
    'timestamp_x': 'prob_timestamp',
    'date_x': 'prob_date',
    'time_x': 'prob_time',
    'quiz_time': 'quiz_timestamp',
    'mastery_timestamp_x': 'mastery_timestamp',
    'mastery_date_x': 'mastery_date',
    'mastery_time_x': 'mastery_time',
    'mastery_x': 'mastery',
    'category_y': 'quiz_category',
    'grade_y': 'quiz_grade_level'
}, inplace=True)

merged_df = merged_df.drop(columns=['mastery_timestamp_y', 'mastery_date_y', 'mastery_time_y', 'mastery_y', 'timestamp_y', 'time_y', 'date_y'])

merged_df = merged_df[['username', 'prob_name', 'prob_category', 'code', 'prob_grade_level', 'prob_timestamp', 'correct', 'practice', 'mastery', 'mastery_timestamp', 'quiz_name',
                       'quiz_category', 'quiz_grade_level', 'num_total', 'num_correct', 'percent_correct', 'probs_before_mastery', 'time_to_mastery', 'quiz_timestamp']]

merged_df.loc[merged_df['num_correct'].isna(), 'num_correct'] = merged_df['correct']
merged_df.loc[merged_df['num_total'].isna(), 'num_total'] = 1
merged_df.loc[merged_df['percent_correct'].isna(), 'percent_correct'] = merged_df['correct']
merged_df.loc[merged_df['quiz_name'].isna(), 'quiz_name'] = merged_df['prob_name']
merged_df.loc[merged_df['quiz_category'].isna(), 'quiz_category'] = merged_df['prob_category']

merged_df = get_grade_level(merged_df, 'prob_grade_level', 'prob_name')
merged_df.loc[merged_df['quiz_grade_level'].isna(), 'quiz_grade_level'] = merged_df['prob_grade_level']

merged_df['prob_grade_level'] = merged_df['prob_grade_level'].astype(int)
merged_df['practice'] = merged_df['practice'].astype(int)
merged_df['quiz_grade_level'] = merged_df['quiz_grade_level'].astype(int)
merged_df['num_total'] = merged_df['num_total'].astype(int)
merged_df['num_correct'] = merged_df['num_correct'].astype(int)
merged_df['probs_befpre_mastery'] = merged_df['probs_before_mastery'].astype(int)

In [51]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25542 entries, 0 to 25541
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   username              25542 non-null  object         
 1   prob_name             25542 non-null  object         
 2   prob_category         25542 non-null  object         
 3   code                  25542 non-null  object         
 4   prob_grade_level      25542 non-null  int64          
 5   prob_timestamp        25542 non-null  datetime64[ns] 
 6   correct               25542 non-null  int64          
 7   practice              25542 non-null  int64          
 8   mastery               25542 non-null  int64          
 9   mastery_timestamp     20015 non-null  datetime64[ns] 
 10  quiz_name             25542 non-null  object         
 11  quiz_category         25542 non-null  object         
 12  quiz_grade_level      25542 non-null  int64          
 13  n

In [52]:
merged_df.head(10)

Unnamed: 0,username,prob_name,prob_category,code,prob_grade_level,prob_timestamp,correct,practice,mastery,mastery_timestamp,quiz_name,quiz_category,quiz_grade_level,num_total,num_correct,percent_correct,probs_before_mastery,time_to_mastery,quiz_timestamp,probs_befpre_mastery
0,AyaA2,AP,AP,ap_ra,4,2024-10-09 09:31:36,0,0,0,NaT,AP,AP,4,1,0,0.0,309,0 days,2024-10-09 09:30:16,309
1,AyaA2,AP,AP,ap_rp,4,2024-10-09 09:35:21,1,0,0,NaT,AP,AP,4,1,1,1.0,313,0 days,2024-10-09 09:30:16,313
2,AyaA2,AP,AP,ap_ra,4,2024-10-09 09:36:39,1,0,0,NaT,AP,AP,4,6,3,0.5,315,0 days,2024-10-09 09:35:31,315
3,AyaA2,AP,AP,ap_rp,4,2024-10-09 09:39:09,1,0,0,NaT,AP,AP,4,6,3,0.5,317,0 days,2024-10-09 09:35:31,317
4,AyaA2,AP,AP,ap_rs,4,2024-10-09 09:40:47,1,0,0,NaT,AP,AP,4,6,3,0.5,318,0 days,2024-10-09 09:35:31,318
5,AyaA2,AP,AP,ap_2rs,4,2024-10-09 09:46:59,0,0,0,NaT,AP,AP,4,6,3,0.5,325,0 days,2024-10-09 09:35:31,325
6,AyaA2,AP,AP,ap_ra,4,2024-10-15 10:23:50,1,0,0,NaT,AP,AP,4,1,1,1.0,433,0 days,2024-10-15 10:21:48,433
7,AyaA2,AP,AP,ap_rp,4,2024-10-15 10:26:15,1,0,0,NaT,AP,AP,4,1,1,1.0,433,0 days,2024-10-15 10:21:48,433
8,AyaA2,AP,AP,ap_rs,4,2024-10-15 10:26:44,0,0,0,NaT,AP,AP,4,1,0,0.0,433,0 days,2024-10-15 10:21:48,433
9,AyaA2,AP,AP,ap_ra,4,2024-11-18 09:57:55,1,1,0,NaT,AP,AP,4,1,1,1.0,139,0 days,NaT,139


In [53]:
quiz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   username           1659 non-null   object        
 1   quiz_name          1659 non-null   object        
 2   category           1329 non-null   object        
 3   grade              1659 non-null   int64         
 4   timestamp          1659 non-null   datetime64[ns]
 5   date               1659 non-null   object        
 6   time               1659 non-null   object        
 7   num_total          1659 non-null   int64         
 8   num_correct        1659 non-null   int64         
 9   percent_correct    1659 non-null   float64       
 10  mastery            1659 non-null   int64         
 11  mastery_timestamp  1329 non-null   datetime64[ns]
 12  mastery_date       1329 non-null   object        
 13  mastery_time       1329 non-null   object        
dtypes: datet

In [54]:
quiz_df.head(10)

Unnamed: 0,username,quiz_name,category,grade,timestamp,date,time,num_total,num_correct,percent_correct,mastery,mastery_timestamp,mastery_date,mastery_time
0,CharlotteC14,AS1,AS,1,2024-07-24 11:43:24,2024-07-24,11:43:24,5,5,1.0,0,2024-12-10 14:22:42,2024-12-10,14:22:42
1,CyrusC3,AS2,AS,2,2024-07-24 11:47:02,2024-07-24,11:47:02,5,5,1.0,0,2024-08-28 12:50:34,2024-08-28,12:50:34
2,CharlotteC14,MD3,MD,3,2024-07-24 11:49:29,2024-07-24,11:49:29,4,3,0.75,0,2025-01-28 15:06:53,2025-01-28,15:06:53
3,FoxF2,AS1,AS,1,2024-07-24 11:57:14,2024-07-24,11:57:14,5,5,1.0,0,2025-03-06 15:01:13,2025-03-06,15:01:13
4,KnoxK2,MD,,4,2024-07-24 12:09:47,2024-07-24,12:09:47,5,3,0.6,0,NaT,,
5,PaytonP4,MD3,,3,2024-07-24 12:17:07,2024-07-24,12:17:07,4,3,0.75,0,NaT,,
6,PaytonP4,MD2,MD,2,2024-07-24 12:22:16,2024-07-24,12:22:16,2,2,1.0,0,2024-08-06 12:16:04,2024-08-06,12:16:04
7,KnoxK2,MD,,4,2024-07-24 12:19:32,2024-07-24,12:19:32,5,3,0.6,0,NaT,,
8,OarielO,MD2,MD,2,2024-07-24 12:35:26,2024-07-24,12:35:26,2,2,1.0,0,2024-07-31 11:59:27,2024-07-31,11:59:27
9,OarielO,MD2,MD,2,2024-07-24 12:36:18,2024-07-24,12:36:18,2,2,1.0,0,2024-07-31 11:59:27,2024-07-31,11:59:27


In [55]:
prob_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25542 entries, 5691 to 4513
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   username              25542 non-null  object         
 1   prob_name             25542 non-null  object         
 2   category              25542 non-null  object         
 3   code                  25542 non-null  object         
 4   grade                 25542 non-null  int64          
 5   timestamp             25542 non-null  datetime64[ns] 
 6   date                  25542 non-null  object         
 7   time                  25542 non-null  object         
 8   correct               25542 non-null  int64          
 9   practice              25542 non-null  int64          
 10  quiz_time             9389 non-null   datetime64[ns] 
 11  mastery               25542 non-null  int64          
 12  mastery_timestamp     20015 non-null  datetime64[ns] 
 13  mast

In [56]:
prob_df.head(10)

Unnamed: 0,username,prob_name,category,code,grade,timestamp,date,time,correct,practice,quiz_time,mastery,mastery_timestamp,mastery_date,mastery_time,probs_before_mastery,time_to_mastery
5691,AyaA2,AP,AP,ap_ra,4,2024-10-09 09:31:36,2024-10-09,09:31:36,0,0,2024-10-09 09:30:16,0,NaT,,,309,0 days
5695,AyaA2,AP,AP,ap_rp,4,2024-10-09 09:35:21,2024-10-09,09:35:21,1,0,2024-10-09 09:30:16,0,NaT,,,313,0 days
5697,AyaA2,AP,AP,ap_ra,4,2024-10-09 09:36:39,2024-10-09,09:36:39,1,0,2024-10-09 09:35:31,0,NaT,,,315,0 days
5699,AyaA2,AP,AP,ap_rp,4,2024-10-09 09:39:09,2024-10-09,09:39:09,1,0,2024-10-09 09:35:31,0,NaT,,,317,0 days
5700,AyaA2,AP,AP,ap_rs,4,2024-10-09 09:40:47,2024-10-09,09:40:47,1,0,2024-10-09 09:35:31,0,NaT,,,318,0 days
5707,AyaA2,AP,AP,ap_2rs,4,2024-10-09 09:46:59,2024-10-09,09:46:59,0,0,2024-10-09 09:35:31,0,NaT,,,325,0 days
6246,AyaA2,AP,AP,ap_ra,4,2024-10-15 10:23:50,2024-10-15,10:23:50,1,0,2024-10-15 10:21:48,0,NaT,,,433,0 days
6257,AyaA2,AP,AP,ap_rp,4,2024-10-15 10:26:15,2024-10-15,10:26:15,1,0,2024-10-15 10:21:48,0,NaT,,,433,0 days
6258,AyaA2,AP,AP,ap_rs,4,2024-10-15 10:26:44,2024-10-15,10:26:44,0,0,2024-10-15 10:21:48,0,NaT,,,433,0 days
8634,AyaA2,AP,AP,ap_ra,4,2024-11-18 09:57:55,2024-11-18,09:57:55,1,1,NaT,0,NaT,,,139,0 days


In [57]:
'''
Split Data for Training and Testing: split data on prob_time using 2-10-2025 as the cutoff date for testing
'''

merged_df['prob_timestamp'] = pd.to_datetime(merged_df['prob_timestamp'])
merged_train_df = merged_df[merged_df['prob_timestamp'] < pd.to_datetime('2025-02-11')]
merged_test_df = merged_df[merged_df['prob_timestamp'] >= pd.to_datetime('2025-02-11')]

quiz_df['date'] = pd.to_datetime(quiz_df['date'])
quiz_train_df = quiz_df[quiz_df['date'] < pd.to_datetime('2025-02-11')]
quiz_test_df = quiz_df[quiz_df['date'] >= pd.to_datetime('2025-02-11')]

prob_df['timestamp'] = pd.to_datetime(prob_df['timestamp'])
prob_train_df = prob_df[prob_df['timestamp'] < pd.to_datetime('2025-02-11')]
prob_test_df = prob_df[prob_df['timestamp'] >= pd.to_datetime('2025-02-11')]

In [58]:
merged_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3674 entries, 88 to 25539
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   username              3674 non-null   object         
 1   prob_name             3674 non-null   object         
 2   prob_category         3674 non-null   object         
 3   code                  3674 non-null   object         
 4   prob_grade_level      3674 non-null   int64          
 5   prob_timestamp        3674 non-null   datetime64[ns] 
 6   correct               3674 non-null   int64          
 7   practice              3674 non-null   int64          
 8   mastery               3674 non-null   int64          
 9   mastery_timestamp     1893 non-null   datetime64[ns] 
 10  quiz_name             3674 non-null   object         
 11  quiz_category         3674 non-null   object         
 12  quiz_grade_level      3674 non-null   int64          
 13  num_to

In [59]:
merged_test_df.head()

Unnamed: 0,username,prob_name,prob_category,code,prob_grade_level,prob_timestamp,correct,practice,mastery,mastery_timestamp,quiz_name,quiz_category,quiz_grade_level,num_total,num_correct,percent_correct,probs_before_mastery,time_to_mastery,quiz_timestamp,probs_befpre_mastery
88,AyaA2,AP,AP,ap_ra,4,2025-02-18 14:30:19,1,0,0,NaT,AP,AP,4,1,1,1.0,336,0 days 00:00:00,2025-02-18 14:30:04,336
89,AyaA2,AP,AP,ap_rp,4,2025-02-18 14:30:31,0,0,0,NaT,AP,AP,4,1,0,0.0,340,0 days 00:00:00,2025-02-18 14:30:04,340
90,AyaA2,AP,AP,ap_ra,4,2025-03-13 11:45:31,0,1,0,NaT,AP,AP,4,1,0,0.0,20,0 days 00:00:00,NaT,20
91,AyaA2,AP,AP,ap_ra,4,2025-03-19 11:51:19,1,1,0,NaT,AP,AP,4,1,1,1.0,86,0 days 00:00:00,NaT,86
175,AyaA2,AS1,AS,as_pv_2D,1,2025-03-06 15:02:27,1,0,1,2024-11-19 12:59:59,AS1,AS,1,5,5,1.0,23,48 days 01:33:43,2025-03-06 15:02:20,23


In [60]:
quiz_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 163 entries, 1496 to 1658
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   username           163 non-null    object        
 1   quiz_name          163 non-null    object        
 2   category           99 non-null     object        
 3   grade              163 non-null    int64         
 4   timestamp          163 non-null    datetime64[ns]
 5   date               163 non-null    datetime64[ns]
 6   time               163 non-null    object        
 7   num_total          163 non-null    int64         
 8   num_correct        163 non-null    int64         
 9   percent_correct    163 non-null    float64       
 10  mastery            163 non-null    int64         
 11  mastery_timestamp  99 non-null     datetime64[ns]
 12  mastery_date       99 non-null     object        
 13  mastery_time       99 non-null     object        
dtypes: datetime

In [61]:
quiz_test_df.head()

Unnamed: 0,username,quiz_name,category,grade,timestamp,date,time,num_total,num_correct,percent_correct,mastery,mastery_timestamp,mastery_date,mastery_time
1496,CyrusC3,COM5,,5,2025-02-13 09:42:58,2025-02-13,09:42:58,4,4,1.0,0,NaT,,
1497,CyrusC3,COM5,,5,2025-02-13 09:50:12,2025-02-13,09:50:12,4,4,1.0,0,NaT,,
1498,OarielO,COM5,,5,2025-02-13 10:00:06,2025-02-13,10:00:06,4,0,0.0,0,NaT,,
1499,CyrusC3,COM5,,5,2025-02-13 09:59:41,2025-02-13,09:59:41,4,4,1.0,0,NaT,,
1500,CyrusC3,AS,AS,4,2025-02-13 10:04:45,2025-02-13,10:04:45,4,4,1.0,1,2025-01-13 11:13:26,2025-01-13,11:13:26


In [62]:
prob_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3674 entries, 22372 to 25106
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   username              3674 non-null   object         
 1   prob_name             3674 non-null   object         
 2   category              3674 non-null   object         
 3   code                  3674 non-null   object         
 4   grade                 3674 non-null   int64          
 5   timestamp             3674 non-null   datetime64[ns] 
 6   date                  3674 non-null   object         
 7   time                  3674 non-null   object         
 8   correct               3674 non-null   int64          
 9   practice              3674 non-null   int64          
 10  quiz_time             889 non-null    datetime64[ns] 
 11  mastery               3674 non-null   int64          
 12  mastery_timestamp     1893 non-null   datetime64[ns] 
 13  mas

In [63]:
prob_test_df.head()

Unnamed: 0,username,prob_name,category,code,grade,timestamp,date,time,correct,practice,quiz_time,mastery,mastery_timestamp,mastery_date,mastery_time,probs_before_mastery,time_to_mastery
22372,AyaA2,AP,AP,ap_ra,4,2025-02-18 14:30:19,2025-02-18,14:30:19,1,0,2025-02-18 14:30:04,0,NaT,,,336,0 days 00:00:00
22376,AyaA2,AP,AP,ap_rp,4,2025-02-18 14:30:31,2025-02-18,14:30:31,0,0,2025-02-18 14:30:04,0,NaT,,,340,0 days 00:00:00
24403,AyaA2,AP,AP,ap_ra,4,2025-03-13 11:45:31,2025-03-13,11:45:31,0,1,NaT,0,NaT,,,20,0 days 00:00:00
24822,AyaA2,AP,AP,ap_ra,4,2025-03-19 11:51:19,2025-03-19,11:51:19,1,1,NaT,0,NaT,,,86,0 days 00:00:00
23567,AyaA2,AS1,AS,as_pv_2D,1,2025-03-06 15:02:27,2025-03-06,15:02:27,1,0,2025-03-06 15:02:20,1,2024-11-19 12:59:59,2024-11-19,12:59:59,23,48 days 01:33:43


In [66]:
'''
Download Data: merged_df, merged_test_df, quiz_df, quiz_test_df, prob_df, prob_test_df, mastery_df
'''
'''
merged_train_df.to_csv("merged_data.csv", index=False)
merged_test_df.to_csv("merged_test_data.csv", index=False)
mastery_df.to_csv("mastery_data.csv", index=False)
quiz_train_df.to_csv("quiz_data.csv", index=False)
quiz_test_df.to_csv("quiz_test_data.csv", index=False)
prob_train_df.to_csv("prob_data.csv", index=False)
prob_test_df.to_csv("prob_test_data.csv", index=False)
'''
merged_df.to_csv("merged_data_complete.csv", index=False)
quiz_df.to_csv("quiz_data_complete.csv", index=False)
prob_df.to_csv("prob_data_complete.csv", index=False)

from google.colab import files
'''
files.download("merged_data.csv")
files.download("merged_test_data.csv")
files.download("mastery_data.csv")
files.download("quiz_data.csv")
files.download("quiz_test_data.csv")
files.download("prob_data.csv")
files.download("prob_test_data.csv")
'''
files.download("merged_data_complete.csv")
files.download("quiz_data_complete.csv")
files.download("prob_data_complete.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>