# Lecture 03 - Student Notebook

We recommend using Noto for this lecture tutorial, where we've already installed the dependencies of the pymer4 package and statsmodels.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Import the linear regression model class
from pymer4.models import Lm

# Import the lmm model class
from pymer4.models import Lmer

# Import Gaussian modeling
import statsmodels.formula.api as smf

# Directory (relative path) that the CSV input file is read from
DATA_DIR = "./../data"

### Predicting inter-week student performance

For the last experiment, we are interested in predicting a user's percentage of correct answers in the next week. For example, based on data from week 1, we predict week 2; based on data from weeks 1 and 2, we predict week 3; and so on.

We first parse the data.

In [4]:
# Read the CSV from the data directory, then order the rows by user and,
# within each user, by week (with a fresh 0..n-1 index).
csv_path = DATA_DIR + '/time_series_extended_fc.csv'
df_byweek = pd.read_csv(csv_path)
df_byuser = df_byweek.sort_values(['user', 'week'], ignore_index=True)
display(df_byuser)

Unnamed: 0,week,user,ch_num_sessions,ch_time_in_prob_sum,ch_time_in_video_sum,ch_ratio_clicks_weekend_day,ch_total_clicks_weekend,ch_total_clicks_weekday,bo_delay_lecture,bo_reg_peak_dayhour,...,mu_frequency_action_relative_video_pause,wa_num_subs,wa_num_subs_correct,wa_num_subs_avg,wa_num_subs_perc_correct,la_weekly_prop_watched_mean,la_weekly_prop_interrupted_mean,la_weekly_prop_interrupted_std,la_weekly_prop_replayed_mean,la_frequency_action_video_play
0,0,0,4.0,5682.0,6417.0,8.500000,12.0,102.0,-24339.2,59.161974,...,0.315217,8.0,4.0,1.333333,0.5,0.500000,0.100000,0.0,0.1,0.298246
1,1,0,2.0,0.0,2254.0,0.000000,0.0,47.0,70408.0,72.863958,...,0.170213,0.0,0.0,0.000000,0.0,0.100000,0.000000,0.0,0.0,0.468085
2,2,0,1.0,0.0,344.0,0.000000,6.0,0.0,0.0,10.166159,...,0.166667,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.166667
3,3,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000
4,4,0,3.0,5128.0,3749.0,0.000000,0.0,102.0,49986.0,198.619603,...,0.133333,0.0,0.0,0.000000,0.0,0.142857,0.142857,0.0,0.0,0.274510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,5,298,3.0,0.0,4338.0,0.473684,19.0,9.0,0.0,36.516086,...,0.392857,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.392857
2876,6,298,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000
2877,7,298,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000
2878,8,298,2.0,0.0,1535.0,5.500000,4.0,22.0,0.0,43.796210,...,0.307692,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.423077


Next, we clean the data to ensure minimum activity: we require users to have at least one click each week (can be on the weekend or on the weekdays) during the first part of the course.

In [5]:
# Find all user-weeks with 0 clicks on weekends AND 0 clicks on weekdays
# during the first part of the course (rows with week < 5).
df_first = df_byuser[df_byuser.week < 5]
no_clicks = (df_first.ch_total_clicks_weekend == 0) & (df_first.ch_total_clicks_weekday == 0)
df_zero = df_first[no_clicks]

# Any user with at least one such inactive week is removed entirely.
dropusers = np.unique(df_zero.user)

# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_byuser = df_byuser[~df_byuser.user.isin(dropusers)].copy()
display(df_byuser)

Unnamed: 0,week,user,ch_num_sessions,ch_time_in_prob_sum,ch_time_in_video_sum,ch_ratio_clicks_weekend_day,ch_total_clicks_weekend,ch_total_clicks_weekday,bo_delay_lecture,bo_reg_peak_dayhour,...,mu_frequency_action_relative_video_pause,wa_num_subs,wa_num_subs_correct,wa_num_subs_avg,wa_num_subs_perc_correct,la_weekly_prop_watched_mean,la_weekly_prop_interrupted_mean,la_weekly_prop_interrupted_std,la_weekly_prop_replayed_mean,la_frequency_action_video_play
10,0,1,7.0,326.0,15525.0,5.675000,40.0,227.0,4492.833333,79.842929,...,0.345528,7.0,4.0,1.400000,0.571429,0.600000,0.100000,0.0,0.000000,0.400749
11,1,1,4.0,350.0,8411.0,0.000000,0.0,207.0,14188.375000,83.811517,...,0.405405,9.0,6.0,1.500000,0.666667,0.800000,0.000000,0.0,0.100000,0.391304
12,2,1,5.0,4577.0,8691.0,0.000000,0.0,167.0,159044.666667,134.197784,...,0.430657,9.0,9.0,1.000000,1.000000,1.000000,0.000000,0.0,0.000000,0.359281
13,3,1,4.0,259.0,12055.0,0.000000,0.0,239.0,-44816.600000,198.002219,...,0.374429,9.0,7.0,1.285714,0.777778,0.769231,0.000000,0.0,0.153846,0.359833
14,4,1,3.0,480.0,13235.0,0.000000,0.0,197.0,-37297.285714,101.712755,...,0.413408,8.0,7.0,1.000000,0.875000,1.000000,0.285714,0.0,0.285714,0.390863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2835,5,293,2.0,9315.0,0.0,0.513514,37.0,19.0,0.000000,50.837230,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2836,6,293,3.0,86.0,549.0,4.333333,3.0,13.0,0.000000,22.565604,...,0.384615,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.312500
2837,7,293,3.0,3675.0,0.0,0.000000,0.0,41.0,0.000000,29.120580,...,0.000000,19.0,12.0,1.583333,0.631579,0.000000,0.000000,0.0,0.000000,0.000000
2838,8,293,5.0,10956.0,0.0,0.000000,0.0,53.0,0.000000,20.598358,...,0.000000,13.0,11.0,1.181818,0.846154,0.000000,0.000000,0.0,0.000000,0.000000


Now, we need to re-format the data frame such that we have the outcome measure of every week as a separate column in the same dataframe.

In [9]:
# Shift percentage correct: for every user, the outcome of week w+1 is stored
# on the row of week w. The last week of each user has no next week -> NaN.
# .assign returns a new frame, which avoids SettingWithCopyWarning when
# df_byuser is a filtered slice of another frame.
df_byuser = df_byuser.assign(
    percentage_correct_next=df_byuser.groupby('user')['wa_num_subs_perc_correct'].shift(-1)
)

# NOTE(review): the original cell called
#     df_byuser.drop('wa_num_subs_perc_correct', axis=1)
# without using the result, so it never removed anything. The dead call is
# removed here and the current-week column is kept, which matches what the
# notebook actually did. If the column should be excluded from the features,
# assign the result of drop() back to df_byuser instead.

# Position of the outcome column; every column before it is treated as a feature.
loc_outcome = df_byuser.columns.get_loc('percentage_correct_next')

In [10]:
def aggregate_weeks(df, loc):
    """
    Aggregate, for every week, the data of that week and all previous weeks.

    For each week w from 0 to the largest value in the 'week' column, the
    feature columns of all rows with week <= w are averaged per user. The
    resulting row is labelled with week w and joined with the
    'percentage_correct_next' value of that (user, week) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user', 'week' and 'percentage_correct_next'.
    loc : int
        Column position; the columns before this position (df.iloc[:, :loc])
        are averaged. They must include 'user' and 'week'.

    Returns
    -------
    pandas.DataFrame
        One row per (user, week) with the cumulative per-user means and the
        matching 'percentage_correct_next', with a fresh integer index.
    """
    df_without = df.iloc[:, :loc]
    df_with = df.loc[:, ['user', 'week', 'percentage_correct_next']]

    # int() also accepts a float-typed week column, which range() would reject.
    max_weeks = int(df_without['week'].max())

    # Collect the per-week aggregates and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # a loop copies all accumulated rows on every iteration.
    weekly_frames = []
    for week_num in range(max_weeks + 1):
        df_aux = df_without[df_without['week'] <= week_num].groupby('user', as_index=False).mean()
        # The averaged 'week' value is meaningless; label the row with the
        # last week included in the average instead.
        df_aux['week'] = week_num
        weekly_frames.append(df_aux)
    df_weeks = pd.concat(weekly_frames)

    # Attach the outcome of the matching (user, week) row.
    df_weeks = df_weeks.set_index(['user', 'week']).join(df_with.set_index(['user', 'week']))
    df_weeks = df_weeks.reset_index()

    return df_weeks

In [11]:
# Build the cumulative per-week table, then show its first rows ordered by
# user and week.
df_weeks = aggregate_weeks(df=df_byuser, loc=loc_outcome)
df_users = df_weeks.sort_values(['user', 'week'], ignore_index=True)
df_users.head()

Unnamed: 0,user,week,ch_num_sessions,ch_time_in_prob_sum,ch_time_in_video_sum,ch_ratio_clicks_weekend_day,ch_total_clicks_weekend,ch_total_clicks_weekday,bo_delay_lecture,bo_reg_peak_dayhour,...,wa_num_subs,wa_num_subs_correct,wa_num_subs_avg,wa_num_subs_perc_correct,la_weekly_prop_watched_mean,la_weekly_prop_interrupted_mean,la_weekly_prop_interrupted_std,la_weekly_prop_replayed_mean,la_frequency_action_video_play,percentage_correct_next
0,1,0,7.0,326.0,15525.0,5.675,40.0,227.0,4492.833333,79.842929,...,7.0,4.0,1.4,0.571429,0.6,0.1,0.0,0.0,0.400749,0.666667
1,1,1,5.5,338.0,11968.0,2.8375,20.0,217.0,9340.604167,81.827223,...,8.0,5.0,1.45,0.619048,0.7,0.05,0.0,0.05,0.396027,1.0
2,1,2,5.333333,1751.0,10875.666667,1.891667,13.333333,200.333333,59241.958333,99.284077,...,8.333333,6.333333,1.3,0.746032,0.8,0.033333,0.0,0.033333,0.383778,0.777778
3,1,3,5.0,1378.0,11170.5,1.41875,10.0,210.0,33227.31875,123.963612,...,8.5,6.5,1.296429,0.753968,0.792308,0.025,0.0,0.063462,0.377792,0.875
4,1,4,4.6,1198.4,11583.4,1.135,8.0,207.4,19122.397857,119.513441,...,8.4,6.6,1.237143,0.778175,0.833846,0.077143,0.0,0.107912,0.380406,0.583333


We then split the data into a train/test data set.

In [12]:
# Order the rows by week and keep only those that have a next-week outcome.
df_ex = df_weeks.sort_values('week').dropna(subset=['percentage_correct_next'])

# shuffle=False keeps the row order, so the first 80% of the week-ordered
# rows form the training set and the last 20% form the test set.
df_train, df_test = train_test_split(
    df_ex,
    test_size=0.2,
    random_state=0,
    shuffle=False,
)

We can now directly feed the dataframe into our regression model with the following formula: 

```percentage_correct_next ~ (1|week) + ch_time_in_prob_sum```



In [14]:
import requests

# SECURITY NOTE(review): the lines below download Python source code from a
# remote server and execute it inside this kernel. Only run this cell if you
# trust the server; whatever it returns runs with your permissions.
helper_response = requests.get(
    "https://courdier.pythonanywhere.com/get-send-code",
    timeout=30,  # fail instead of hanging forever if the server is unreachable
)
# Stop with a clear HTTP error rather than exec-ing an error page as Python.
helper_response.raise_for_status()
exec(helper_response.content)

# Session settings; presumably read by the downloaded helper code (not
# verifiable from this notebook). The sender name is asked for interactively.
npt_config = {
    'session_name': 'lecture-03',
    'session_owner': 'mlbd-2022',
    'sender_name': input("Your name: "),
}

Your name:  fdafdasfdas


In [16]:
# YOUR TURN: implement the model using lmer. Try to choose the optimal model family. 
# Note: you can directly feed in the training data frame created in the
# train/test split cell above (using data=df_train)

# Build the model

# Fit the model

# Predict and compute the rmse

### Share the rmse with us
# Placeholder: replace the empty string with the RMSE you computed.
# NOTE(review): `send` is not defined in this notebook; presumably it is
# provided by the helper code executed in the setup cell - confirm.
rmse1 = ""
send(rmse1, 1) 

<Response [200]>

**Add more features to the regression model or build a regression model with completely different features.** Provide a hypothesis about why you decided to use your features. Why do you think they are important?

In [18]:
# Placeholder: replace the example text with your own reasoning.
rq = """ 
This is an example reasoning of: (1) which features you decided to add, (2) your hypothesis regarding these features.
"""

### Share it with us
# NOTE(review): `send` is not defined in this notebook; presumably it is
# provided by the helper code executed in the setup cell - confirm.
send(rq, 2) 

# Build the new model

# Fit the model

# Predict and compute the rmse

### Share the rmse with us
# Placeholder: replace the empty string with the RMSE of the new model.
rmse2 = ""
send(rmse2, 3) 

<Response [200]>

**Did your features prove to be useful?**

In [17]:
### Write your explanation briefly as a string
# Placeholder: replace the example text with your own explanation.
rq = """ 
This is an example explanation of whether your feature was useful.
"""

### Share it with us
# NOTE(review): `send` is not defined in this notebook; presumably it is
# provided by the helper code executed in the setup cell - confirm.
send(rq, 4) 


<Response [200]>