# Baseline algorithms for early warning systems

In [None]:
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from sklearn.metrics import silhouette_score, confusion_matrix
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta, date
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from scipy.stats import mannwhitneyu
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from datetime import datetime as dt
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import calplot
import time
import math
import json
import sys
import os

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Put the parent directory on the import path; the project packages imported
# in the next cells presumably live one level above this notebook.
parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

In [None]:
from helpers.db_connector import MySQLConnector
from helpers.feature_extraction import *
from helpers.data_process import *
from helpers.db_query import *
from helpers.time import *

In [None]:
from extractors.boroujeni_et_al import BoroujeniEtAl
from extractors.lalle_conati import LalleConati
from extractors.lemay_doleck import LemayDoleck

In [None]:
import warnings
# Silence every warning from here on: no category, message or module filter
# is given, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

## Linear Algebra Dataset from Courseware

Since Fall 2017, a stream of EPFL's Linear Algebra course has been taught in a flipped format. The flipped classroom was introduced incrementally, as described below:

- **Year 2017-2018**: traditional manner (weeks 1-13) - flipped manner (week 14).
- **Year 2018-2019**: traditional manner (weeks 1-4, 10-14) - flipped manner (weeks 5-9).
- **Year 2019-2020**: traditional manner (weeks 1-4) - flipped manner (weeks 5-14).

In [None]:
# Course rounds kept for the analysis; every table below is filtered on its
# 'Round' column with this list.
rounds = [
    'Y2-2018-19',
    'Y3-2019-20',
]

### Identifying Students


The flipped course was offered only to volunteering students. The volunteers were collectively assigned to either the experimental or the control group. Stratified random sampling based on gender and prior background (secondary educational level) was used.

In [None]:
# Load the user table (the IPython %time magic reports how long the call takes).
# NOTE(review): prior_knowledge=True presumably adds the students' prior-background
# information to the table — confirm in the helper that defines getUserInfo.
%time user_data = getUserInfo(prior_knowledge=True)

The initial data of volunteers was cleaned, and some participants were removed before we analyzed the data:
- The volunteering students who have not been graded were removed. 
- The repeating students were filtered out, where repeating students are those accessing videos in two different years. 
- The less active students, i.e., those with fewer than 60 interactions on the platform, were removed.

In [None]:
# Distribution of the user table rows over the course rounds.
sns.displot(data=user_data, x='Round')
plt.ylabel('Number of Students')
plt.show()

Given that the Y1-2017-2018 round included only one week in a flipped classroom setting, we decided to remove the students of that round.  

In [None]:
# Keep only the students enrolled in one of the selected rounds.
in_selected_rounds = user_data['Round'].isin(rounds)
user_data = user_data[in_selected_rounds]

In [None]:
# Distribution of the remaining user table rows over the 'Gender' column.
sns.displot(data=user_data, x='Gender')
plt.ylabel('Number of Students')
plt.show()

This is what our user table looks like.

In [None]:
# Preview the first five rows of the user table.
user_data.head(n=5)

In [None]:
# Report how many students remain after filtering.
f"Hence, we will work with {len(user_data)} students."

### Getting Students' Records

#### Video Clickstream Records

In [None]:
# Load the video events (timed with %time). 'VideoID' is renamed to 'ElementID'
# so that the video and problem tables share the same identifier column and
# can be combined into a single event table later on.
%time video_data = getVideoEventsInfo().rename(columns={'VideoID': 'ElementID'})

In [None]:
# Keep only the video events recorded in one of the selected rounds.
video_in_rounds = video_data['Round'].isin(rounds)
video_data = video_data[video_in_rounds]

This is what our video event table looks like.

In [None]:
# Preview the first five rows of the video event table.
video_data.head(n=5)

In [None]:
# Report how many video events remain after filtering.
f"Hence, we will work with {len(video_data)} video interactions."

#### Problem Clickstream Records

In [None]:
# Load the problem events (timed with %time). 'ProblemID' is renamed to
# 'ElementID' so that the problem and video tables share the same identifier
# column and can be combined into a single event table later on.
%time problem_data = getProblemEventsInfo().rename(columns={'ProblemID': 'ElementID'})

In [None]:
# Keep only the problem events recorded in one of the selected rounds.
problem_in_rounds = problem_data['Round'].isin(rounds)
problem_data = problem_data[problem_in_rounds]

This is what our problem event table looks like.

In [None]:
# Preview the first five rows of the problem event table.
problem_data.head(n=5)

In [None]:
# Report how many problem events remain after filtering.
f"Hence, we will work with {len(problem_data)} problem interactions."

#### Exam Records

In [None]:
# Load the exam records (timed with %time). Later cells read the 'Round',
# 'AccountUserID' and 'Grade' columns of this table.
%time exam_data = getExamInfo()

We filter out the exam records of students who did not participate in the study.

In [None]:
# Keep only the exam records belonging to one of the selected rounds.
exam_in_rounds = exam_data['Round'].isin(rounds)
exam_data = exam_data[exam_in_rounds]

This is what our exam data looks like.

In [None]:
# Preview the first five rows of the exam table.
exam_data.head(n=5)

#### Event Records

In [None]:
# Restrict both tables to the columns they have in common so that their rows
# can be stacked into one event table.
event_columns = ['AccountUserID', 'ElementID', 'TimeStamp', 'EventType', 'Round']
d1 = video_data[event_columns]
d2 = problem_data[event_columns]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat stacks the rows in the same order (video events first, then
# problem events) and, like append, keeps the original row index values.
events = pd.concat([d1, d2])

This is what our joined event table looks like.

In [None]:
# Preview the first five rows of the combined event table.
events.head(n=5)

#### Identify Week

In [None]:
# Number of weeks kept per round: events are later filtered to week ids in
# range(noCourseWeeks), i.e. 0..19.
noCourseWeeks = 20

We load the configuration file (e.g., start and end dates) for each round of the course.

In [None]:
# Load the per-round course configuration. The encoding is given explicitly
# because open() otherwise falls back to a platform-dependent default, which
# can mis-decode a JSON file on some systems.
with open('../config/linear_algebra.json', encoding='utf-8') as f:
    config = json.load(f)

In [None]:
# Configuration entries are keyed by the second-to-last '-'-separated token of
# the round name (e.g. '2018' for 'Y2-2018-19'); show the first round's entry.
first_round_key = rounds[0].split('-')[-2]
config[first_round_key]

We assign each interaction (video or problem) to a specific week of the course, with the first week of each course round having id 0.

In [None]:
# Step 1: render every epoch timestamp as a UTC 'YYYY-MM-DD HH:MM:SS' string.
utc_strings = events['TimeStamp'].apply(
    lambda ts: dt.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
)
# Step 2: hand each string to the project helper string2Datetime
# (presumably parses it into a datetime — confirm in helpers).
events['Date'] = utc_strings.apply(string2Datetime)

In [None]:
# Process each round separately, because every round has its own 'Start'
# entry in the configuration; processWeek receives that round's events, the
# name of the date column and the start value. The later cells rely on a
# 'Week' column being present in its result.
tmp_events = [
    processWeek(
        events[events['Round'] == r],
        'Date',
        config[r.split('-')[-2]]['Start'],
    )
    for r in rounds
]
events = pd.concat(tmp_events).copy()

In [None]:
# Convert every week id with int().
events['Week'] = events['Week'].apply(int)

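Then, we keep only the events whose week id falls in `range(noCourseWeeks)`. Note that `noCourseWeeks` is set to 20 above, so this covers the 14 course weeks plus the weeks that follow them.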

In [None]:
# Drop events whose week id lies outside 0 .. noCourseWeeks - 1.
valid_weeks = range(noCourseWeeks)
events = events[events['Week'].isin(valid_weeks)]

This is what our event table looks like after adding the week.

In [None]:
# Display the events ordered by week id (the stored table is not modified).
events.sort_values('Week')

## Train-Test Data Split

In [None]:
# How students are split into train and test; only 'random' is handled by the
# cells below ('per-year' is noted as an alternative).
mode = 'random' # per-year
# Fraction of the students assigned to the training set.
train_ratio = 0.90
# Kind of prediction target; only 'binary' is handled by the cells below.
task = 'binary' # multi-class, regression

In [None]:
# Student identifiers, in the row order of the exam table.
x = exam_data['AccountUserID'].values

if task == 'binary':
    print('Binarizing the student grades')
    # A grade of at least 4.0 is labelled 1 (pass), anything else 0 (fail).
    y = [int(grade >= 4.0) for grade in exam_data['Grade']]
    n_pass, n_fail = y.count(1), y.count(0)
    print('Pass', n_pass, 'Fail', n_fail)

In [None]:
if mode == 'random':
    # Fixed the typo in the progress message ("Spitting" -> "Splitting").
    print('Splitting the whole student population randomly')
    # Stratify on the labels so that train and test keep the pass/fail ratio.
    # NOTE(review): no random_state is passed, so the split (and every number
    # computed from it below) changes from run to run.
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=1 - train_ratio)
    # y is a plain list, so the label splits are lists as well and support .count().
    print('Train Pass', y_train.count(1), 'Test Pass', y_test.count(1), 'Train Fail', y_train.count(0), 'Test Fail', y_test.count(0))

## Feature Extractors

In [None]:
# Week cut-offs at which features are extracted: 4, 8, 12, 16 and 20.
week_thresholds = np.arange(start=4, stop=24, step=4)

In [None]:
# One instance of every feature extractor to evaluate.
# A list is used instead of a set literal: a set iterates in arbitrary,
# hash-dependent order (so progress output and result rows could be ordered
# differently between runs) and requires the extractor objects to be hashable.
# The cells below only iterate over this collection.
feature_labels = [
    BoroujeniEtAl(),
    LalleConati(),
    LemayDoleck(),
]

In [None]:
# feature_sets[set name][week threshold]['train' | 'test'] -> scaled feature matrix
# with one row per student, in the order of x_train / x_test.
feature_sets = {}
for ffunc in feature_labels:
    flabel = ffunc.getName()
    feature_sets[flabel] = {}
    for wid in week_thresholds:
        feature_sets[flabel][wid] = {}
        # Only events strictly before the week threshold may be used; filter
        # once per threshold instead of once per student.
        week_events = events[events['Week'] < wid]
        scaler = StandardScaler()
        # 'train' must come first: the scaler is fitted on it and reused for 'test'.
        for split, user_ids in (('train', x_train), ('test', x_test)):
            rows = []
            for uindex, uid in enumerate(user_ids):
                print('\r', 'Set:', flabel, '\tWeek:', wid, '\tMode: ' + split, '\tProgress:', uindex + 1, len(user_ids), end='')
                udata = week_events[week_events['AccountUserID'] == uid]
                rows.append(ffunc.getUserFeatures(udata))
            rows = np.array(rows)
            if split == 'train':
                feature_sets[flabel][wid][split] = scaler.fit_transform(rows)
            else:
                # Bug fix: the test set was previously scaled with fit_transform,
                # i.e. with its own mean/variance. It must be scaled with the
                # statistics learned on the training set, otherwise train and
                # test features live on different scales and test information
                # leaks into the preprocessing.
                feature_sets[flabel][wid][split] = scaler.transform(rows)
            print()

## Predictive Models

In [None]:
# Short identifier -> scikit-learn classifier instance. The insertion order
# defines the order in which the models are trained, evaluated and reported.
classifiers_types = {
    'ada': AdaBoostClassifier(),
    'dt': DecisionTreeClassifier(max_depth=5),
    'gnb': GaussianNB(),
    'lr': LogisticRegression(),
    'mlp': MLPClassifier(alpha=1, max_iter=1000),
    'knn': KNeighborsClassifier(n_neighbors=3),
    'rf': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'svm': SVC(gamma=2, C=1),
}

In [None]:
from copy import deepcopy

# trained_models[set name][week threshold][classifier id] -> fitted classifier
trained_models = {}
for ffunc in feature_labels:
    flabel = ffunc.getName()
    trained_models[flabel] = {}
    for wid in week_thresholds:
        trained_models[flabel][wid] = {}
        print(flabel, wid, end='\t')
        train_features = feature_sets[flabel][wid]['train']
        for mid, clf in classifiers_types.items():
            print(mid, end=' ')
            # Bug fix: fit() returns the estimator itself, so storing its return
            # value made every (set, week) entry point to the same object, which
            # ended up holding only the parameters of the very last fit. The
            # shared estimator is still fitted in place (code reading
            # classifiers_types afterwards behaves as before), but an independent
            # snapshot of the fitted state is what gets stored.
            clf.fit(train_features, y_train)
            trained_models[flabel][wid][mid] = deepcopy(clf)
        print()

## Evaluation Metrics

In [None]:
def tn(y_true, y_pred):
    """Return the number of true negatives (actual 0, predicted 0).

    Assumes the binary 0/1 label encoding built earlier in this notebook.
    ``labels=[0, 1]`` forces a 2x2 confusion matrix; without it the matrix
    collapses to 1x1 when only one class occurs in both inputs, and the
    flattened index no longer refers to the intended cell.
    """
    return confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()[0]

def fp(y_true, y_pred):
    """Return the number of false positives (actual 0, predicted 1).

    Assumes the binary 0/1 label encoding built earlier in this notebook.
    ``labels=[0, 1]`` forces a 2x2 confusion matrix; without it the matrix
    collapses to 1x1 when only one class occurs in both inputs, and index 1
    of the flattened matrix is out of range.
    """
    return confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()[1]

def fn(y_true, y_pred):
    """Return the number of false negatives (actual 1, predicted 0).

    Assumes the binary 0/1 label encoding built earlier in this notebook.
    ``labels=[0, 1]`` forces a 2x2 confusion matrix; without it the matrix
    collapses to 1x1 when only one class occurs in both inputs, and index 2
    of the flattened matrix is out of range.
    """
    return confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()[2]

def tp(y_true, y_pred):
    """Return the number of true positives (actual 1, predicted 1).

    Assumes the binary 0/1 label encoding built earlier in this notebook.
    ``labels=[0, 1]`` forces a 2x2 confusion matrix; without it the matrix
    collapses to 1x1 when only one class occurs in both inputs, and index 3
    of the flattened matrix is out of range.
    """
    return confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()[3]

In [None]:
# Metric id -> callable invoked as metric(y_true, y_pred).
# The first four are scikit-learn scorers; 'tp', 'tn', 'fp' and 'fn' are the
# confusion-matrix helpers defined in the previous cell. The insertion order
# becomes the column order of the results table.
evaluation_metrics = {
    'acc': accuracy_score,
    'f1': f1_score,
    'p': precision_score, 
    'r': recall_score,
    'tp': tp,
    'tn': tn,
    'fp': fp,
    'fn': fn
}

In [None]:
# results[set name][week threshold][classifier id][metric id] -> metric value on the test set
results = {}
for ffunc in feature_labels:
    flabel = ffunc.getName()
    results[flabel] = {}
    for wid in week_thresholds:
        print(flabel, wid, end='\t')
        results[flabel][wid] = {}
        test_features = feature_sets[flabel][wid]['test']
        for mid in classifiers_types:
            print(mid, end=' ')
            # Bug fix: evaluate the model stored for this feature set and week.
            # The shared instance in classifiers_types only reflects whatever
            # was fitted last, regardless of the (set, week) being scored.
            model = trained_models[flabel][wid][mid]
            # Predict once and reuse the predictions for all metrics instead
            # of re-running predict() for every metric.
            y_pred = model.predict(test_features)
            results[flabel][wid][mid] = {
                emid: mfunc(y_test, y_pred)
                for emid, mfunc in evaluation_metrics.items()
            }
        print()

## Results Presentation

In [None]:
# Flatten the nested results dictionary into one row per (week, set, classifier).
lst_data = []
lst_name = []
for flabel, per_week in results.items():
    for wid, per_clf in per_week.items():
        for mid, scores in per_clf.items():
            # Row: the three identifiers followed by the metric values.
            lst_data.append([wid, flabel, mid, *scores.values()])
            # Column names: the three identifiers followed by the metric ids
            # (overwritten on every pass, so the last entry's ids are kept).
            lst_name = ['week', 'set', 'clf', *scores.keys()]

In [None]:
# Build the results table from the flattened rows and their column names.
df_results = pd.DataFrame(data=lst_data, columns=lst_name)

In [None]:
# Results obtained with the events of the first 8 weeks, indexed by (week, set, clf).
week_8_rows = df_results[df_results['week'] == 8]
week_8_rows.set_index(['week', 'set', 'clf'])

In [None]:
# Results obtained with the events of the first 16 weeks, indexed by (week, set, clf).
week_16_rows = df_results[df_results['week'] == 16]
week_16_rows.set_index(['week', 'set', 'clf'])