# Exploratory study on existing early warning systems

## * Setup of the working environment *

### Import traditional Python packages

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime as dt, timedelta, date

from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import pandas as pd
import numpy as np
import time
import math
import json
import sys
import os

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Import custom Python modules

In [None]:
sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from helpers.db_connector import MySQLConnector
from helpers.db_query import *

from helpers.data_process import *
from helpers.feature_extraction import *

from extractors.akpinar_et_al import AkpinarEtAl
from extractors.boroujeni_et_al import BoroujeniEtAl
from extractors.chen_cui import ChenCui
from extractors.he_et_al import HeEtAl
from extractors.lalle_conati import LalleConati
from extractors.lemay_doleck import LemayDoleck
from extractors.mbouzao_et_al import MbouzaoEtAl
from extractors.mubarak_et_al import MubarakEtAl
from extractors.wan_et_al import WanEtAl

from helpers.ml_utils import *

from helpers.time import *

## * Load the clickstream data *

Since Fall 2017, the stream of the EPFL's Linear Algebra course has been taught in a flipped format. The implementation of the flipped classroom was carried out in an incremental manner, as described below:

- **Year 2017-2018**: traditional manner (weeks 1-13) - flipped manner (week 14).
- **Year 2018-2019**: traditional manner (weeks 1-4, 10-14) - flipped manner (weeks 5-9).
- **Year 2019-2020**: traditional manner (weeks 1-4) - flipped manner (weeks 5-14).

In [None]:
rounds = ['Y2-2018-19', 'Y3-2019-20']

### Identifying Students


The flipped course was offered only to volunteering students. The volunteers were assigned to either the experimental or the control group, using stratified random sampling based on gender and prior background (secondary educational level).

In [None]:
%time userData = getUserInfo(prior_knowledge=True)

The initial data of volunteers was cleaned, and some participants were removed before we analyzed the data:
- The volunteering students who have not been graded were removed. 
- The repeating students were filtered out, where repeating students are those accessing videos in two different years. 
- The less active students, i.e., those with fewer than 60 interactions on the platform, were removed.

Given that the Y1-2017-2018 round included only one week in a flipped classroom setting, we will remove the students of that round.  

In [None]:
userData = userData[userData['Round'].isin(rounds)]

In [None]:
userData.head()

Some of the statistics on the user data are provided below. 

In [None]:
"Number of students:", len(userData)

In [None]:
sns.displot(userData, x='Round')
plt.ylabel('Number of Students')
plt.show()

In [None]:
sns.displot(userData, x='Gender')
plt.ylabel('Number of Students')
plt.show()

In [None]:
sns.displot(userData, x='Category')
plt.ylabel('Number of Students')
plt.xticks(rotation=45)
plt.show()

### Getting Students' Records

#### Video Clickstream Records

In [None]:
%time videoData = getVideoEventsInfo(mode='all')

In [None]:
videoData = videoData[videoData['Round'].isin(rounds)]

In [None]:
videoData = videoData[videoData['AccountUserID'].isin(userData['AccountUserID'])].rename(columns={'VideoID': 'ElementID'})

In [None]:
"Number of video events:", len(videoData)

In [None]:
videoData.head()

#### Problem Clickstream Records

In [None]:
%time problemData = getProblemEventsInfo()

In [None]:
problemData = problemData[problemData['Round'].isin(rounds)].rename(columns={'ProblemID': 'ElementID'})

In [None]:
problemData = problemData[problemData['AccountUserID'].isin(userData['AccountUserID'])]

In [None]:
"Number of problem events:", len(problemData)

In [None]:
problemData.head()

#### Exam Records

In [None]:
%time examData = getExamInfo()

In [None]:
examData = examData[examData['Round'].isin(rounds)]

In [None]:
examData = examData[examData['AccountUserID'].isin(userData['AccountUserID'])]

In [None]:
"Number of graded students:", len(examData)

In [None]:
examData.head(10)

#### Event Records

In [None]:
# Keep only the columns shared by both event sources so that video and
# problem events can be stacked into a single event log.
d1 = videoData[['AccountUserID', 'ElementID', 'TimeStamp', 'EventType', 'Round']]
d2 = problemData[['AccountUserID', 'ElementID', 'TimeStamp', 'EventType', 'Round']]
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the rows the same way and keeps the original row labels.
eventData = pd.concat([d1, d2])

#### Course Week Column

We load the configuration file, which holds the settings (e.g., start and end dates) for each round of the course.

In [None]:
with open('../config/linear_algebra.json') as f:
    config = json.load(f)

We assign each event (video or problem interaction) to a specific week of the course, with the first week of the course round having id 0.

In [None]:
eventData['TimeStamp']=eventData['TimeStamp'].apply(lambda x:string2Datetime(dt.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')))

In [None]:
# Week assignment is done round by round because every round has its own
# start date in the configuration.
tmp_events = []
for r in rounds:
    in_round = eventData['Round'] == r
    round_events = eventData.loc[in_round]
    # The second-to-last dash-separated token of the round id
    # (e.g. 'Y2-2018-19' -> '2018') is the key of the round in the config.
    round_start = config[r.split('-')[-2]]['Start']
    # processWeek is a project helper; presumably it adds the 'Week' column
    # that the following cells read -- confirm against helpers.
    tmp_events.append(processWeek(round_events, 'TimeStamp', round_start))
eventData = pd.concat(tmp_events).copy()

In [None]:
eventData['Week'] = eventData['Week'].apply(lambda x: int(x))

Then, we keep only the events that fall within the first *noCourseWeeks* course weeks (hard-coded to 20 in the cell below, i.e., week ids 0 to 19).

In [None]:
eventData = eventData[eventData['Week'].isin(range(20))]

In [None]:
eventData['ElementID'] = eventData['ElementID'].astype('category').cat.codes.astype('str')

In [None]:
# Prefix every element code with the first letter of its event type so that
# codes stay distinguishable by event family. ElementID already holds string
# codes at this point (see the previous cell). The vectorised concatenation
# produces the same strings as a row-wise apply(axis=1) without building one
# Series object per event.
eventData['ElementID'] = eventData['EventType'].str[0] + eventData['ElementID']

In [None]:
eventData['EventType'] = eventData['EventType'].apply(lambda x: x if x != 'Video.Load' else 'Video.Play')

In [None]:
eventData.head()

### Session Explorative Analysis

In [None]:
def getSeconds(row):
    """Return ``row['End'] - row['Start']``, or 0.0 when either key is missing.

    NOTE(review): despite the name, no conversion to seconds happens here;
    the result has whatever type the subtraction yields (for timestamp
    inputs that is a time delta, not a float).
    """
    if 'Start' not in row or 'End' not in row:
        return 0.0
    return row['End'] - row['Start']

In [None]:
def getInteractions(group):
    """Collect the per-session interaction matrices of one student.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single student with the columns 'SessionID',
        'ElementID', 'EventType' and 'Interval'.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with one entry per session (in
        ascending SessionID order). Each entry is a 2-D array with one row
        per event and the columns (element id, event type, interval); the
        interval of the first event of a session is forced to 0.
    """
    per_session = []
    for _, session_events in group.groupby('SessionID'):
        rows = [
            [element, event, interval if position > 0 else 0]
            for position, (element, event, interval) in enumerate(
                zip(session_events['ElementID'], session_events['EventType'], session_events['Interval'])
            )
        ]
        per_session.append(np.array(rows))

    # Sessions differ in length, so the result is filled slot by slot into an
    # object array. np.array(list_of_arrays) would raise on ragged input with
    # NumPy >= 1.24 and would silently build a 3-D array whenever all
    # sessions happen to have the same number of events.
    interactions = np.empty(len(per_session), dtype=object)
    for position, matrix in enumerate(per_session):
        interactions[position] = matrix
    return interactions

In [None]:
def getSessions(df, maxSessionLength=120, minNoActions=3):
    """Split every student's event log into sessions and summarise them.

    Events whose type contains 'Transcript' are ignored. The remaining
    events of a student are sorted by time, and a new session starts whenever
    the gap to the previous event exceeds ``maxSessionLength`` minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'AccountUserID', 'ElementID', 'TimeStamp',
        'EventType', 'Round' and 'Week'.
    maxSessionLength : int or float
        Largest gap, in minutes, allowed between two consecutive events of
        the same session.
    minNoActions : int
        Sessions with fewer events than this are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per session with the columns 'SessionID', 'AccountUserID',
        'Round', 'Week', 'Start', 'End', 'Duration', 'NoEvents',
        'Interactions', 'Event', 'Interval' and 'Elements'. 'Round', 'Week'
        and 'Start' come from the first event of the session, 'End' from the
        last one, and 'Duration' is whatever getSeconds returns for the row.
    """
    eventColumns = ['ElementID', 'TimeStamp', 'EventType', 'Round', 'Week']
    sessionColumns = ['AccountUserID', 'Round', 'Week', 'Start', 'End', 'Duration',
                      'NoEvents', 'Interactions', 'Event', 'Interval', 'Elements']

    sessions = []
    # Group by the bare column name: with a one-element list as key,
    # pandas >= 2.0 yields 1-tuples as group keys, which cannot be assigned
    # to the AccountUserID column below.
    for userID, group in df.groupby('AccountUserID'):
        group = group[~group['EventType'].str.contains('Transcript')][eventColumns].sort_values('TimeStamp')
        if group.empty:
            # Nothing left for this student once transcript events are removed.
            continue

        # Time elapsed since the previous event of the same student; the
        # first event has no predecessor and gets an interval of 0 seconds.
        gap = group['TimeStamp'] - group['TimeStamp'].shift(1)
        group['Interval'] = gap.apply(lambda x: x.total_seconds()).fillna(0)
        # Every gap longer than the threshold opens a new session (ids start at 1).
        group['SessionID'] = (gap > pd.Timedelta(maxSessionLength, 'm')).cumsum() + 1

        # Events are time-ordered and session ids never decrease, so the
        # first/last rows per session line up with the sorted groupby index.
        bySession = group.groupby('SessionID')
        firstEvents = group.drop_duplicates(subset=['SessionID'], keep='first')
        lastEvents = group.drop_duplicates(subset=['SessionID'], keep='last')

        session = bySession.count()
        session['NoEvents'] = session['TimeStamp']
        session['Round'] = firstEvents['Round'].values
        session['Week'] = firstEvents['Week'].values
        session['Start'] = firstEvents['TimeStamp'].values
        session['End'] = lastEvents['TimeStamp'].values
        session['Duration'] = session.apply(lambda row: getSeconds(row), axis=1)
        session['AccountUserID'] = userID
        session['Interactions'] = getInteractions(group)
        session['Elements'] = bySession['ElementID'].apply(','.join).values
        session['Event'] = bySession['EventType'].apply(','.join).values
        # The first interval of a session is the gap to the previous session,
        # so it is dropped from the per-session interval list.
        session['Interval'] = bySession['Interval'].apply(lambda x: list(x)[1:]).values
        sessions.append(session[sessionColumns].reset_index())

    if not sessions:
        # pd.concat refuses an empty list; return an empty table with the
        # same columns instead.
        return pd.DataFrame(columns=['SessionID'] + sessionColumns)

    sessions = pd.concat(sessions, ignore_index=True)
    return sessions[sessions['NoEvents'] >= minNoActions]

In [None]:
sessions = getSessions(eventData, maxSessionLength=60, minNoActions=3)

In [None]:
sessions.head()

In [None]:
# Binary outcome per student: 1 for a grade below 4, 0 otherwise. The value 1
# is the one labelled 'Fail' when the per-student figures are saved later on.
mappingGrade = {
    studentID: (1 if grade < 4 else 0)
    for studentID, grade in zip(examData['AccountUserID'], examData['Grade'])
}
# Direct dictionary lookup: a student without an exam record raises KeyError.
sessions['Grade'] = sessions['AccountUserID'].apply(mappingGrade.__getitem__)

In [None]:
mappingKnowledge = {e1:e2 for e1, e2 in zip(userData['AccountUserID'], userData['Category'])}
sessions['Knowledge'] = sessions['AccountUserID'].apply(lambda x: mappingKnowledge[x])

In [None]:
mappingGender = {e1:e2 for e1, e2 in zip(userData['AccountUserID'], userData['Gender'])}
sessions['Gender'] = sessions['AccountUserID'].apply(lambda x: mappingGender[x])

In [None]:
mappingRound = {e1:e2 for e1, e2 in zip(userData['AccountUserID'], userData['Round'])}

In [None]:
# Build five parallel lists with one entry per student, in the order in which
# groupby yields the students:
#   data    - per session, the interval column of the interactions as floats
#   evt     - per session, the event-type column of the interactions
#   els     - per session, True where the element differs from the previous event's
#   els1    - per session, the list of element ids
#   weekers - week difference between consecutive sessions
data, evt, els, els1, weekers = [], [], [], [], []
for index, group in sessions.groupby(by='AccountUserID'):
    interactions = group['Interactions'].values
    element_lists = [v.split(',') for v in group['Elements'].values]

    data.append([v[:, 2].astype(float) for v in interactions])
    evt.append([list(v[:, 1]) for v in interactions])
    # The first event of a session always counts as an element change.
    els.append([[True] + [a != b for a, b in zip(parts, parts[1:])] for parts in element_lists])
    els1.append(element_lists)

    weekers.append(group['Week'] - group['Week'].shift(1))
    # NOTE(review): this assigns by index *label* 0, not by position. For a
    # student whose sessions do not include label 0, the first entry keeps its
    # NaN and the assignment may instead add a new entry labelled 0 -- confirm
    # against the pandas version in use. Left unchanged because the plotting
    # cell below consumes this series as it is.
    weekers[-1][0] = 0

In [None]:
# Same nesting as els1 (student -> session -> event), but an element id is
# kept only where the element changed; all other positions hold ''.
fels = [
    [
        [(name if changed else '') for name, changed in zip(names, flags)]
        for names, flags in zip(user_names, user_flags)
    ]
    for user_names, user_flags in zip(els1, els)
]

In [None]:
# np.unique already returns the distinct event types in sorted order. Only the
# first 11 are kept, which matches the number of entries in the colour palette
# defined in the next cell.
eventsSoFar = list(np.unique(eventData['EventType']))[:11]

In [None]:
colors = ['#522e38','#a8201a','#ef7b45','#ffd166','#fffd82','#fff1d0',
          '#5b8e7d','#90a955','#affc41','#1dd3b0','#143642']

In [None]:
# Draw one stacked-bar figure per student: one bar per session, one segment
# per event (segment height = seconds since the previous event, at least 10),
# coloured by event type, with an extra gap of 50 whenever the element changes.
# Each figure is written to ../data/figures/ and closed right away.

# Students with a sequence number below this value are skipped.
START_USER_SEQ = 199

user_ids = np.unique(sessions['AccountUserID'])
# tqdm wraps the iterable itself (not the enumerate object) so that the
# progress bar knows the total number of students.
for user_seq, user_id in enumerate(tqdm(user_ids)):

    if user_seq < START_USER_SEQ:
        continue

    max_length = np.max([len(x) for x in data[user_seq]])
    no_sessions = len(data[user_seq])

    fig = plt.figure(num=None, figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
    width = 0.6

    # prev holds the current top of every session's stack.
    prev = np.zeros(no_sessions)
    for i in range(max_length):
        # i-th event of every session; sessions shorter than i contribute an
        # empty (zero-height) segment.
        line = [np.max([data[user_seq][j][i], 10]) if i < len(data[user_seq][j]) else 0
                for j in range(no_sessions)]
        linev = [evt[user_seq][j][i] if i < len(evt[user_seq][j]) else ''
                 for j in range(no_sessions)]

        # Leave a gap of 50 below the segment whenever the element changed.
        prev = np.array([
            v + 50 if i < len(fels[user_seq][k]) and fels[user_seq][k][i] != '' else v
            for k, v in enumerate(prev)
        ])

        if i == 0:
            plt.bar(np.arange(no_sessions), np.array(line), width=width)
        else:
            plt.bar(np.arange(no_sessions), np.array(line), width=width, bottom=np.array(prev), color=[colors[eventsSoFar.index(e)] if e != '' else 'red' for e in linev])

        prev = np.array(prev) + np.array(line)

    custom_lines = [Line2D([0], [0], color=colors[i], lw=4) for i, _ in enumerate(eventsSoFar)]

    # Week markers: a dashed line wherever the week changes between two
    # consecutive sessions, plus a 'W<n>' label per run of sessions, where n
    # counts weeks relative to the student's first session.
    # Only the first no_sessions entries of weekers[user_seq] are consumed and
    # the closing label/line are anchored at no_sessions, so they do not
    # depend on how many entries that series holds.
    count = 0
    boundary = 0
    week_steps = list(weekers[user_seq])[:no_sessions]
    for i, w in enumerate(week_steps):
        if i == 0:
            # The first session has no predecessor to compare with.
            w = 0
        if w > 0:
            # NOTE(review): the label height is derived from the boundary
            # position, which puts it below the x axis -- confirm this is the
            # intended placement.
            plt.text(boundary + (i - boundary) / 2.0 - (0.50 if count < 10 else 0.65), boundary - 4500, 'W' + str(int(count)), fontsize=12, weight='bold', rotation=90)
            plt.axvline(x=i - 0.5, color='gray', linestyle='--')
            boundary = i
        count += w
    plt.text(boundary + (no_sessions + 1 - boundary) / 2.0 - (0.70 if count < 10 else 0.65), boundary - 4500, 'W' + str(int(count)), fontsize=12, weight='bold', rotation=90)
    plt.axvline(x=no_sessions - 0.5, color='gray', linestyle='--')

    plt.ylabel('Session Time [s]')
    plt.ylim([0, 50000])
    plt.xlim([-0.5, no_sessions])
    plt.legend(custom_lines, [e for e in eventsSoFar], loc='upper right')
    plt.xticks(np.arange(no_sessions))
    plt.tight_layout()

    outcome = 'Fail' if mappingGrade[user_id] == 1 else 'Pass'
    knowledge = mappingKnowledge[user_id].replace('.', '_')
    plt.savefig(f"../data/figures/{mappingRound[user_id]}-{outcome}-{mappingGender[user_id]}-{knowledge}-{user_seq}-{user_id}.png")
    # Release the figure once it is on disk; otherwise every figure stays
    # open until the loop ends and memory grows with the number of students.
    plt.close(fig)