In [24]:
from datetime import datetime, timedelta
import seaborn as sns
import pandas as pd
import numpy as np
import time
import sys
import os

In [2]:
# Put the parent of the current working directory on the import path
# (presumably so the `helpers` package imported further down resolves — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [18]:
%load_ext autoreload
%autoreload 2

from helpers.data_process import *
from helpers.plotting import *
from helpers.db_query import *
from helpers.feature_extraction import *

# Data processing

In [4]:
# Load the video events, timing the call. The isa_only flag is passed straight
# to the helper; its exact filtering semantics live in the star-imported
# helpers modules — TODO confirm what it restricts to.
%time video_events = getVideoEvents(isa_only=True)

CPU times: user 9.76 s, sys: 878 ms, total: 10.6 s
Wall time: 14.5 s


In [5]:
# NOTE(review): rebinds video_events to the helper's result, so the unprocessed
# frame is no longer reachable and re-running this cell applies delRepentants
# to already-processed data.
%time video_events = delRepentants(video_events)

CPU times: user 17.2 s, sys: 172 ms, total: 17.4 s
Wall time: 17.8 s


In [6]:
# NOTE(review): rebinds video_events again; re-running this cell applies
# delLessActive to its own previous output.
%time video_events = delLessActive(video_events)

CPU times: user 2.02 s, sys: 104 ms, total: 2.12 s
Wall time: 1.76 s


In [9]:
# Number of video events currently held, followed by a preview of the first rows.
print(len(video_events))
video_events.head()

350093


Unnamed: 0,AccountUserID,Count,DataPackageID,VideoID,TimeStamp,EventType,Year
0,10094,98,EPFL-AlgebreLineaire-2017_T3,bd22143ef4524edfb0dbab0fbb275ca7,1513259282,Video.Play,2017
1,10094,98,EPFL-AlgebreLineaire-2017_T3,6f7c4414324047a4b9d12a1d87d5875e,1510064721,Video.Load,2017
2,10094,98,EPFL-AlgebreLineaire-2017_T3,97b12bced6d649a2bca7465b246a1b0f,1513088547,Video.Pause,2017
3,10094,98,EPFL-AlgebreLineaire-2017_T3,73f3f756e43047c0b161c9ea29fb08a2,1508761566,Video.Seek,2017
4,10094,98,EPFL-AlgebreLineaire-2017_T3,227d5967cb02415ca694728f475146e0,1510148759,Video.Play,2017


In [8]:
# Load the problem events, timing the call; uses the same isa_only=True flag
# as the video-event load.
%time problem_events = getProblemEvents(isa_only=True)

CPU times: user 4.61 s, sys: 386 ms, total: 5 s
Wall time: 6.93 s


In [10]:
# Preview the first rows of the problem events.
problem_events.head()

Unnamed: 0,AccountUserID,DataPackageID,ProblemID,TimeStamp,EventType,ProblemType,Year
0,46461,EPFL-AlgebreLineaire-2018,44c46d58e2eb440b9308893be55aae3c,1543677480,Problem.Check,Quiz,2018
1,46461,EPFL-AlgebreLineaire-2018,ca64971b751847c798f5ff753e3eddfe,1547885100,Problem.Check,Quiz,2018
2,46461,EPFL-AlgebreLineaire-2018,b31684ee6a684d12ba0f4afc51fe0ca4,1544256079,Problem.Check,Quiz,2018
3,46461,EPFL-AlgebreLineaire-2018,6983ed426ef14e41900f72b155f3444e,1541263833,Problem.Check,Quiz,2018
4,46461,EPFL-AlgebreLineaire-2018,1d5df58214374b80974bf8776313e3fc,1540406407,Problem.Check,Quiz,2018


# Weekly feature engineering
| Measure | Meaning | Description | Source | 
| --- | --- | --- | --- |
| PDH | Peak on day hour | Identifies if a student's activities are centred around a particular hour of the day | [1] |
| PWD | Peak on week day | Identifies if a student's activities are centred around a particular day of the week | [1] |
| WS1 | Weekly Similarities in daily activity | Identifies if student works on same weekdays | [1] | 
| WS2 | Weekly Similarities in daily activity | Identifies if there is a similar distribution of workload among weekdays | [1] | 
| WS3 | Weekly Similarities in daily activity | Identifies if there is a similar time distribution of workload among weekdays | [1] | 
| FDH | Hourly Pattern over days | Identifies if hourly pattern is repeating over days | [1] | 
| FWH | Hourly Pattern over weeks | Identifies if the hourly pattern is repeating over weeks | [1] | 
| FWD | Daily Pattern over Weeks | Identifies if daily pattern is repeating over weeks | [1] |
| NQZ | Number of quizzes | Counts the total number of quizzes completed by a student over the semester | - |
| PQZ | Percentage of quizzes | Percentage of quizzes completed by a student over the flipped period | - |

In [13]:
def get_rand_id(X, random_state=None):
    """
    Draw one student ID at random from X, print it and return it.

    The draw picks one *row* of the "AccountUserID" column with equal
    probability per row, so an ID that appears in more rows is proportionally
    more likely to be selected.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an "AccountUserID" column with at least one row.
    random_state : int, numpy random generator/state or None, optional
        Forwarded to `Series.sample`. Pass a fixed value to make the draw
        reproducible; the default (None) keeps the original unseeded behaviour.

    Returns
    -------
    The sampled value of "AccountUserID" as a plain Python scalar.

    Raises
    ------
    ValueError
        Raised by pandas when X has no rows to sample from.
    """
    sampled = X["AccountUserID"].sample(1, random_state=random_state)
    studentID = sampled.tolist()[0]
    print("Random ID: {}".format(studentID))
    return studentID

In [16]:
# Pick one random student and fetch that student's data from the video events.
# NOTE(review): weekly_feature below treats T as per-event times in seconds and
# forwards Lw untouched to the feature function — presumably Lw is the number
# of weeks; TODO confirm against getStudentTimeStamps.
sid, T, Lw = getStudentTimeStamps(video_events, get_rand_id(video_events))

Random ID: 43044


In [84]:
def weekly_feature(Lw, T, feature):
    """
    Compute a feature separately over each consecutive one-week window.

    Week k covers the half-open interval [k, k + 1) weeks, so an event that
    falls exactly on a week boundary is counted once, in the week that starts
    at that boundary (it is not shared between two weeks).

    Parameters
    ----------
    Lw : object
        Forwarded unchanged as the first argument of `feature`.
    T : array-like of numbers
        Event times in seconds, measured from the start of week 0 (offset 0).
        The values do not need to be sorted; negative values belong to no week.
    feature : callable
        Called as `feature(Lw, week_T)`, where `week_T` is a NumPy array with
        the entries of `T` that fall inside the current week (possibly empty).

    Returns
    -------
    list
        One `feature` result per week, from week 0 up to and including the
        week that contains the largest value of `T`. The list is empty when
        `T` is empty or contains no non-negative time.
    """
    T = np.asarray(T)
    if T.size == 0:
        return []

    week_seconds = timedelta(weeks=1).total_seconds()
    # Use the maximum rather than the last element so unsorted input still
    # yields every week; +1 so the week holding the last event is included.
    n_weeks = int(T.max() // week_seconds) + 1

    weekly_values = []
    for week in range(n_weeks):
        start_ts = week * week_seconds
        end_ts = start_ts + week_seconds
        # Half-open window: the upper bound is excluded to avoid counting
        # boundary events in two consecutive weeks.
        week_T = T[(T >= start_ts) & (T < end_ts)]
        weekly_values.append(feature(Lw, week_T))
    return weekly_values

In [94]:
# FWD ("Daily Pattern over Weeks") evaluated week by week for the sampled student.
weekly_feature(Lw, T, FWD)

[1.414213562373095,
 1.414213562373095,
 0.5549581320873711,
 1.2469796037174667,
 0.4450418679126288,
 0.8019377358048384,
 1.4142135623730947,
 1.4142135623730951,
 1.8019377358048387]