In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from datetime import datetime, timedelta, date
import time
import os
import sys
import json
sys.path.append(os.path.abspath(os.path.join('..')))

In [4]:
%load_ext autoreload
%autoreload 2

from helpers.db_query import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Weekly video count

In [63]:
# Load the per-video due dates and parse the Due_date column into datetimes.
video_dates = pd.read_csv('../data/lin_alg_moodle/videos.csv', index_col=0)
video_dates = video_dates.assign(Due_date=pd.to_datetime(video_dates['Due_date']))

In [64]:
def select_by_year(video_dates, year):
    """
    Return the rows of `video_dates` whose Due_date falls in the given calendar year.

    `video_dates` must have a datetime column named Due_date; the original
    index and column order are preserved.
    """
    due_year = video_dates["Due_date"].dt.year
    in_requested_year = due_year == year
    return video_dates.loc[in_requested_year]

# One frame of due dates per year, for 2017, 2018 and 2019.
video_dates_2017 = select_by_year(video_dates, 2017)
video_dates_2018 = select_by_year(video_dates, 2018)
video_dates_2019 = select_by_year(video_dates, 2019)

In [76]:
def weekly_count(video_dates):
    """
    Count the rows of `video_dates` per week of their Due_date.

    Weeks are the pandas 'W-THU' bins (weekly, anchored on Thursday). The
    result is a numpy array with one count per bin, in chronological order.
    """
    thursday_weeks = pd.Grouper(key='Due_date', freq='W-THU')
    per_week = video_dates.groupby(thursday_weeks).size()
    return per_week.values

# Number of videos due per week, computed separately for each year.
count_2017 = weekly_count(video_dates_2017)
count_2018 = weekly_count(video_dates_2018)
count_2019 = weekly_count(video_dates_2019)

print("2017", count_2017)
print("2018", count_2018)
print("2019", count_2019)

2017 [ 6  9  6 12  7]
2018 [ 7 10 10  6 13  7  8  7  9  6]
2019 [ 5  4  8  7  7 10 11 13  7  8  6  9  6]


# Features

In [5]:
# Load the video interaction events. `getVideoEvents` is not defined in this
# notebook; presumably it comes from the star import of helpers.db_query — confirm.
events = getVideoEvents(mode='all')

In [150]:
# Show one random row to inspect the columns of `events`.
events.sample()

Unnamed: 0,DataPackageID,AccountUserID,VideoID,TimeStamp,EventType,SeekType,OldTime,CurrentTime,NewTime,OldSpeed,NewSpeed,Date,Year
133248,EPFL-AlgebreLineaire-2019,94660,7201a7b5bf5c451786a44481cba36cfd,1569706274,Video.Pause,,,436.475,,,,2019-09-28 21:31:14,2019


In [151]:
# Events of a single student (AccountUserID is compared as a string here),
# used as a worked example in the cells that follow.
user_events = events.loc[events.AccountUserID == '94660']

In [28]:
def total_views(df):
    """
    Count the total number of video views, rewatches included.

    Assumption: a given video is watched at most once per day, so a view is
    identified by a unique (VideoID, day) pair. `df` needs a VideoID column
    and a datetime Date column; it is not modified.
    """
    event_day = df.Date.dt.date
    with_day = df.assign(Day=event_day)
    # One row per (video, day) pair == one view under the assumption above.
    views = with_day.drop_duplicates(subset=['VideoID', 'Day'])
    return len(views)

In [152]:
# Example: total view count for the single student held in `user_events`.
total_views(user_events)

133

In [164]:
def week_video_total(year, config_path='../config/linear_algebra.json'):
    """
    Returns a one-column DataFrame ("Total") indexed by ISO week number, giving
    the number of videos to watch in each flipped week of the given year.

    The JSON config is keyed by year (as a string) and, for each year, read for:
      - "WeeklyVideoCount": list with the number of videos assigned per week,
      - "FlippedWeeks": list whose length is the number of flipped weeks,
      - "StartFlipped": 'YYYY-MM-DD' date inside the first flipped week.
    "WeeklyVideoCount" is expected to hold one entry per flipped week, since it
    is used as the data of a frame indexed by those weeks.

    Parameters
    ----------
    year : int or str
        Year to look up in the config.
    config_path : str, optional
        Path to the JSON config; defaults to the linear algebra course config.

    Returns
    -------
    pandas.DataFrame
        Index: consecutive week numbers starting at the ISO week of
        "StartFlipped"; column "Total": videos assigned that week.
    """
    with open(config_path) as f:
        year_config = json.load(f)[str(year)]

    videos_per_week = year_config["WeeklyVideoCount"]
    nb_flipped_weeks = len(year_config["FlippedWeeks"])
    start_date = datetime.strptime(year_config["StartFlipped"], '%Y-%m-%d')
    # isocalendar() gives the ISO week directly; strftime("%V") depends on the
    # platform's C library and is not available everywhere.
    start_week = start_date.isocalendar()[1]
    weeks = list(range(start_week, start_week + nb_flipped_weeks))
    return pd.DataFrame(index=weeks, data=videos_per_week, columns=["Total"])

def weekly_prop_watched(df, weekly_total=None):
    """
    Compute, for each assigned week, the proportion of videos watched
    (nb of videos first watched that week / nb of videos assigned that week).

    Problem: we are counting videos that are assigned on other weeks. That makes
    proportions greater than 1 possible so we have to clip. Therefore if the
    student watches with delay or in advance some information may be lost.

    Assigned weeks in which the student has no first view get a proportion of 0
    (they are kept, not dropped), so the result has one value per assigned week.

    Parameters
    ----------
    df : pandas.DataFrame
        Video events of one student, with at least the columns "VideoID",
        "Date" (datetime) and "Year".
    weekly_total : pandas.DataFrame, optional
        Frame indexed by week number with a "Total" column (videos assigned per
        week). When omitted it is obtained from week_video_total() for the year
        of the first row of `df`; pass it explicitly to avoid re-reading the
        config for every student.

    Returns
    -------
    numpy.ndarray
        Proportions clipped to [0, 1], ordered like the index of `weekly_total`.

    Raises
    ------
    IndexError
        If `df` is empty and `weekly_total` is not given (the year is unknown).
    """
    # Use the events passed in (not a notebook-level global), and sort them so
    # that drop_duplicates keeps the chronologically first view of each video.
    first_views = df.sort_values("Date").drop_duplicates(subset=["VideoID"])
    # Freq Weekly ending on Thursday since the last due date is on Thursday
    watched = first_views.groupby(pd.Grouper(key="Date", freq="W-THU")).size().to_frame(name="Count")
    # Convert the bin dates to ISO week numbers (portable, unlike strftime("%V"))
    watched.index = [day.isocalendar()[1] for day in watched.index]

    if weekly_total is None:
        weekly_total = week_video_total(df.Year.iloc[0])

    # Left merge: keep every assigned week, even those without any view.
    week_prop = weekly_total.merge(watched, left_index=True, right_index=True, how="left")
    watched_count = week_prop.Count.fillna(0)
    return np.clip((watched_count / week_prop.Total).values, 0, 1)

def avg_weekly_prop_watched(df):
    """Mean of the weekly proportions returned by weekly_prop_watched(df)."""
    weekly_props = weekly_prop_watched(df)
    return weekly_props.mean()

def std_weekly_prop_watched(df):
    """Standard deviation of the weekly proportions returned by weekly_prop_watched(df)."""
    weekly_props = weekly_prop_watched(df)
    return weekly_props.std()

In [166]:
# Example: spread of the weekly watched proportion for the student in `user_events`.
std_weekly_prop_watched(user_events)

0.3642773230544097