In [96]:
from constants import data_path, entities_path, relations_path, Path
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Folder holding the downsampled data set that the rest of this notebook explores.
sample_data = Path("mooc_cube_x-100k")

In [3]:
# Tab-separated file read with explicit column names (none are taken from the file).
video_id2ccid = pd.read_csv(
    relations_path / "video_id-ccid.txt",
    sep="\t",
    names=["video_id", "ccid"],
)

In [4]:
# Exploring downsampled version first.
# Each row carries a `user_id` and a `seq` list; every `seq` entry is a dict with
# a `video_id` and a `segment` list (see the previews further down).
user2video = pd.read_parquet(sample_data / "user-video.parquet.gzip")

### Cleaning goals
1. Count the overall number of segments to understand how many are removed by cleaning - $158\,200$
2. For each user-video: Remove segments less than `<THRESHOLD>`
    * A fraction of the segments actually have negative duration (start point is located after the end point)
    * The number of such segments is quite small ($658 / 158200 \approx 0.416\%$) -> Discarded
    * Based on the distribution of the raw durations (~46 % of segments are shorter than 10 s), the threshold is set to 5 s
        * ~0.7% < 5s, 5s < ~45% < 10s 
    * With watch speed accounted for, the distribution is quite similar, though more sections between \[5, 10\] seconds
        * ~0.9% < 5s, 5s < ~56% < 10s
3. For each user-video: Figure out distances between segments and distribution of those distances
    * For each of the segments for a given video - 94% have a start time less than ~1 hour for raw durations, adjusted for the end time of the video
    * Using adjusted duration: 
4. Remove outliers of repeaters, e.g. set a threshold of 20 repetitions of a given video - Explore distribution first
5. Map videos to only concepts related to the CS field
    1. Create mapping between Chinese and English fields

In [239]:
# One row per (user, watched video): unnest the `seq` lists, keep the original
# row label in the `index` column, and lift `video_id` out of each entry.
all_sequences = (
    user2video
    .explode("seq")
    .reset_index()
    .assign(video_id=lambda df: df["seq"].str["video_id"])
)
all_sequences.head()

Unnamed: 0,index,seq,user_id,video_id
0,49,"{'segment': [{'end_point': 869.0, 'local_start...",U_4243,V_1385204
1,49,"{'segment': [{'end_point': 698.0, 'local_start...",U_4243,V_1385205
2,49,"{'segment': [{'end_point': 138.0, 'local_start...",U_4243,V_1385206
3,49,"{'segment': [{'end_point': 158.0, 'local_start...",U_4243,V_1385207
4,79,"{'segment': [{'end_point': 24.499, 'local_star...",U_10007,V_6181150


In [240]:
all_sequences.reset_index().drop(columns="index").rename(columns={"level_0": 

Unnamed: 0,level_0,index,seq,user_id,video_id
0,0,49,"{'segment': [{'end_point': 869.0, 'local_start...",U_4243,V_1385204
1,1,49,"{'segment': [{'end_point': 698.0, 'local_start...",U_4243,V_1385205
2,2,49,"{'segment': [{'end_point': 138.0, 'local_start...",U_4243,V_1385206
3,3,49,"{'segment': [{'end_point': 158.0, 'local_start...",U_4243,V_1385207
4,4,79,"{'segment': [{'end_point': 24.499, 'local_star...",U_10007,V_6181150
...,...,...,...,...,...
61864,61864,289264,"{'segment': [{'end_point': 16.0, 'local_start_...",U_34710701,V_6292686
61865,61865,289264,"{'segment': [{'end_point': 655.0, 'local_start...",U_34710701,V_6292688
61866,61866,289264,"{'segment': [{'end_point': 317.0, 'local_start...",U_34710701,V_6292689
61867,61867,289302,"{'segment': [{'end_point': 14.9, 'local_start_...",U_34711221,V_6186685


In [6]:
# Task 1 - Segment Count
# Sum the lengths of the `segment` lists over all user-video entries.
total_num_segments = (
    all_sequences["seq"]
    .str["segment"]
    .str.len()
    .sum()
)
total_num_segments

158200

In [256]:
# One row per watched segment: take each entry's `segment` list, give every list
# element its own row, then expand the element's fields into columns.
segments_flattened = (
    all_sequences["seq"]
    .str["segment"]
    .explode()
    .apply(pd.Series)
)

In [257]:
# Preview. Repeated index labels (e.g. 2) are segments that came from the same
# exploded user-video entry.
segments_flattened.head()

Unnamed: 0,end_point,local_start_time,speed,start_point
0,869.0,1581880000.0,1.0,9.0
1,698.0,1581881000.0,1.0,9.0
2,138.0,1581882000.0,1.0,4.0
2,568.0,1581882000.0,1.0,148.0
3,158.0,1581882000.0,1.0,3.0


In [275]:
# Task 2 - Remove short segments - What is the distribution of segments
# Need to account for speed

# Same flattening as `segments_flattened`, but done per (user_id, video_id) group
# so that the group keys end up in the index next to the originating row label:
#   1. groupby/apply explodes each group's `segment` lists into one dict per row;
#   2. wrapping the result in a DataFrame gives a single `seq` column of dicts;
#   3. the row-wise apply expands every dict into its own columns.
segments_flattened_leveled = pd.DataFrame(all_sequences.groupby(["user_id", "video_id"]).apply(lambda x: x["seq"].str["segment"].explode())).apply(lambda x: pd.Series(x["seq"]), axis=1)

In [276]:
# Preview: index levels are user_id, video_id and the originating row label.
segments_flattened_leveled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,end_point,local_start_time,speed,start_point
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865
U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809
U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339
U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53
U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53


In [285]:
def assign_segment_idx(x):
    """Return a copy of ``x`` with a 0-based ``segment_idx`` column.

    The column numbers the rows by their current position (0, 1, ..., len(x) - 1),
    so the caller decides the ordering by sorting before calling.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to number, e.g. one group handed over by ``groupby(...).apply``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``segment_idx`` column; ``x`` is left untouched.
    """
    # Build a new frame instead of writing into `x`: mutating the object that
    # groupby.apply passes in is not supported by pandas and can give
    # unexpected results.
    return x.assign(segment_idx=np.arange(len(x)))

In [286]:
# Order every user's segments of a video chronologically and number them
# 0..n-1 within the (user_id, video_id) group.
# groupby().cumcount() does this numbering in one vectorised pass. Unlike a
# per-group apply, it keeps the frame's flat index and never mutates a group,
# so the result does not depend on the pandas version's `group_keys` handling.
segments_flattened_ctx = (
    segments_flattened_leveled
    .reset_index()
    .sort_values(["user_id", "video_id", "local_start_time"])
    .assign(
        segment_idx=lambda df: df.groupby(["user_id", "video_id"]).cumcount()
    )
)

In [287]:
# Display the frame (pandas truncates the output to the first and last rows).
segments_flattened_ctx

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,segment_idx
0,U_10007,V_6181150,4,24.499,1.598293e+09,1.00,4.865,0
1,U_10007,V_6181150,4,46.930,1.598293e+09,1.00,29.809,1
2,U_10007,V_6181150,4,222.225,1.598293e+09,1.25,53.339,2
3,U_10007,V_6181152,5,34.456,1.598293e+09,1.00,4.530,0
4,U_10007,V_6181152,5,74.288,1.598293e+09,1.00,39.530,1
...,...,...,...,...,...,...,...,...
158195,U_998508,V_6377076,1031,176.250,1.600924e+09,1.25,139.000,2
158196,U_998508,V_6377076,1031,200.250,1.600924e+09,1.25,182.000,3
158197,U_998508,V_6377076,1031,208.000,1.600925e+09,1.25,190.000,4
158198,U_998508,V_6377079,1032,10.250,1.600925e+09,1.25,4.000,0


In [288]:
# Derive per-segment durations:
#   raw_duration      - end_point minus start_point (negative when the start
#                       point lies after the end point)
#   adjusted_duration - raw duration divided by the playback speed
#   start_time        - local_start_time parsed as seconds since the epoch
segment_duration = segments_flattened_ctx.assign(
    raw_duration=lambda df: df["end_point"] - df["start_point"],
    adjusted_duration=lambda df: df["raw_duration"] / df["speed"],
    start_time=lambda df: pd.to_datetime(df["local_start_time"], unit="s"),
)
                                                   

In [289]:
# Preview of the derived raw_duration / adjusted_duration / start_time columns.
segment_duration.head()

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,segment_idx,raw_duration,adjusted_duration,start_time
0,U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865,0,19.634,19.634,2020-08-24 18:17:28
1,U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809,1,17.121,17.121,2020-08-24 18:17:53
2,U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339,2,168.886,135.1088,2020-08-24 18:18:13
3,U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53,0,29.926,29.926,2020-08-24 18:23:26
4,U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53,1,34.758,34.758,2020-08-24 18:24:01


Exploring likely thresholds for a segment duration

In [290]:
# Share of segments per raw-duration bin (500 bins), positive durations only,
# listed in bin order.
segment_duration_count = (
    segment_duration
    .loc[segment_duration["raw_duration"] > 0, "raw_duration"]
    .value_counts(normalize=True, bins=500)
)
segment_duration_count.sort_index().reset_index()

Unnamed: 0,index,raw_duration
0,"(-2.4659999999999997, 4.937]",0.006858
1,"(4.937, 9.872]",0.453389
2,"(9.872, 14.807]",0.144250
3,"(14.807, 19.742]",0.023437
4,"(19.742, 24.677]",0.028695
...,...,...
495,"(2442.825, 2447.76]",0.000000
496,"(2447.76, 2452.695]",0.000000
497,"(2452.695, 2457.63]",0.000000
498,"(2457.63, 2462.565]",0.000000


In [291]:
# Same breakdown for the speed-adjusted durations (400 bins), positive values
# only. Note that this reuses the name `segment_duration_count`.
segment_duration_count = (
    segment_duration
    .loc[segment_duration["adjusted_duration"] > 0, "adjusted_duration"]
    .value_counts(normalize=True, bins=400)
)
segment_duration_count.sort_index().reset_index()

Unnamed: 0,index,adjusted_duration
0,"(-1.964, 4.915]",0.009303
1,"(4.915, 9.827]",0.561127
2,"(9.827, 14.74]",0.056019
3,"(14.74, 19.653]",0.028689
4,"(19.653, 24.566]",0.023685
...,...,...
395,"(1940.536, 1945.449]",0.000000
396,"(1945.449, 1950.362]",0.000000
397,"(1950.362, 1955.275]",0.000000
398,"(1955.275, 1960.187]",0.000000


#### Task 3 - Gaps between watched segments

In [None]:
# Add the wall-clock end of every segment: start timestamp plus the
# speed-adjusted watch duration, both as epoch seconds and as a datetime.
# The callables read the frame being built, so `end_time` can use the
# `local_end_time` column created just before it. The previous version read
# `segments_start_discounted` while it was still being defined, which raises
# NameError on a fresh kernel.
# NOTE(review): `segment_duration_filter` is not defined in the cells visible
# here; presumably it is the threshold-filtered `segment_duration` - confirm.
segments_start_discounted = segment_duration_filter.assign(
    local_end_time=lambda df: df["local_start_time"] + df["adjusted_duration"],
    end_time=lambda df: pd.to_datetime(df["local_end_time"], unit="s"),
)

In [None]:
# Gap between consecutive segments: the next row's start (within the same index
# label) minus this row's end, expressed as a timedelta. The last row of each
# group has no successor, so its gap is missing.
segment_watch_time_gaps = segments_start_discounted.assign(
    gap=lambda df: pd.to_timedelta(
        df.groupby([segment_duration_filter.index])["local_start_time"].shift(-1)
        - df["local_end_time"],
        unit="s",
    )
)

In [None]:
# Spot check: every row whose index label is 49.
segment_watch_time_gaps.loc[segment_watch_time_gaps.index == 49]

In [None]:
# Rows where (end - start) of the wall-clock timestamps is not exactly equal to
# the stored `duration`.
# NOTE(review): a `duration` column is not created by the cells visible here - confirm.
unequal_durations = segments_start_discounted.loc[
    lambda df: (df["local_end_time"] - df["local_start_time"]) != df["duration"]
]
unequal_with_diff = unequal_durations.assign(
    diff=lambda df: df["local_end_time"] - df["local_start_time"]
)
# Show a few mismatches that agree to within 1e-7, i.e. floating-point rounding.
unequal_with_diff.loc[
    lambda df: (df["duration"] - df["diff"]).abs() < .0000001
].head(3)

In [None]:
# Look at the last four rows.
segments_start_discounted[-4:]

In [None]:
# Manual spot check of one gap: start of the row at position 3 minus the end of
# the row at position 2.
segment_watch_time_gaps.iloc[3]["local_start_time"] - segment_watch_time_gaps.iloc[2]["local_end_time"]# == segment_watch_time_gaps[:2]
#["start_time_discounted"].diff().fillna(0).astype('timedelta64[s]')

In [None]:
# Without discounted watch duration: quantiles from 0.75 upwards in 0.01 steps.
# NOTE(review): `segment_watch_distances` is not assigned in any cell visible
# here, so this fails on a fresh kernel unless it is defined elsewhere - confirm.
segment_watch_distances.quantile(np.arange(.75, 1, 0.01))

In [None]:
# 94th percentile of the gaps without the watch-duration discount.
# NOTE(review): `segment_watch_distances` is not assigned in any visible cell - confirm.
segment_watch_distances.quantile(.94)

In [None]:
# With the watch duration accounted for: quantiles of `gap` from 0.75 upwards
# in 0.01 steps.
segment_watch_time_gaps["gap"].quantile(np.arange(.75, 1, 0.01))

In [None]:
# 94th percentile of `gap` (watch duration accounted for).
segment_watch_time_gaps["gap"].quantile(.94)

Accounting for the watch time barely changes the distribution, although it is shifted slightly to the left (reduced gaps).
If the gap threshold used for counting repetitions is set to 1 hour, each watching session is split into multiple sessions.

#### Task 4 - Removing repetition outliers

#### Task 5 - Filter videos to CS concepts

#### Task 6 - Apply all filters and merge back the data

In [None]:
# Code for cleaning - Used in script to be run long term
# Segment threshold filtering: keep only segments whose speed-adjusted duration
# exceeds the threshold (this also drops segments with a negative duration).
duration_thresh = 5
# Accounting for the speed the video is watched at
segments_durations = all_sequences["seq"].str["segment"].explode().transform(
    lambda row: (row["end_point"] - row["start_point"]) / row["speed"]
)
# Fixed: the mask previously read the undefined name `segments_duration`
# (missing trailing "s"), which raised NameError.
segments_threshold_filtered = segments_durations[segments_durations > duration_thresh]

### Feature extraction goals
**Basic**
* Number of segments
* Fraction of video watched -- Ignoring speed
* Weighted average speed - Complications as the number of "true seconds" watched might be unequal to the actual number of seconds due to the speed,

**Advanced**
* Video concepts - Downsize space to concepts within CS --> Need textual embedding, could just use string lookup to start with
    * For proper NLP embeddings, might have to distinguish the English and Chinese concepts if they are mixed, or exclude them