In [310]:
from constants import data_path, entities_path, relations_path, Path
import pandas as pd
import seaborn as sns
import numpy as np
import dask.dataframe as dd

In [243]:
sample_data = Path("mooc_cube_x-100k")

In [244]:
video_id2ccid = pd.read_csv(relations_path / "video_id-ccid.txt", sep="\t", names=["video_id", "ccid"])

In [323]:
concept2ccid = pd.read_csv(relations_path / "concept-video.txt", sep="\t", names=["concept_id", "ccid"])

In [245]:
# Exploring downsampled version first
user2video = pd.read_parquet(sample_data / "user-video.parquet.gzip")

### Cleaning goals
1. Count the overall number of segments to understand how many is cleaned - $158\, 200$
2. For each user-video: Remove segments less than `<THRESHOLD>`
    * A fraction of the segments actually have negative duration (start point is located after the end point)
    * The number fo segments are quite few ($ 658 / 158200 \approx 0.416\%$) -> Discarded
    * Based on distribution of the raw durations - ~46 % of segments are less than 10s, the threshold is set too: 5
        * ~0.7% < 5s, 5s < ~45% < 10s 
    * With watch speed accounted for, the distribution is quite similar, though more sections between \[5, 10\] seconds
        * ~0.9% < 5s, 5s < ~56% < 10s
3. For each user-video: Figure out distances between segments and distribution of those distances
    * For each of the segments for a given video - 94% have a start time less than ~1 hour for raw durations, adjusted for the end time of the video
    * Using adjusted duration: There's a huge difference (perhaps the first time around was wrong?) - Highly left adjusted
    * Still using threshold 1 hour -> Example some video sessions are maximium split into 21 "sessions"
    * **Result** Increased number of watching sessions from 36711 to 40798 (~11.1% increase) and the max number of sessions for a video_id is 20
4. Remove outliers of repeaters, e.g. set a threshold of 20 repetition of a given video - Explore distribution first
5. Map videos to only concepts related CS field
    * Roughly 42% of videos do not have a related concept field
    * $\gt 0.1\%$ of videos are missing a ccid
    1. Create mapping between chinese and english fields

In [320]:
dd_video_id2ccid = dd.from_pandas(video_id2ccid, npartitions=10)

In [312]:
dd_user2video = dd.from_pandas(user2video, npartitions=10)

In [324]:
dd_concept2ccid = dd.from_pandas(concept2ccid, npartitions=10)

In [317]:
all_sequences_basic = dd_user2video.explode("seq").reset_index()#.str["segment"]
all_sequences_basic = all_sequences_basic.assign(video_id=all_sequences_basic["seq"].str["video_id"])

In [321]:
#all_sequences = all_sequences_basic.merge(dd_video_id2ccid, on="video_id", how="left")
#all_sequences.head()

Unnamed: 0,index,seq,user_id,video_id,ccid
0,245611,"{'segment': [{'end_point': 25.1, 'local_start_...",U_33285100,V_7035471,19533419322C29289C33DC5901307461
1,245611,"{'segment': [{'end_point': 463.9, 'local_start...",U_33285100,V_7035503,86AF5644CF3626959C33DC5901307461
2,245673,"{'segment': [{'end_point': 206.9, 'local_start...",U_33286676,V_6217958,73F77C7861CBB9499C33DC5901307461
3,246079,"{'segment': [{'end_point': 41.0, 'local_start_...",U_33303278,V_6197617,62B45DECFEDCBD3A9C33DC5901307461
4,246207,"{'segment': [{'end_point': 786.0, 'local_start...",U_33310485,V_6202013,3B0266DA43F2F3869C33DC5901307461


In [327]:
# Task 1 - Segment Count
total_num_segments = all_sequences_basic["seq"].str["segment"].str.len().sum()
total_num_segments.compute()

158200

In [336]:
segments_meta_arg={'end_point': "float", 'local_start_time':"int64", 'speed': "float", 'start_point': "float"}

In [337]:
segments_flattened = all_sequences_basic["seq"].str["segment"].explode().apply(pd.Series, meta=segments_meta_arg)

In [338]:
segments_flattened.head()

Unnamed: 0,end_point,local_start_time,speed,start_point
0,869.0,1581880000.0,1.0,9.0
1,698.0,1581881000.0,1.0,9.0
2,138.0,1581882000.0,1.0,4.0
2,568.0,1581882000.0,1.0,148.0
3,158.0,1581882000.0,1.0,3.0


In [371]:
# Task 2 - Remove short segments - What is the distribution of segments
# Need to account for speed
# Fit for dd: need meta_arg

segment_explode_meta={"user_id": object, "video_id": object, "segment": object }
segments_df_meta_args = dict({"user_id": object, "video_id": object, "level_2": 'int64'}, **segments_meta_arg)
segments_df_meta_args
                             

{'user_id': object,
 'video_id': object,
 'level_2': 'int64',
 'end_point': 'float',
 'local_start_time': 'int64',
 'speed': 'float',
 'start_point': 'float'}

In [360]:
all_sequences_basic.compute()

Unnamed: 0,index,seq,user_id,video_id
0,49,"{'segment': [{'end_point': 869.0, 'local_start...",U_4243,V_1385204
1,49,"{'segment': [{'end_point': 698.0, 'local_start...",U_4243,V_1385205
2,49,"{'segment': [{'end_point': 138.0, 'local_start...",U_4243,V_1385206
3,49,"{'segment': [{'end_point': 158.0, 'local_start...",U_4243,V_1385207
4,79,"{'segment': [{'end_point': 24.499, 'local_star...",U_10007,V_6181150
...,...,...,...,...
2982,289264,"{'segment': [{'end_point': 16.0, 'local_start_...",U_34710701,V_6292686
2983,289264,"{'segment': [{'end_point': 655.0, 'local_start...",U_34710701,V_6292688
2984,289264,"{'segment': [{'end_point': 317.0, 'local_start...",U_34710701,V_6292689
2985,289302,"{'segment': [{'end_point': 14.9, 'local_start_...",U_34711221,V_6186685


##### TODO: FIX SO CELL RUNS AS DASK -> Slowest op so far

In [364]:
all_sequences = all_sequences_basic.compute()

In [388]:
segments_flattened_leveled = pd.DataFrame(all_sequences.groupby(["user_id", "video_id"]).apply(lambda x: x["seq"].str["segment"].explode()))#.apply(lambda x: pd.Series(x["seq"]), axis=1)
#, meta=segment_explode_meta)

In [373]:
dd_segments_flattened = dd.from_pandas(segments_flattened_leveled.reset_index(), npartitions=10)
dd_segments_flattened.head()

In [None]:
segments_flattened_leveled = segments_flattened_leveled.apply(lambda x: pd.Series(x["seq"]), axis=1)#, meta=segments_df_meta_args)
segments_flattened_leveled.head()

In [414]:
segments_flattened_ext = segments_flattened_leveled.reset_index().merge(video_id2ccid, on="video_id", how="left")#.merge(concept2ccid, on="ccid", how="outer")

In [415]:
segments_flattened_ext.head()

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid
0,U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865,A17BE683B6F2EAB59C33DC5901307461
1,U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809,A17BE683B6F2EAB59C33DC5901307461
2,U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461
3,U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53,439EAFC2EA7521699C33DC5901307461
4,U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53,439EAFC2EA7521699C33DC5901307461


In [416]:
def assign_segment_idx(x):
    x["segment_idx"] = np.arange(len(x))
    return x

In [None]:
# Not really used
segments_flattened_ctx = segments_flattened_ext.reset_index()\
    .sort_values(["user_id", "video_id", "local_start_time"])\
    .groupby(["user_id", "video_id"]).apply(assign_segment_idx)

In [420]:
segments_flattened_ext

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid
0,U_10007,V_6181150,4,24.499,1.598293e+09,1.00,4.865,A17BE683B6F2EAB59C33DC5901307461
1,U_10007,V_6181150,4,46.930,1.598293e+09,1.00,29.809,A17BE683B6F2EAB59C33DC5901307461
2,U_10007,V_6181150,4,222.225,1.598293e+09,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461
3,U_10007,V_6181152,5,34.456,1.598293e+09,1.00,4.530,439EAFC2EA7521699C33DC5901307461
4,U_10007,V_6181152,5,74.288,1.598293e+09,1.00,39.530,439EAFC2EA7521699C33DC5901307461
...,...,...,...,...,...,...,...,...
158195,U_998508,V_6377076,1031,176.250,1.600924e+09,1.25,139.000,1D1985A62D63D3FE9C33DC5901307461
158196,U_998508,V_6377076,1031,200.250,1.600924e+09,1.25,182.000,1D1985A62D63D3FE9C33DC5901307461
158197,U_998508,V_6377076,1031,208.000,1.600925e+09,1.25,190.000,1D1985A62D63D3FE9C33DC5901307461
158198,U_998508,V_6377079,1032,10.250,1.600925e+09,1.25,4.000,EC96CE1ED355A4319C33DC5901307461


In [421]:
segment_duration = segments_flattened_ext.assign(raw_duration=segments_flattened_ext["end_point"] - segments_flattened_ext["start_point"],
                                                 adjusted_duration = (segments_flattened_ext["end_point"] - segments_flattened_ext["start_point"]) / segments_flattened_ext["speed"],
                                                 start_time=pd.to_datetime(segments_flattened_ext['local_start_time'],unit='s'))
                                                   

In [422]:
segment_duration.head()

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid,raw_duration,adjusted_duration,start_time
0,U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865,A17BE683B6F2EAB59C33DC5901307461,19.634,19.634,2020-08-24 18:17:28
1,U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809,A17BE683B6F2EAB59C33DC5901307461,17.121,17.121,2020-08-24 18:17:53
2,U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461,168.886,135.1088,2020-08-24 18:18:13
3,U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53,439EAFC2EA7521699C33DC5901307461,29.926,29.926,2020-08-24 18:23:26
4,U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53,439EAFC2EA7521699C33DC5901307461,34.758,34.758,2020-08-24 18:24:01


Exploring likely thresholds for a segment duration

In [423]:
segment_duration_count = segment_duration[segment_duration['raw_duration'] > 0]['raw_duration'].value_counts(normalize=True, bins=500)
segment_duration_count.sort_index().reset_index()

Unnamed: 0,index,raw_duration
0,"(-2.4659999999999997, 4.937]",0.006858
1,"(4.937, 9.872]",0.453389
2,"(9.872, 14.807]",0.144250
3,"(14.807, 19.742]",0.023437
4,"(19.742, 24.677]",0.028695
...,...,...
495,"(2442.825, 2447.76]",0.000000
496,"(2447.76, 2452.695]",0.000000
497,"(2452.695, 2457.63]",0.000000
498,"(2457.63, 2462.565]",0.000000


In [448]:
segment_duration_count = segment_duration[segment_duration['adjusted_duration'] > 0]['adjusted_duration'].value_counts(normalize=True, bins=400)
segment_duration_count.sort_index().reset_index()[:10]

Unnamed: 0,index,adjusted_duration
0,"(-1.964, 4.915]",0.009303
1,"(4.915, 9.827]",0.561127
2,"(9.827, 14.74]",0.056019
3,"(14.74, 19.653]",0.028689
4,"(19.653, 24.566]",0.023685
5,"(24.566, 29.478]",0.021056
6,"(29.478, 34.391]",0.048589
7,"(34.391, 39.304]",0.027089
8,"(39.304, 44.217]",0.012795
9,"(44.217, 49.129]",0.011614


#### Task 2 - Duration between segment watching

In [425]:
duration_threshold = 5
segment_duration_filter = segment_duration[segment_duration["adjusted_duration"] >= 5]
segment_duration_filter

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid,raw_duration,adjusted_duration,start_time
0,U_10007,V_6181150,4,24.499,1.598293e+09,1.00,4.865,A17BE683B6F2EAB59C33DC5901307461,19.634,19.6340,2020-08-24 18:17:28
1,U_10007,V_6181150,4,46.930,1.598293e+09,1.00,29.809,A17BE683B6F2EAB59C33DC5901307461,17.121,17.1210,2020-08-24 18:17:53
2,U_10007,V_6181150,4,222.225,1.598293e+09,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461,168.886,135.1088,2020-08-24 18:18:13
3,U_10007,V_6181152,5,34.456,1.598293e+09,1.00,4.530,439EAFC2EA7521699C33DC5901307461,29.926,29.9260,2020-08-24 18:23:26
4,U_10007,V_6181152,5,74.288,1.598293e+09,1.00,39.530,439EAFC2EA7521699C33DC5901307461,34.758,34.7580,2020-08-24 18:24:01
...,...,...,...,...,...,...,...,...,...,...,...
158195,U_998508,V_6377076,1031,176.250,1.600924e+09,1.25,139.000,1D1985A62D63D3FE9C33DC5901307461,37.250,29.8000,2020-09-24 05:12:34
158196,U_998508,V_6377076,1031,200.250,1.600924e+09,1.25,182.000,1D1985A62D63D3FE9C33DC5901307461,18.250,14.6000,2020-09-24 05:13:09
158197,U_998508,V_6377076,1031,208.000,1.600925e+09,1.25,190.000,1D1985A62D63D3FE9C33DC5901307461,18.000,14.4000,2020-09-24 05:15:49
158198,U_998508,V_6377079,1032,10.250,1.600925e+09,1.25,4.000,EC96CE1ED355A4319C33DC5901307461,6.250,5.0000,2020-09-24 05:16:27


In [426]:
segments_start_discounted = segment_duration_filter.assign(local_end_time=segment_duration_filter["local_start_time"] + segment_duration_filter["adjusted_duration"])
segments_end_time = segments_start_discounted.assign(end_time=pd.to_datetime(segments_start_discounted["local_end_time"], unit="s"))
segments_end_time.head()

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid,raw_duration,adjusted_duration,start_time,local_end_time,end_time
0,U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865,A17BE683B6F2EAB59C33DC5901307461,19.634,19.634,2020-08-24 18:17:28,1598293000.0,2020-08-24 18:17:47.634000128
1,U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809,A17BE683B6F2EAB59C33DC5901307461,17.121,17.121,2020-08-24 18:17:53,1598293000.0,2020-08-24 18:18:10.120999936
2,U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461,168.886,135.1088,2020-08-24 18:18:13,1598293000.0,2020-08-24 18:20:28.108800000
3,U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53,439EAFC2EA7521699C33DC5901307461,29.926,29.926,2020-08-24 18:23:26,1598293000.0,2020-08-24 18:23:55.926000128
4,U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53,439EAFC2EA7521699C33DC5901307461,34.758,34.758,2020-08-24 18:24:01,1598293000.0,2020-08-24 18:24:35.757999872


In [427]:
segment_watch_time_gaps = segments_end_time.assign(gap=pd.to_timedelta((segments_end_time\
                                                           .groupby(["user_id", "video_id"])["local_start_time"].shift(-1) - segments_end_time["local_end_time"]), unit="s"))
segment_watch_time_gaps.head()

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid,raw_duration,adjusted_duration,start_time,local_end_time,end_time,gap
0,U_10007,V_6181150,4,24.499,1598293000.0,1.0,4.865,A17BE683B6F2EAB59C33DC5901307461,19.634,19.634,2020-08-24 18:17:28,1598293000.0,2020-08-24 18:17:47.634000128,0 days 00:00:05.365999937
1,U_10007,V_6181150,4,46.93,1598293000.0,1.0,29.809,A17BE683B6F2EAB59C33DC5901307461,17.121,17.121,2020-08-24 18:17:53,1598293000.0,2020-08-24 18:18:10.120999936,0 days 00:00:02.878999949
2,U_10007,V_6181150,4,222.225,1598293000.0,1.25,53.339,A17BE683B6F2EAB59C33DC5901307461,168.886,135.1088,2020-08-24 18:18:13,1598293000.0,2020-08-24 18:20:28.108800000,NaT
3,U_10007,V_6181152,5,34.456,1598293000.0,1.0,4.53,439EAFC2EA7521699C33DC5901307461,29.926,29.926,2020-08-24 18:23:26,1598293000.0,2020-08-24 18:23:55.926000128,0 days 00:00:05.073999882
4,U_10007,V_6181152,5,74.288,1598293000.0,1.0,39.53,439EAFC2EA7521699C33DC5901307461,34.758,34.758,2020-08-24 18:24:01,1598293000.0,2020-08-24 18:24:35.757999872,0 days 00:00:05.242000103


In [None]:
# Exploring calculated differences for adjusted time
unequal_durations = segments_start_discounted[(segments_start_discounted["local_end_time"] - segments_start_discounted["local_start_time"]) != segments_start_discounted["raw_duration"]]
unequal_with_diff = unequal_durations.assign(diff=unequal_durations["local_end_time"] - unequal_durations["local_start_time"])
# Differences are according to floating point differences
unequal_with_diff[(unequal_with_diff["raw_duration"] - unequal_with_diff["diff"]).abs() < .0000001].head(3)

In [277]:
#segments_start_discounted[-4:]

In [278]:
segment_watch_time_gaps.iloc[1]["local_start_time"] - segment_watch_time_gaps.iloc[0]["local_end_time"]# == segment_watch_time_gaps[:2]
#["start_time_discounted"].diff().fillna(0).astype('timedelta64[s]')

5.365999937057495

In [428]:
segment_watch_time_gaps["gap"].quantile(np.arange(.75, 1, 0.01))

0.75             0 days 00:01:05
0.76             0 days 00:01:10
0.77             0 days 00:01:15
0.78             0 days 00:01:20
0.79             0 days 00:01:25
0.80             0 days 00:01:30
0.81             0 days 00:01:35
0.82             0 days 00:01:40
0.83             0 days 00:01:45
0.84             0 days 00:01:55
0.85             0 days 00:02:05
0.86             0 days 00:02:12
0.87             0 days 00:02:25
0.88             0 days 00:02:36
0.89             0 days 00:02:52
0.90   0 days 00:03:11.232335138
0.91             0 days 00:03:35
0.92             0 days 00:04:10
0.93             0 days 00:05:00
0.94             0 days 00:06:15
0.95   0 days 00:08:33.319999956
0.96   0 days 00:13:24.994000005
0.97   0 days 00:28:54.584000062
0.98      0 days 02:58:37.320000
0.99   1 days 04:11:39.759999941
Name: gap, dtype: timedelta64[ns]

In [429]:
segment_watch_time_gaps["gap"].quantile(.94)

Timedelta('0 days 00:06:15')

In [430]:
segment_watch_time_gaps["gap"].describe()

count                         118579
mean       0 days 03:44:23.626128106
std        2 days 15:42:48.310360245
min      -1 days +23:54:28.700000048
25%                  0 days 00:00:05
50%                  0 days 00:00:20
75%                  0 days 00:01:05
max      151 days 00:44:44.950000048
Name: gap, dtype: object

In [431]:
segment_watch_time_gaps[segment_watch_time_gaps["gap"].min() == segment_watch_time_gaps["gap"]]
segment_watch_time_gaps[segment_watch_time_gaps.index.isin([124672,124673])]

Unnamed: 0,user_id,video_id,level_2,end_point,local_start_time,speed,start_point,ccid,raw_duration,adjusted_duration,start_time,local_end_time,end_time,gap
124672,U_31508917,V_1646765,4427,933.5,1598757000.0,1.0,239.2,5C085A7107CA1DD89C33DC5901307461,694.3,694.3,2020-08-30 03:09:21,1598758000.0,2020-08-30 03:20:55.300000000,-1 days +23:54:28.700000048
124673,U_31508917,V_1646765,4427,1059.6,1598757000.0,2.0,939.7,5C085A7107CA1DD89C33DC5901307461,119.9,59.95,2020-08-30 03:15:24,1598757000.0,2020-08-30 03:16:23.950000128,NaT


The accounting for the watch time barely makes a difference in the distribution, though the distribution is slightly shifted to the left (reduced gap)
By setting the threshold on repetition count is 1 hour, each watching session is split into multiple sessions.

**At least 1 part where a user saw multiple videos at once -- Perhaps removed with repeaters**
Try to figure out how common

In [283]:
#segment_watch_time_gaps.assign(interval=pd.IntervalIndex.from_arrays(segment_watch_time_gaps["start_time"], segment_watch_time_gaps["end_time"], closed='both'))#.groupby("user_id")[["start_time", "end_time"]].apply(lambda x: pd.IntervalIndex.from_arrays)

##### Split repetition based on gap into separate sessions

In [432]:
def split_watch_sessions(x):
    """Very slow implementation"""
    session_counter = 0
    session_thresh = 3600
    for idx in x.index:
        x.loc[idx, "session_id"] = session_counter
        if x.loc[idx, "gap"].seconds >= session_thresh:
            session_counter += 1
    return x
            
    # ALTERNATIVE
    #last_segments_in_session = x[x["gap"].dt.seconds >= 3600].index
    #if (num_segments:=last_segments_in_session.shape[0]) > 0:
    #    x.loc[last_segments_in_session, "gap_sep"] = np.random.choice(160000, num_segments)
    #x.loc[:, "gap_sep"].fillna(method="bfill")
    return x

TODO: Improve this gap separation technique

In [433]:
segment_w_gap_sep = segment_watch_time_gaps.assign(session_id=0)
segment_w_gap_sep = segment_w_gap_sep.groupby(["user_id", "video_id"]).apply(split_watch_sessions)# ROLL CHECK IF GAP IS LARGER THAN > threshold -> Assign value, update counter

In [434]:
segment_w_gap_sep.groupby(["user_id", "video_id"])["session_id"].nunique().sum()

40798

In [435]:
segment_w_gap_sep.groupby(["user_id", "video_id"]).apply(list).count()

36711

In [436]:
segment_w_gap_sep["session_id"].max()

20

#### Task 4 - Removing repetition outliers

In [446]:
# Perhaps group by ccid, user_id -> count session_id -> Sum the session counts
# For each user_id, video_id: sum count session_ids as num_sessions
# For each ccid, user_id -> Sum the num_sessions
# compared to 
session_counts = segment_w_gap_sep.groupby(["user_id", "video_id"])["session_id"].nunique().rename("session_count").reset_index()
session_counts


Unnamed: 0,user_id,video_id,session_count
0,U_10007,V_6181150,1
1,U_10007,V_6181152,1
2,U_100294,V_6223480,1
3,U_100294,V_6223485,1
4,U_100294,V_6223486,1
...,...,...,...
36706,U_998508,V_6377054,1
36707,U_998508,V_6377055,1
36708,U_998508,V_6377056,1
36709,U_998508,V_6377076,2


In [447]:
segment_w_gap_sep.merge(session_counts, on=["user_id, video_id"])

KeyError: 'user_id, video_id'

In [440]:
watching_sessions_unique = segment_w_gap_sep[["user_id", "video_id", "session_id", "ccid"]].unique()
watching_sessions_unique

AttributeError: 'DataFrame' object has no attribute 'unique'

In [None]:
# 

In [None]:
user_video_count = user2ccid.value_counts(["ccid", "user_id"]).reset_index().rename(columns={0: "num_watched"})
user_video_count

#### Task 5 - Filter videos to CS concepts

#### Task 6 - Apply all filteres and merge back the data

In [290]:
# Code for cleaning - Used in script to be run long term
# Segment threshold filtering
duration_thresh = 5
# Accounting for the speed the video is watched at
segments_durations = all_sequences["seq"].str["segment"].explode().transform(lambda row: (row["end_point"] - row["start_point"]) / row["speed"])
segments_threshold_filtered = segments_durations[segments_duration > duration_thresh] 
# 

NameError: name 'segments_duration' is not defined

### Feature extraction goals
**Basic**
* Number of segments
* Fraction of video watched -- Ignoring speed
* Weighted average speed - Complications as the number of "true seconds" watched might be unequal to the actual number of seconds due to the speed,

**Advanced**
* Video concepts - Downsize space to concepts within CS --> Need textual embedding, could just use string lookup to start with
    * For proper NLP embeddings, might have to distinguish the english and chinese concepts, if they are mixed, or exclude them