# MOOCCubeX 03 - Dataset creation
1. Remove cold users
2. Slice sequences
3. Explode the concept ids, split them into concept and field, and merge both back in
4. Encode and create embedding tables
5. Split datasets
6. Feature-scale the validation and test sets based on the training scales
7. Listify

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import dask.dataframe as dd
import os
from multiprocesspandas import applyparallel  # noqa
from tqdm import tqdm
tqdm.pandas()

# SLURM only exports SLURM_JOB_CPUS_PER_NODE inside a job allocation, and the
# value may look like "16(x2)" or "16,8" for multi-node jobs. Fall back to a
# single core when the variable is absent and keep only the leading count,
# so the notebook also starts outside SLURM.
_slurm_cpus = os.environ.get("SLURM_JOB_CPUS_PER_NODE", "1")
cpu_count = int(_slurm_cpus.split(",")[0].split("(")[0])
if cpu_count > 1:
    print(f"{cpu_count} cores are available -> Using multiprocesspandas")
    apply_fn = "apply_parallel"
else:
    apply_fn = "progress_apply"


def get_apply_op(df):
    """Return the apply-style method of `df` selected by `apply_fn`.

    `df` is any object exposing the chosen method, e.g. a DataFrame or a
    groupby object; the bound method is returned so it can be called directly.
    """
    return getattr(df, apply_fn)


In [2]:
# pathlib never expands "~" on its own, so resolve it explicitly; otherwise
# any reader that does not expand the home directory itself would look for a
# literal "~" folder.
mooc_path = Path("~/fall_project/MOOCCubeX").expanduser()
relations_path = mooc_path / "relations"

In [3]:
# Column / index-level names shared by all cells below.
ITEM_COL = "item_id"  # video identifier (renamed from "video_id" when loading)
USER_COL = "user_id"  # index level identifying the user
TIME_COL = "timestamp"  # renamed from "local_start_time" when loading
SESSION_COL = "session_id"  # index level
CONSECUTIVE_COL = "item_consecutive_id"  # index level

In [6]:
%%time
sessions_df = dd.read_parquet(relations_path / "sessions_featured_fix").drop(columns=["start", "end", "speed", "local_end_time", "forward_gap", "duration", "name", "length", "backward_gap_td"]).rename(columns={"video_id": "item_id", "local_start_time": "timestamp"}).compute()
sessions_df

CPU times: user 4.77 s, sys: 1.14 s, total: 5.91 s
Wall time: 7.02 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,std_speed,avg_speed,eff_speed,seg_rep_0,seg_rep_1,...,seg_rep_60,time_comp,time_played,replay_length,skipped_length,concept_id,item_id,ccid,timestamp,backward_gap
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
U_10012383,1,0,4610.0,83.0,0.0,83.0,35.00,0.0,1.00,0.0,0.0,0.0,...,0.0,510.00,510.00,0.0,4100.00,[],V_6043038,D5404B143F1DBF5F9C33DC5901307461,1597690842,10.0
U_10012383,2,0,4055.0,38.0,0.0,38.0,15.00,0.0,1.00,0.0,0.0,0.0,...,0.0,1905.00,1905.00,0.0,1139.00,[],V_6043053,304D06F0510274079C33DC5901307461,1597713365,50.0
U_10012383,3,0,75.0,0.0,0.0,0.0,,,1.00,0.0,0.0,0.0,...,0.0,75.00,75.00,0.0,0.00,[],V_6043035,107FD4E4305C0D599C33DC5901307461,1597720407,
U_10012383,4,0,700.1,2.0,0.0,2.0,47.45,0.0,1.00,0.0,0.0,0.0,...,0.0,605.20,605.20,0.0,94.80,[],V_6043036,C99194C4BF0F06189C33DC5901307461,1597743216,30.0
U_10012383,5,0,1540.5,5.0,0.0,5.0,30.00,0.0,1.00,0.0,0.0,0.0,...,0.0,1380.50,1380.50,0.0,160.00,[],V_6010430,B6D85659921EC0759C33DC5901307461,1597747231,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U_99746,1,1,280.0,2.0,0.0,2.0,54.60,0.0,1.25,0.0,0.0,0.0,...,0.0,213.50,213.50,0.0,12.50,[],V_4067716,550A64E8D92F96559C33DC5901307461,1597814088,1572.2
U_99746,2,0,378.8,8.0,0.0,8.0,4.80,0.0,1.25,0.0,0.0,0.0,...,0.0,425.00,425.00,0.0,48.00,[],V_4067719,AEEB26CD71B5F4D89C33DC5901307461,1597814435,5.2
U_99746,3,0,536.4,4.0,0.0,4.0,37.40,0.0,1.25,0.0,0.0,0.0,...,0.0,330.00,330.00,0.0,27.00,[],V_4067720,867644A875832E2F9C33DC5901307461,1598154494,192.4
U_99746,4,0,324.8,1.0,0.0,1.0,279.60,0.0,1.25,0.0,0.0,0.0,...,0.0,56.50,56.50,0.0,15.75,[],V_6188761,D292EAA78D90928C9C33DC5901307461,1601742546,279.6


In [7]:
# (session rows, distinct users, distinct videos) before any filtering
sessions_df.shape[0], sessions_df.index.get_level_values(USER_COL).nunique(), sessions_df[ITEM_COL].nunique()

(2213674, 304807, 186670)

# Dataset creation

### Remove cold users

In [8]:
# A user is "warm" when they have at least this many session rows.
warm_thresh = 5
user_session_count = sessions_df.groupby(USER_COL).size()
is_warm = user_session_count >= warm_thresh
warm_users = user_session_count.loc[is_warm]
warm_users

user_id
U_10001587     6
U_10008027     6
U_10012257     5
U_10012383    18
U_10013620     5
              ..
U_998508      21
U_9988528     16
U_9989964      6
U_9996819      6
U_9999820      6
Length: 116661, dtype: int64

In [9]:
# Keep only the rows whose user passed the warm-user threshold.
row_users = sessions_df.index.get_level_values(USER_COL)
sessions_warm_df = sessions_df.loc[row_users.isin(warm_users.index)]
sessions_warm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,std_speed,avg_speed,eff_speed,seg_rep_0,seg_rep_1,...,seg_rep_60,time_comp,time_played,replay_length,skipped_length,concept_id,item_id,ccid,timestamp,backward_gap
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
U_10012383,1,0,4610.0,83.0,0.0,83.0,35.00,0.0,1.00,0.0,0.0,0.0,...,0.0,510.00,510.00,0.0,4100.00,[],V_6043038,D5404B143F1DBF5F9C33DC5901307461,1597690842,10.0
U_10012383,2,0,4055.0,38.0,0.0,38.0,15.00,0.0,1.00,0.0,0.0,0.0,...,0.0,1905.00,1905.00,0.0,1139.00,[],V_6043053,304D06F0510274079C33DC5901307461,1597713365,50.0
U_10012383,3,0,75.0,0.0,0.0,0.0,,,1.00,0.0,0.0,0.0,...,0.0,75.00,75.00,0.0,0.00,[],V_6043035,107FD4E4305C0D599C33DC5901307461,1597720407,
U_10012383,4,0,700.1,2.0,0.0,2.0,47.45,0.0,1.00,0.0,0.0,0.0,...,0.0,605.20,605.20,0.0,94.80,[],V_6043036,C99194C4BF0F06189C33DC5901307461,1597743216,30.0
U_10012383,5,0,1540.5,5.0,0.0,5.0,30.00,0.0,1.00,0.0,0.0,0.0,...,0.0,1380.50,1380.50,0.0,160.00,[],V_6010430,B6D85659921EC0759C33DC5901307461,1597747231,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U_99746,1,1,280.0,2.0,0.0,2.0,54.60,0.0,1.25,0.0,0.0,0.0,...,0.0,213.50,213.50,0.0,12.50,[],V_4067716,550A64E8D92F96559C33DC5901307461,1597814088,1572.2
U_99746,2,0,378.8,8.0,0.0,8.0,4.80,0.0,1.25,0.0,0.0,0.0,...,0.0,425.00,425.00,0.0,48.00,[],V_4067719,AEEB26CD71B5F4D89C33DC5901307461,1597814435,5.2
U_99746,3,0,536.4,4.0,0.0,4.0,37.40,0.0,1.25,0.0,0.0,0.0,...,0.0,330.00,330.00,0.0,27.00,[],V_4067720,867644A875832E2F9C33DC5901307461,1598154494,192.4
U_99746,4,0,324.8,1.0,0.0,1.0,279.60,0.0,1.25,0.0,0.0,0.0,...,0.0,56.50,56.50,0.0,15.75,[],V_6188761,D292EAA78D90928C9C33DC5901307461,1601742546,279.6


In [10]:
# (session rows, distinct users, distinct videos) after removing cold users
sessions_warm_df.shape[0], sessions_warm_df.index.get_level_values(USER_COL).nunique(), sessions_warm_df[ITEM_COL].nunique()

(1836070, 116661, 165881)

### Slice and fill na

In [4]:
import scipy.stats as ss

In [4]:
# Repeats the tqdm setup from the first cell so this section can be run on its own.
from tqdm import tqdm
tqdm.pandas()

In [5]:
# Maximum number of session rows kept per user when the sequences are sliced.
MAX_SEQUENCE_LEN = 30

In [14]:
# Percentile rank of MAX_SEQUENCE_LEN within the per-user session counts,
# i.e. how much of the user base fits into the chosen sequence length.
sessions_per_user = (
    sessions_warm_df
    .reset_index(level=SESSION_COL)
    .groupby([USER_COL])
    .size()
)
ss.percentileofscore(sessions_per_user, MAX_SEQUENCE_LEN)

89.04046767985874

In [15]:
%%time
sessions_group = sessions_warm_df.sort_values(TIME_COL).groupby(USER_COL, group_keys=False, sort=False)
sessions_sliced = get_apply_op(sessions_group)(lambda group: group.head(MAX_SEQUENCE_LEN))

CPU times: user 38.1 s, sys: 1.03 s, total: 39.1 s
Wall time: 39.7 s


In [16]:
# Missing pause / speed / backward-gap statistics are replaced by zero.
# "concept_id" is deliberately kept: it is split into concepts and fields below.
zero_fill_cols = ["median_pause", "std_speed", "backward_gap"]
sessions_filled = sessions_sliced.fillna({col: 0 for col in zero_fill_cols})

In [17]:
# (session rows, distinct users, distinct videos) after slicing to MAX_SEQUENCE_LEN
sessions_filled.shape[0], sessions_filled.index.get_level_values(USER_COL).nunique(), sessions_filled[ITEM_COL].nunique()

(1461684, 116661, 158358)

### Embedding fields and concepts - MOOC Specific
* Must aggregate concepts and fields -> Simple per-dim average
* Not distinguishing between Chinese and non-Chinese concepts, so vectors are also generated for OOV tokens

#### Split concept ids

In [6]:
# Names of the two list-valued columns derived from "concept_id".
CONCEPT_COL = "concepts"
FIELD_COL = "fields"

In [19]:
%%time
# Replace the list column "concept_id" by two list columns (CONCEPT_COL, FIELD_COL):
#   1. explode()        -> one row per concept id (an empty list becomes NaN)
#   2. .str[2:]         -> drop the first two characters (presumably an id prefix — TODO confirm)
#   3. rsplit("_", 1)   -> split at the last underscore into column 0 and column 1
#   4. replace/fillna   -> empty strings and other missing markers become np.nan
#   5. str.strip()      -> remove surrounding whitespace in both columns
#   6. groupby(level=[0,1,2]).agg(list) -> collect the pieces back into one list per original row
# assign(**frame) then adds both columns, and the source column is dropped.
sessions_concepts = sessions_filled.assign(**(sessions_filled["concept_id"].explode()
                                .str[2:].str.rsplit("_", n=1, expand=True)
                                .replace("", np.nan).fillna(value=np.nan)
                                .transform(lambda col: col.str.strip())
                                .rename(columns={0:CONCEPT_COL, 1:FIELD_COL})
                                .groupby(level=[0,1,2]).agg(list))
                                           ).drop(columns="concept_id")
sessions_concepts

CPU times: user 1min 48s, sys: 4.04 s, total: 1min 52s
Wall time: 1min 59s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,std_speed,avg_speed,eff_speed,seg_rep_0,seg_rep_1,...,time_comp,time_played,replay_length,skipped_length,item_id,ccid,timestamp,backward_gap,concepts,fields
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
U_10001587,1,0,306.90,0.0,0.0,0.0,0.000,0.000000,2.000000,0.000000,0.0,0.0,...,613.8,613.8,0.0,0.0,V_6043067,41C0E8B591E095359C33DC5901307461,1597743218,0.00,[nan],[nan]
U_10001587,2,0,2743.00,14.0,2.0,16.0,30.000,0.242536,1.986347,-0.013653,4.0,4.0,...,3934.6,4028.3,93.7,367.4,V_6043065,5B96029FBD4A8CF09C33DC5901307461,1597811595,30.25,[nan],[nan]
U_10001587,3,0,20.00,0.0,0.0,0.0,0.000,0.000000,2.000000,0.000000,0.0,0.0,...,40.0,40.0,0.0,0.0,V_6012901,52EDC2D2B77883B99C33DC5901307461,1597815482,0.00,[nan],[nan]
U_10001587,3,1,20.00,0.0,0.0,0.0,0.000,0.000000,2.000000,0.000000,0.0,0.0,...,40.0,40.0,0.0,0.0,V_6012901,52EDC2D2B77883B99C33DC5901307461,1597824712,9210.00,[nan],[nan]
U_10001587,3,2,3452.75,4.0,1.0,4.0,488.175,0.516398,1.793915,0.793915,1.0,1.0,...,2501.7,2507.7,6.0,199.9,V_6012901,52EDC2D2B77883B99C33DC5901307461,1597829829,5097.00,[nan],[nan]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U_9999820,1,1,1010.00,4.0,0.0,4.0,30.000,0.000000,1.000000,0.000000,0.0,0.0,...,885.0,885.0,0.0,125.0,V_7386531,1BAE71E966D980C29C33DC5901307461,1601437095,2350.00,[nan],[nan]
U_9999820,2,0,500.00,2.0,0.0,2.0,29.950,0.000000,1.000000,0.000000,0.0,0.0,...,440.1,440.1,0.0,59.9,V_7386532,AD99106BD3AD4A789C33DC5901307461,1601451341,29.90,[nan],[nan]
U_9999820,3,0,260.00,0.0,0.0,0.0,0.000,0.000000,2.000000,0.000000,0.0,0.0,...,520.0,520.0,0.0,0.0,V_7386534,8D21D8DA741CBD639C33DC5901307461,1601468512,0.00,[nan],[nan]
U_9999820,4,0,2150.00,4.0,0.0,4.0,45.000,0.000000,2.000000,0.000000,0.0,0.0,...,2650.0,2650.0,0.0,1650.0,V_7386535,81FDDCC818CF26339C33DC5901307461,1601474220,30.00,[nan],[nan]


### Create embedding tables
Have to use the encoded strings as integer lookups in a manually created lookup table

In [8]:
import fasttext
import pickle

In [21]:
# Load the fastText model from the working directory.
# NOTE(review): the file name suggests 64-dimensional vectors, which must match TXT_EMBED_DIM — confirm.
emb_table = fasttext.load_model("cc.zh.64.bin")



`print-sentence-vectors` does not echo the input sentence, so the vectors have to be matched to their sentences by the order of the input lines

In [22]:
%%time
unique_fields = sessions_concepts[FIELD_COL].explode().unique()
unique_concepts = sessions_concepts[CONCEPT_COL].explode().unique()

CPU times: user 3.69 s, sys: 171 ms, total: 3.87 s
Wall time: 3.88 s


In [23]:
# Number of distinct fields and concepts (each count includes the NaN entry)
len(unique_fields), len(unique_concepts)

(73, 159319)

#### Encoding of fields

In [24]:
# Normalise every missing-value marker in "ccid" (e.g. None) to np.nan.
# No column is dropped in this cell.
ccid_with_nan = sessions_concepts["ccid"].fillna(value=np.nan)
sessions_concepts.loc[:, "ccid"] = ccid_with_nan

In [25]:
%%time
# Concept mapping, saving 0 for padding
concept2int = {val: i for i, val in enumerate(unique_concepts, start=0)}
# Manually map videos without related concepts to - Already included -> Start at 0 instead
#concept2int[np.nan] = 0
int2concept = {i: concept for concept, i in concept2int.items()}

# Field mapping, saving 0 for padding
field2int = {val: i for i, val in enumerate(unique_fields, start=0)}
# Manually map videos without related concepts - Already included
#field2int[np.nan] = 0
int2field = {i: field for field, i in field2int.items()}

# Video id mapping, saving 0 for padding
video_id2int = {val: i for i, val in enumerate(sessions_concepts[ITEM_COL].unique(), start=1)}
int2video_id = {i: video_id for video_id, i in video_id2int.items()}

# CCID mapping, sorting to have np.nan at index 0
ccid2int = {val: i for i, val in enumerate(sessions_concepts["ccid"].dropna().unique(), start=1)}
ccid2int[np.nan] = 0
int2ccid = {i: ccid for ccid, i in ccid2int.items()}

CPU times: user 339 ms, sys: 992 µs, total: 340 ms
Wall time: 344 ms


In [26]:
# Largest id of every lookup. For concepts and fields this excludes the NaN
# entry, which is mapped to 0.
MAX_VIDEO_ID = max(video_id2int.values())
MAX_CONCEPT = max(concept2int.values())
MAX_FIELD = max(field2int.values())
MAX_CCID = max(ccid2int.values())
MAX_VIDEO_ID, MAX_CCID, MAX_CONCEPT, MAX_FIELD

(158358, 78972, 159318, 72)

In [11]:
from utils.io import save_enc

In [28]:
embeddings_path = Path("embeddings")
embeddings_path.mkdir(exist_ok=True)
# Persist every value -> id lookup next to the embedding tables.
encoder_files = (
    (concept2int, "concept2int.json"),
    (field2int, "field2int.json"),
    (video_id2int, "video_id2int.json"),
    (ccid2int, "ccid2int.json"),
)
for encoder, file_name in encoder_files:
    save_enc(encoder, embeddings_path / file_name)

In [7]:
TXT_EMBED_DIM = 64  # length of the zero vector stored for the padding id
PAD_TOKEN = 0  # id used for padding in the concept / field lists
MAX_CONCEPT_LENGTH = 10  # concept / field lists are padded or cut to this length

There were some problems storing concepts as strings in a file. The cause is unknown, but it is likely an encoding issue.

In [30]:
# Every concept / field must either be the float NaN marker or a string
# without surrounding whitespace.
def _is_clean(token):
    return isinstance(token, float) or token.strip() == token


assert all(_is_clean(field) for field in unique_fields)
assert all(_is_clean(concept) for concept in unique_concepts)

In [31]:
%%time
# Skipping first, which is NaN
embedded_concepts = np.array([emb_table.get_sentence_vector(concept) for concept in unique_concepts[1:]])
embedded_fields = np.array([emb_table.get_sentence_vector(field) for field in unique_fields[1:]])

CPU times: user 904 ms, sys: 1.07 ms, total: 905 ms
Wall time: 908 ms


In [32]:
# Token -> embedding vector; the leading NaN entry is skipped on both sides.
field2float = {
    field: vector for field, vector in zip(unique_fields[1:], embedded_fields)
}
concept2float = {
    concept: vector for concept, vector in zip(unique_concepts[1:], embedded_concepts)
}

In [33]:
# Switch for writing the embedding pickles and tables to disk.
SAVE = True

In [34]:
# Integer id -> embedding vector, plus an all-zero vector for the padding id.
concept_enc2float = {}
for concept, embed in concept2float.items():
    concept_enc2float[concept2int[concept]] = embed
concept_enc2float[PAD_TOKEN] = np.zeros(TXT_EMBED_DIM)

field_enc2float = {}
for field, embed in field2float.items():
    field_enc2float[field2int[field]] = embed
field_enc2float[PAD_TOKEN] = np.zeros(TXT_EMBED_DIM)

In [35]:
if SAVE:
    # Store the id -> vector dictionaries as pickles.
    pickle_targets = (
        ("concept-enc-embedding-sentence.pickle", concept_enc2float),
        ("field-enc-embedding-sentence.pickle", field_enc2float),
    )
    for file_name, enc2float in pickle_targets:
        with open(embeddings_path / file_name, "wb") as f:
            pickle.dump(enc2float, f)

In [36]:
fields_as_list = sorted(field_enc2float.items(), key=lambda x: x[0])
concepts_as_list = sorted(concept_enc2float.items(), key=lambda x: x[0])
# The embedding tables are built by stacking these lists, so row i of a table
# must belong to id i. That only holds when the ids are exactly 0..n-1; the
# previous check compared each dictionary entry with itself and could not fail.
assert [idx for idx, _ in fields_as_list] == list(range(len(fields_as_list))), \
    "field ids are not contiguous from 0"
assert [idx for idx, _ in concepts_as_list] == list(range(len(concepts_as_list))), \
    "concept ids are not contiguous from 0"

In [37]:
# Stack the id-sorted vectors into 2D weight matrices (row index == id).
field_embed_table = np.concatenate(
    [vector[np.newaxis, :] for _, vector in fields_as_list]
)
concept_embed_table = np.concatenate(
    [vector[np.newaxis, :] for _, vector in concepts_as_list]
)

In [38]:
# One row per id and one column per embedding dimension
field_embed_table.shape, concept_embed_table.shape

((73, 64), (159319, 64))

In [39]:
from utils.io import save_embedding_table
# Write the weight matrices themselves; each row corresponds to the encoded id.
if SAVE:
    tables_by_column = (
        (CONCEPT_COL, concept_embed_table),
        (FIELD_COL, field_embed_table),
    )
    for column_name, table in tables_by_column:
        save_embedding_table(embeddings_path, column_name, table)

#### Encoding columns

In [40]:
%%time
# Basic video_id, encoding, stripping as they where embedded cleanly
sessions_encoded = sessions_concepts.copy()
sessions_encoded.loc[:,ITEM_COL] = sessions_encoded[ITEM_COL].map(video_id2int)
sessions_encoded.loc[:,"ccid"] = sessions_encoded["ccid"].map(ccid2int)
sessions_encoded.loc[:,CONCEPT_COL] = sessions_encoded[CONCEPT_COL].transform(lambda lst: [concept2int[concept] for concept in lst])
sessions_encoded.loc[:,FIELD_COL] = sessions_encoded[FIELD_COL].transform(lambda lst: [field2int[field] for field in lst])

CPU times: user 12.8 s, sys: 209 ms, total: 13 s
Wall time: 13.3 s


In [12]:
def pad_and_slice(series, pad_token=0, max_length=MAX_CONCEPT_LENGTH):
    """Pad or cut every sequence in `series` to exactly `max_length` elements.

    Args:
        series: pandas Series whose elements are sequences (lists, tuples or arrays).
        pad_token: value appended to sequences shorter than `max_length`.
        max_length: length of every returned list.

    Returns:
        A Series of plain lists, each of length `max_length`.
    """
    # list(lst) keeps the "+" below a concatenation even when an element is a
    # numpy array, where "+" would otherwise be element-wise arithmetic.
    return series.transform(lambda lst: (list(lst) + [pad_token] * max_length)[:max_length])

In [42]:
%%time
# Slice and pad concepts to avoid ragged list 
sessions_encoded.loc[:, [CONCEPT_COL, FIELD_COL]] = (sessions_encoded[[CONCEPT_COL, FIELD_COL]]
                                                     .transform(pad_and_slice, max_length=MAX_CONCEPT_LENGTH, axis=0)
                                                     # Convert to 2D array which is needed for embedding
                                                     .transform(lambda series: series.transform(lambda lst: [lst])))

CPU times: user 6.65 s, sys: 185 ms, total: 6.84 s
Wall time: 6.96 s


In [43]:
# Sanity check: every inner list has MAX_CONCEPT_LENGTH entries and every cell wraps exactly one list
(sessions_encoded[FIELD_COL].str[0].str.len() == MAX_CONCEPT_LENGTH).all(), (sessions_encoded[FIELD_COL].str.len() == 1).all()

(True, True)

#### Split

In [44]:
# Leave-one-out split: test holds every row, validation drops each user's
# last row, and training drops the last row once more.
drop_last = lambda group: group[:-1]
per_user = dict(group_keys=False, sort=False)

test_raw = sessions_encoded.copy()
val_raw = get_apply_op(test_raw.groupby(USER_COL, **per_user))(drop_last)
train_raw = get_apply_op(val_raw.groupby(USER_COL, **per_user))(drop_last)

In [45]:
# Derive the user count from the data instead of hard-coding it, so the split
# checks below stay valid when the upstream filtering changes.
num_users = test_raw.index.get_level_values(USER_COL).nunique()

In [46]:
# Each split step must remove exactly one row per user.
n_test_rows = test_raw.shape[0]
assert val_raw.shape[0] == n_test_rows - 1 * num_users
assert train_raw.shape[0] == n_test_rows - 2 * num_users

#### Video normalize

In [8]:
from utils.pre_processing import norm_vid_feats

In [14]:
# Features for which a per-video maximum is computed on the training split.
VIDEO_NORM_FEATS = ["replay_length", "time_played", "time_comp", "skipped_length", "time_spent"]

In [49]:
# Per-video maxima come from the training split only. Videos that appear in
# the full data but not in training (OOV) are added with a maximum of 0 so
# that every video id has a row.
all_video_ids = pd.Index(test_raw[ITEM_COL].unique())
train_video_max = train_raw.groupby(ITEM_COL)[VIDEO_NORM_FEATS].max()
video_max = train_video_max.reindex(all_video_ids, fill_value=0).sort_index()
video_max

Unnamed: 0,replay_length,time_played,time_comp,skipped_length,time_spent
1,49.0,7032.3,7032.3,6995.0,7734.100000
2,115.8,9313.0,9313.0,8565.0,9687.000000
3,16.0,2971.9,2971.9,2909.0,2840.000000
4,55.7,2420.0,2420.0,1360.0,2454.500000
5,6.2,4927.0,4927.0,4234.0,5817.700000
...,...,...,...,...,...
158354,37.3,1385.8,1348.5,270.4,2077.000000
158355,15.1,1259.2,1244.1,514.9,1240.133333
158356,0.0,380.0,380.0,49.2,841.533333
158357,0.0,119.9,119.9,0.0,119.900000


In [50]:
# Number of videos with a non-zero maximum, per feature
video_max.astype(bool).sum()

replay_length      30261
time_played       140747
time_comp         140747
skipped_length     89713
time_spent        140747
dtype: int64

In [51]:
%%time
test_norm = norm_vid_feats(test_raw, max_lengths=video_max)
val_norm = norm_vid_feats(val_raw, max_lengths=video_max)
train_norm = norm_vid_feats(train_raw, max_lengths=video_max)

100%|████████████████████████████████████████| 158358/158358 [00:56<00:00, 2810.16it/s]
100%|████████████████████████████████████████| 149922/149922 [00:54<00:00, 2727.28it/s]
100%|████████████████████████████████████████| 140747/140747 [00:50<00:00, 2810.89it/s]


CPU times: user 2min 33s, sys: 3.95 s, total: 2min 36s
Wall time: 2min 43s


In [52]:
def keep_last(group):
    """Return only the final row of `group`."""
    return group.tail(1)

In [53]:
%%time
val_loo_norm = get_apply_op(val_raw.groupby(USER_COL, group_keys=False, sort=False))(keep_last)
test_loo_norm = get_apply_op(test_norm.groupby(USER_COL, sort=False, group_keys=False))(keep_last)

CPU times: user 53.7 s, sys: 1.38 s, total: 55.1 s
Wall time: 57.4 s


In [9]:
# Output root with one sub-folder per dataset flavour.
mooc_datasets_path = Path("mooc_fix")
conv_path, seq_path = (
    mooc_datasets_path / flavour for flavour in ("conventional", "sequential")
)
for dataset_dir in (conv_path, seq_path):
    dataset_dir.mkdir(exist_ok=True, parents=True)

In [16]:
# Folder for the conventional, video-normalised splits.
conv_vid_norm_path = conv_path / "video_normalized"
conv_vid_norm_path.mkdir(parents=True, exist_ok=True)

In [64]:
# Training rows plus the leave-one-out validation / test rows.
vid_norm_outputs = (
    (train_norm, "train.parquet"),
    (val_loo_norm, "val.parquet"),
    (test_loo_norm, "test.parquet"),
)
for split, file_name in vid_norm_outputs:
    split.to_parquet(conv_vid_norm_path / file_name)

In [65]:
# The full (not leave-one-out) frames are kept for the sequential methods.
# They are not listified yet, so they live in the conventional folder.
for split, file_name in ((test_norm, "test_full.parquet"), (val_norm, "val_full.parquet")):
    split.to_parquet(conv_vid_norm_path / file_name)

In [66]:
# describe() covers every feature except the encoded concept and field lists,
# whose statistics must be added manually.
vid_norm_stats = test_norm.describe()
vid_norm_stats.to_parquet(conv_vid_norm_path / "feature_stats.parquet")

**Yeo Johnson all feature scaling/normalization**

Using Train-dataset as basis for the normalization. Will apply zero-mean, unit-variance normalization

In [67]:
from sklearn.preprocessing import PowerTransformer

In [68]:
# Columns that get the Yeo-Johnson transform: every numeric column except the
# encoded ids, the timestamp and backward_gap.
not_scaled = [ITEM_COL, "ccid", TIME_COL, "backward_gap"]
numeric_frame = train_norm.select_dtypes(include="number")
numeric_cols = numeric_frame.drop(columns=not_scaled).columns
numeric_cols

Index(['time_spent', 'num_forward', 'num_backward', 'num_pause',
       'median_pause', 'std_speed', 'avg_speed', 'eff_speed', 'seg_rep_0',
       'seg_rep_1', 'seg_rep_2', 'seg_rep_3', 'seg_rep_4', 'seg_rep_5',
       'seg_rep_10', 'seg_rep_15', 'seg_rep_20', 'seg_rep_25', 'seg_rep_30',
       'seg_rep_45', 'seg_rep_60', 'time_comp', 'time_played', 'replay_length',
       'skipped_length'],
      dtype='object')

In [69]:
# Transformer fitted on the training split and reused for validation / test.
pt = PowerTransformer(method="yeo-johnson")

In [70]:
%%time
train_scaled = train_norm.copy()
train_scaled.loc[:,numeric_cols] = pt.fit_transform(train_norm[numeric_cols])
val_scaled = val_loo_norm.copy()
val_scaled.loc[:,numeric_cols]  = pt.transform(val_loo_norm[numeric_cols])
test_scaled = test_loo_norm.copy()
test_scaled.loc[:,numeric_cols]  = pt.transform(test_loo_norm[numeric_cols])

CPU times: user 18.7 s, sys: 497 ms, total: 19.2 s
Wall time: 19.6 s


In [71]:
%%time
# For Sequential dataset down stream
val_full_scaled = val_norm.copy()
val_full_scaled.loc[:,numeric_cols] = pt.transform(val_full_scaled[numeric_cols])
test_full_scaled = test_norm.copy()
test_full_scaled.loc[:,numeric_cols] = pt.transform(test_full_scaled[numeric_cols])

CPU times: user 6.97 s, sys: 886 ms, total: 7.85 s
Wall time: 7.97 s


In [17]:
# Folder for the conventional splits with all numeric features scaled.
conv_scaled_path = conv_path / "all_scaled"
conv_scaled_path.mkdir(parents=True, exist_ok=True)

In [75]:
# Scaled training rows plus the scaled leave-one-out validation / test rows.
scaled_outputs = (
    (train_scaled, "train.parquet"),
    (val_scaled, "val.parquet"),
    (test_scaled, "test.parquet"),
)
for split, file_name in scaled_outputs:
    split.to_parquet(conv_scaled_path / file_name)

In [76]:
# Full-length scaled frames, consumed by the sequential datasets downstream.
for split, file_name in ((val_full_scaled, "val_full.parquet"), (test_full_scaled, "test_full.parquet")):
    split.to_parquet(conv_scaled_path / file_name)

In [77]:
# Store summary statistics (including min and max) of each continuous feature.
scaled_stats = test_full_scaled.describe()
scaled_stats.to_parquet(conv_scaled_path / "cont_feature_stats.parquet")

#### Bias adjusted and scaled

In [10]:
from utils.pre_processing import adaptive_bias

In [79]:
# Load the video-normalised splits written earlier in this notebook.
# The folder itself always exists once its mkdir cell has run, so the guard
# checks the files that are actually read.
_vid_norm_files = {
    split: conv_vid_norm_path / f"{split}.parquet"
    for split in ("train", "val_full", "test_full")
}
_missing_files = [path.name for path in _vid_norm_files.values() if not path.exists()]
if _missing_files:
    raise RuntimeError(
        f"Need to create {conv_vid_norm_path.name} datasets first"
        f" (missing: {', '.join(_missing_files)})"
    )

train_norm = pd.read_parquet(_vid_norm_files["train"])
val_full_norm = pd.read_parquet(_vid_norm_files["val_full"])
test_full_norm = pd.read_parquet(_vid_norm_files["test_full"])
# Columns that get the Yeo-Johnson transform: every numeric column except the
# encoded ids, the timestamp and backward_gap.
numeric_cols = train_norm.select_dtypes(include="number").drop(columns=[ITEM_COL, "ccid", TIME_COL, "backward_gap"]).columns

In [80]:
# Show the columns that will be bias-adjusted and scaled
numeric_cols

Index(['time_spent', 'num_forward', 'num_backward', 'num_pause',
       'median_pause', 'std_speed', 'avg_speed', 'eff_speed', 'seg_rep_0',
       'seg_rep_1', 'seg_rep_2', 'seg_rep_3', 'seg_rep_4', 'seg_rep_5',
       'seg_rep_10', 'seg_rep_15', 'seg_rep_20', 'seg_rep_25', 'seg_rep_30',
       'seg_rep_45', 'seg_rep_60', 'time_comp', 'time_played', 'replay_length',
       'skipped_length'],
      dtype='object')

In [81]:
# Separate transformer for the bias-adjusted features.
pt2 = PowerTransformer(method="yeo-johnson")

In [82]:
%%time
train_bias_adj = train_norm.copy()
train_bias_adj_group = train_bias_adj.groupby(USER_COL, group_keys=False, sort=False)[numeric_cols]
val_full_bias_adj = val_full_norm.copy()
val_full_bias_adj_group = val_full_bias_adj.groupby(USER_COL, group_keys=False, sort=False)[numeric_cols]
test_full_bias_adj = test_full_norm.copy()
test_full_bias_adj_group = test_full_bias_adj.groupby(USER_COL, group_keys=False, sort=False)[numeric_cols]

CPU times: user 256 ms, sys: 113 ms, total: 370 ms
Wall time: 372 ms


In [83]:
%%time
# Since it is an expanding bias -> Doesn't make sense to apply it to the LOO splits
# -> Must apply it to the full splits and then save it as LOO
# Each user group has adaptive_bias applied to its numeric columns; pt2 is
# fitted on the training result and reused for validation and test.
# NOTE(review): the transformer output is written back by position, so the
# grouped apply must return rows in the same order as the target frame —
# confirm this also holds for the parallel apply path.
train_bias_adj.loc[:,numeric_cols] = pt2.fit_transform(get_apply_op(train_bias_adj_group)(lambda col: col.transform(adaptive_bias)))
val_full_bias_adj.loc[:,numeric_cols] = pt2.transform(get_apply_op(val_full_bias_adj_group)(lambda col: col.transform(adaptive_bias)))
test_full_bias_adj.loc[:,numeric_cols] = pt2.transform(get_apply_op(test_full_bias_adj_group)(lambda col: col.transform(adaptive_bias)))

CPU times: user 22min 3s, sys: 7.42 s, total: 22min 11s
Wall time: 22min 41s


In [12]:
# Folder for the conventional, bias-adjusted and scaled splits.
conv_scaled_bias_path = conv_path / "bias_adj_all_scaled"
conv_scaled_bias_path.mkdir(parents=True, exist_ok=True)

In [85]:
train_bias_adj.to_parquet(conv_scaled_bias_path / "train.parquet")
# The bias had to be computed over each user's complete history, so the
# leave-one-out files are cut from the full frames only now: keep just the
# last row per user (it includes the features, for simulation / feature prediction).
loo_sources = (
    ("val.parquet", val_full_bias_adj),
    ("test.parquet", test_full_bias_adj),
)
for file_name, full_split in loo_sources:
    by_user = full_split.groupby(USER_COL, sort=False, group_keys=False)
    get_apply_op(by_user)(keep_last).to_parquet(conv_scaled_bias_path / file_name)

In [86]:
# Full-length bias-adjusted frames, consumed by the sequential datasets downstream.
for split, file_name in ((val_full_bias_adj, "val_full.parquet"), (test_full_bias_adj, "test_full.parquet")):
    split.to_parquet(conv_scaled_bias_path / file_name)

In [87]:
# Store summary statistics (including min and max) of each numerical feature.
bias_adj_stats = test_full_bias_adj.describe()
bias_adj_stats.to_parquet(conv_scaled_bias_path / "feature_stats.parquet")

### Listify By user
**Raw, without video_norm**

In [11]:
import importlib
# importlib.reload() re-executes only the module object it is given.
# Reloading the package alone leaves utils.pre_processing as first imported,
# so the submodule that provides the helpers is reloaded explicitly as well.
utils = importlib.import_module("utils")
importlib.reload(utils)
importlib.reload(importlib.import_module("utils.pre_processing"))
from utils.pre_processing import bulk_listify, bulk_pad_split, save_splits

**Raw sequential dataset**

In [11]:
#%%time
# Disabled cell, kept for reference: the raw sequential dataset is not built.
# REG_COLS and LIST_COLS are only defined further down, so re-enabling this
# cell requires defining them first.
#train_raw_seq_pad, val_raw_seq_pad, test_raw_seq_pad =  bulk_pad_split(
#                                                            *bulk_listify(train_raw, val_raw, test_raw, time_col=TIME_COL), 
#                                                            reg_cols=REG_COLS,
#                                                            list_cols=LIST_COLS,
#                                                            max_list_length=MAX_CONCEPT_LENGTH
#                                                        )

In [12]:
#%%time
# Disabled cell, kept for reference: would save the raw sequential splits
# produced by the (also disabled) cell above.
#raw_seq_splits = {
#    "train": train_raw_seq_pad.drop(columns="concept_id"),
#    "val": val_raw_seq_pad.drop(columns="concept_id"),
#    "test": test_raw_seq_pad.drop(columns="concept_id")
#}
#seq_raw_path = seq_path / "raw_dataset"
#save_splits(seq_raw_path, raw_seq_splits)
#test_raw.describe().to_parquet(seq_raw_path / "feature_stats.parquet")

**Sequential Video Normalized**

In [22]:
# Reload the video-normalised frames; validation and test use the full
# (not leave-one-out) versions.
train_norm, val_norm, test_norm = (
    pd.read_parquet(conv_vid_norm_path / file_name)
    for file_name in ("train.parquet", "val_full.parquet", "test_full.parquet")
)

In [16]:
# List-valued columns versus regular (scalar) columns.
LIST_COLS = [FIELD_COL, CONCEPT_COL]
# Regular columns are all remaining columns, index levels included, except the
# user id. They are derived from train_norm (loaded in the cell above) rather
# than train_bias_adj, so this section also runs without the bias-adjustment
# cells; a list comprehension keeps the column order deterministic.
REG_COLS = [
    col
    for col in train_norm.reset_index().columns
    if col != "user_id" and col not in LIST_COLS
]

In [25]:
%%time
train_norm_seq_pad, val_norm_seq_pad, test_norm_seq_pad =  bulk_pad_split(
                                                            *bulk_listify(train_norm, val_norm, test_norm, time_col=TIME_COL), 
                                                            reg_cols=REG_COLS,
                                                            list_cols=LIST_COLS,
                                                            max_list_length=MAX_CONCEPT_LENGTH
    
                                                        )

CPU times: user 1min 22s, sys: 4.02 s, total: 1min 26s
Wall time: 1min 27s


In [26]:
%%time
norm_seq_splits = {
    "train": train_norm_seq_pad,#.drop(columns="concept_id"),
    "val": val_norm_seq_pad,#.drop(columns="concept_id"),
    "test": test_norm_seq_pad,#.drop(columns="concept_id")
}
seq_video_norm_path = seq_path / "video_normalized"
seq_video_norm_path.mkdir(exist_ok=True, parents=True)
save_splits(seq_video_norm_path, norm_seq_splits)
test_norm.describe().to_parquet(seq_video_norm_path / "feature_stats.parquet")

CPU times: user 30.8 s, sys: 2.49 s, total: 33.3 s
Wall time: 33.7 s


###

**Sequential All features scaled**

In [27]:
# Reload the scaled frames; validation and test use the full versions.
train_scaled, val_full_scaled, test_full_scaled = (
    pd.read_parquet(conv_scaled_path / file_name)
    for file_name in ("train.parquet", "val_full.parquet", "test_full.parquet")
)

In [28]:
%%time
train_scaled_seq, val_scaled_seq, test_scaled_seq =  bulk_pad_split(
                                                            *bulk_listify(train_scaled, val_full_scaled, test_full_scaled, time_col=TIME_COL), 
                                                            reg_cols=REG_COLS,
                                                            list_cols=LIST_COLS,
                                                            max_list_length=MAX_CONCEPT_LENGTH
                                                        )

CPU times: user 1min 29s, sys: 7.02 s, total: 1min 36s
Wall time: 1min 36s


In [29]:
%%time
scaled_seq_splits = {
    "train": train_scaled_seq,#.drop(columns="concept_id"),
    "val": val_scaled_seq,#.drop(columns="concept_id"),
    "test": test_scaled_seq,#.drop(columns="concept_id")
}
seq_scaled_path = seq_path / "all_scaled"
seq_scaled_path.mkdir(exist_ok=True, parents=True)
save_splits(seq_scaled_path, scaled_seq_splits)
test_full_scaled.describe().to_parquet(seq_scaled_path / "feature_stats.parquet")

CPU times: user 31.3 s, sys: 2.32 s, total: 33.6 s
Wall time: 33.7 s


**Sequential Expanding bias adjustment**

In [13]:
# Reload the bias-adjusted frames; validation and test use the full versions.
train_bias_adj, val_full_bias_adj, test_full_bias_adj = (
    pd.read_parquet(conv_scaled_bias_path / file_name)
    for file_name in ("train.parquet", "val_full.parquet", "test_full.parquet")
)

In [17]:
%%time
train_scaled_bias_adj_seq, val_scaled_bias_adj_seq, test_scaled_bias_adj_seq =  bulk_pad_split(
                                                            *bulk_listify(train_bias_adj, val_full_bias_adj, test_full_bias_adj, time_col=TIME_COL), 
                                                            reg_cols=REG_COLS,
                                                            list_cols=LIST_COLS,
                                                            max_list_length=MAX_CONCEPT_LENGTH
                                                        )

CPU times: user 1min 25s, sys: 5.53 s, total: 1min 30s
Wall time: 1min 31s


In [18]:
%%time
scaled_bias_seq_splits = {
    "train": train_scaled_bias_adj_seq,#.drop(columns="concept_id"),
    "val": val_scaled_bias_adj_seq,#.drop(columns="concept_id"),
    "test": test_scaled_bias_adj_seq,#.drop(columns="concept_id")
}
seq_scaled_bias_adj_path = seq_path / "bias_adj_all_scaled"
seq_scaled_bias_adj_path.mkdir(exist_ok=True, parents=True)
save_splits(seq_scaled_bias_adj_path, scaled_bias_seq_splits)
test_full_bias_adj.describe().to_parquet(seq_scaled_bias_adj_path / "feature_stats.parquet")

CPU times: user 31.2 s, sys: 2.22 s, total: 33.4 s
Wall time: 33.5 s
