# EdNet 05 - Dataset creation
1. Remove cold users
2. Slice
3. Encoding tables
4. Split datasets
5. Feature scale test and validation based on training scales 
6. Listify

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import dask.dataframe as dd
# from multiprocesspandas import applyparallel
from tqdm import tqdm
tqdm.pandas()
import os

In [2]:
if (cpu_count:= int(os.environ.get("SLURM_JOB_CPUS_PER_NODE"))) > 1:
    print(f"{cpu_count} cores are available -> Using multiprocesspandas")
    apply_fn = "apply_parallel"
else:
    apply_fn = "progress_apply"

get_apply_op = lambda df: getattr(df, apply_fn)


In [3]:
ednet_path = Path("../EdNet")

In [4]:
ITEM_COL = "item_id"
USER_COL = "user_id"
TIME_COL = "timestamp"
SESSION_COL = "session_id"
CONSECUTIVE_COL = "item_consecutive_id"

In [5]:
%%time
interactions = dd.read_parquet(ednet_path / "KT4_session_features_fix").drop(columns=["back_gap", "forward_gap", "user_answer", "action_type", "cursor_time", "source", "platform", "length"]).compute()

CPU times: user 491 ms, sys: 93.8 ms, total: 584 ms
Wall time: 632 ms


In [6]:
VIDEO_NORM_FEATS = ["replay_length", "time_played", "time_comp", "skipped_length", "time_spent"]
interactions["median_pause"] = interactions["median_pause"].dt.total_seconds().fillna(0)
interactions[VIDEO_NORM_FEATS] = interactions[VIDEO_NORM_FEATS] / 1e3
# Rename seg rep columns
SEG_REP_FEATURES = interactions.filter(regex="seg_rep_.*").columns.tolist()
renamed_seg_feats = [f"seg_rep_{int(int(name.split('_')[-1])/1e3)}" for name in SEG_REP_FEATURES]
interactions = interactions.rename(columns=dict(zip(SEG_REP_FEATURES, renamed_seg_feats)))
interactions

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,seg_rep_0,seg_rep_1,seg_rep_2,seg_rep_3,seg_rep_4,...,seg_rep_30,seg_rep_45,seg_rep_60,time_comp,time_played,replay_length,skipped_length,tags,item_id,timestamp
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0,369.981,1,0,1,1.7,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,357.911,357.911,0.000,0.187,103,l504,1565096637922
1,2,0,338.240,0,0,1,72.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,260.249,260.249,0.000,0.000,52,l464,1568120237376
1,3,0,309.277,1,0,1,2.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,204.026,204.026,0.000,0.905,53,l463,1568120575929
1,4,0,371.762,0,0,1,31.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,334.948,334.948,0.000,0.000,55,l462,1568120885352
1,5,0,369.912,0,0,1,159.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,205.357,205.357,0.000,0.000,0,l316,1568121279019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832028,3,0,342.948,1,0,2,18.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,292.763,292.763,0.000,0.326,111,l536,1574766579325
832056,1,0,233.981,0,0,1,17.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,207.474,207.474,0.000,0.000,33,l447,1574801905337
832072,1,0,18.340,3,1,1,1.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.091,5.091,0.000,0.230,102,l553,1574771559317
832396,1,0,219.064,0,1,1,17.9,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,174.748,174.816,0.068,0.000,79,l546,1574760296296


In [7]:
interactions["median_pause"].describe()

count    5.386490e+05
mean     3.877606e+02
std      3.483124e+04
min      0.000000e+00
25%      5.000000e-01
50%      2.500000e+00
75%      8.800000e+00
max      1.004903e+07
Name: median_pause, dtype: float64

In [25]:
interactions.shape[0], interactions.index.get_level_values(USER_COL).nunique(), interactions[ITEM_COL].nunique()

(538649, 42826, 971)

# Dataset creation

### Remove cold users

In [27]:
user_session_count = interactions.groupby(USER_COL).size()
warm_thresh = 5
warm_users = user_session_count[user_session_count >= warm_thresh]
warm_users

user_id
1          25
4          17
5           9
7         103
9           8
         ... 
830559     10
830677      5
831416     19
831612     10
831764     15
Length: 18194, dtype: int64

In [28]:
interactions_warm_df = interactions[interactions.index.get_level_values(USER_COL).isin(warm_users.index)]
interactions_warm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,seg_rep_0,seg_rep_1,seg_rep_2,seg_rep_3,seg_rep_4,...,seg_rep_30,seg_rep_45,seg_rep_60,time_comp,time_played,replay_length,skipped_length,tags,item_id,timestamp
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0,369.981,1,0,1,1.7,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,357.911,357.911,0.000,0.187,103,l504,1565096637922
1,2,0,338.240,0,0,1,72.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,260.249,260.249,0.000,0.000,52,l464,1568120237376
1,3,0,309.277,1,0,1,2.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,204.026,204.026,0.000,0.905,53,l463,1568120575929
1,4,0,371.762,0,0,1,31.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,334.948,334.948,0.000,0.000,55,l462,1568120885352
1,5,0,369.912,0,0,1,159.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,205.357,205.357,0.000,0.000,0,l316,1568121279019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831764,10,0,389.067,0,2,2,2.9,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,358.143,369.062,10.919,0.000,103,l504,1574765372527
831764,11,0,514.073,0,1,1,1.7,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,486.857,498.017,11.160,0.000,100,l505,1574765764861
831764,12,0,453.015,0,0,1,5.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,441.086,441.086,0.000,0.000,106,l506,1574766296581
831764,13,0,638.514,0,1,1,3.6,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,586.075,622.422,36.347,0.000,84,l507,1574766766311


In [29]:
interactions_warm_df.shape[0], interactions_warm_df.index.get_level_values(USER_COL).nunique(), interactions_warm_df[ITEM_COL].nunique()

(494763, 18194, 971)

In [30]:
interactions_warm_df.groupby(USER_COL).size()

user_id
1          25
4          17
5           9
7         103
9           8
         ... 
830559     10
830677      5
831416     19
831612     10
831764     15
Length: 18194, dtype: int64

#### Slice

In [31]:
MAX_SEQUENCE_LEN = 30

In [32]:
from scipy import stats
stats.percentileofscore(interactions_warm_df.groupby(USER_COL).size(), MAX_SEQUENCE_LEN)

74.90106628558866

In [33]:
from tqdm import tqdm
tqdm.pandas()

In [34]:
interactions_sliced = interactions_warm_df.sort_values(TIME_COL).groupby(USER_COL, group_keys=False, sort=False).progress_apply(lambda group: group.head(MAX_SEQUENCE_LEN))

100%|██████████████████████████████████████████| 18194/18194 [00:07<00:00, 2420.64it/s]


In [35]:
interactions_sliced

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,seg_rep_0,seg_rep_1,seg_rep_2,seg_rep_3,seg_rep_4,...,seg_rep_30,seg_rep_45,seg_rep_60,time_comp,time_played,replay_length,skipped_length,tags,item_id,timestamp
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0,369.981,1,0,1,1.7,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,357.911,357.911,0.000,0.187,103,l504,1565096637922
1,2,0,338.240,0,0,1,72.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,260.249,260.249,0.000,0.000,52,l464,1568120237376
1,3,0,309.277,1,0,1,2.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,204.026,204.026,0.000,0.905,53,l463,1568120575929
1,4,0,371.762,0,0,1,31.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,334.948,334.948,0.000,0.000,55,l462,1568120885352
1,5,0,369.912,0,0,1,159.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,205.357,205.357,0.000,0.000,0,l316,1568121279019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831764,10,0,389.067,0,2,2,2.9,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,358.143,369.062,10.919,0.000,103,l504,1574765372527
831764,11,0,514.073,0,1,1,1.7,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,486.857,498.017,11.160,0.000,100,l505,1574765764861
831764,12,0,453.015,0,0,1,5.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,441.086,441.086,0.000,0.000,106,l506,1574766296581
831764,13,0,638.514,0,1,1,3.6,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,586.075,622.422,36.347,0.000,84,l507,1574766766311


In [36]:
interactions_sliced.shape[0], interactions_sliced.index.get_level_values(USER_COL).nunique(), interactions_sliced[ITEM_COL].nunique()

(304754, 18194, 951)

#### Encoding of fields

In [37]:
interactions_sliced["tags"].sort_values().nunique()

256

In [38]:
%%time
# item id mapping, saving 0 for padding
item_id2int = {val: i for i, val in enumerate(interactions_sliced[ITEM_COL].unique(), start=1)}
int2item_id = {i: item_id2int for item_id2int, i in item_id2int.items()}

# tags mapping, tags at 0 is already encoded as np.nan
tag2int = {int(val): i for i, val in enumerate(interactions_sliced["tags"].sort_values().unique())}
int2tag = {i: tag for tag, i in tag2int.items()}

CPU times: user 27.1 ms, sys: 0 ns, total: 27.1 ms
Wall time: 27.9 ms


In [39]:
# Exlucdes np.nan which is mapped to 0 for concepts and fields
MAX_ITEM_ID, MAX_TAG = max(item_id2int.values()), max(tag2int.values()),
MAX_ITEM_ID, MAX_TAG

(951, 255)

In [40]:
from utils.io import save_enc

In [41]:
embeddings_path = Path("embeddings")
save_enc(item_id2int, embeddings_path / "item_id2int.json" )
save_enc(tag2int, embeddings_path / "tag2int.json" )

**Encoding columns**

In [42]:
%%time
# Basic video_id, encoding, stripping as they where embedded cleanly
interactions_encoded = interactions_sliced.copy()
interactions_encoded.loc[:,ITEM_COL] = interactions_encoded[ITEM_COL].map(item_id2int)
interactions_encoded.loc[:,"tags"] = interactions_encoded["tags"].map(tag2int)

CPU times: user 557 ms, sys: 34.8 ms, total: 592 ms
Wall time: 597 ms


#### Split

In [43]:
test_raw = interactions_encoded.copy()
val_raw = test_raw.groupby(USER_COL, group_keys=False, sort=False).progress_apply(lambda group: group[:-1])
train_raw = val_raw.groupby(USER_COL, group_keys=False, sort=False).progress_apply(lambda group: group[:-1])

100%|██████████████████████████████████████████| 18194/18194 [00:04<00:00, 3810.64it/s]
100%|██████████████████████████████████████████| 18194/18194 [00:04<00:00, 3680.95it/s]


In [46]:
NUM_USERS = 18194

In [47]:
assert test_raw.shape[0] -1 * NUM_USERS   == val_raw.shape[0]
assert test_raw.shape[0] - 2 * NUM_USERS   == train_raw.shape[0]

#### Video normalize

In [48]:
pd.Index(test_raw[ITEM_COL].unique()).difference(train_raw[ITEM_COL].unique())

Int64Index([574, 928, 933], dtype='int64')

In [49]:
# Include OOV videos to the max video feature length index to properly handle missing videos
video_max = train_raw.groupby(ITEM_COL)[VIDEO_NORM_FEATS].max().reindex(pd.Index(test_raw[ITEM_COL].unique()), fill_value=0).sort_index()
video_max

Unnamed: 0,replay_length,time_played,time_comp,skipped_length,time_spent
1,1661.767,1071.846,358.327,334.261,2768.057
2,1561.494,1040.996,260.299,248.103,2404.391
3,2050.780,1025.390,205.084,182.035,2096.133
4,2004.945,1338.257,335.152,221.685,2484.868
5,2423.725,1100.816,206.136,198.204,2308.235
...,...,...,...,...,...
947,0.000,13.465,13.465,0.000,13.465
948,8.500,596.367,587.867,0.000,605.176
949,0.000,308.196,308.196,0.164,409.885
950,32.826,242.219,216.770,0.032,855.309


In [50]:
from utils.pre_processing import norm_vid_feats

In [51]:
%%time
test_norm = norm_vid_feats(test_raw, video_max)
val_norm = norm_vid_feats(val_raw, video_max)
train_norm = norm_vid_feats(train_raw, video_max)

100%|██████████████████████████████████████████████| 951/951 [00:00<00:00, 1085.28it/s]
100%|███████████████████████████████████████████████| 949/949 [00:01<00:00, 940.43it/s]
100%|██████████████████████████████████████████████| 948/948 [00:00<00:00, 1043.43it/s]

CPU times: user 2.9 s, sys: 89.1 ms, total: 2.99 s
Wall time: 3.03 s





In [52]:
datasets_path = Path("ednet")
conv_path = datasets_path / "conventional"
seq_path = datasets_path / "sequential"

In [53]:
conv_vid_norm_path = conv_path / "video_normalized"

In [54]:
%%time
test_norm.groupby(USER_COL, sort=False, group_keys=False).progress_apply(lambda group: group.tail(1)).to_parquet(conv_vid_norm_path / "test.parquet")
val_norm.groupby(USER_COL, sort=False, group_keys=False).progress_apply(lambda group: group.tail(1)).to_parquet(conv_vid_norm_path / "val.parquet")
train_norm.to_parquet(conv_vid_norm_path / "train.parquet")

100%|██████████████████████████████████████████| 18194/18194 [00:04<00:00, 3971.10it/s]
100%|██████████████████████████████████████████| 18194/18194 [00:04<00:00, 3669.76it/s]


CPU times: user 9.13 s, sys: 217 ms, total: 9.35 s
Wall time: 9.98 s


In [55]:
# Stored for ease of access for sequential methods, but not listified so stored in convential folders
test_norm.to_parquet(conv_vid_norm_path / "test_full.parquet")
val_norm.to_parquet(conv_vid_norm_path / "val_full.parquet")

In [56]:
# Describes all features besides encoding of concepts and fields, which must be added manually
test_norm.describe().to_parquet(conv_vid_norm_path / "feature_stats.parquet")

**Yeo Johnson all feature scaling/normalization**

Using Train-dataset as basis for the normalization. Will apply zero-mean, unit-variance normalization

In [57]:
val_loo_norm = pd.read_parquet(conv_vid_norm_path / "val.parquet")
test_loo_norm = pd.read_parquet(conv_vid_norm_path / "test.parquet")

In [58]:
from sklearn.preprocessing import PowerTransformer

In [59]:
# All numerical Feature normalize using Yeo-Johnson
numeric_cols = train_norm.select_dtypes(include="number").drop(columns=["tags", TIME_COL, ITEM_COL]).columns
numeric_cols

Index(['time_spent', 'num_forward', 'num_backward', 'num_pause',
       'median_pause', 'seg_rep_0', 'seg_rep_1', 'seg_rep_2', 'seg_rep_3',
       'seg_rep_4', 'seg_rep_5', 'seg_rep_10', 'seg_rep_15', 'seg_rep_20',
       'seg_rep_25', 'seg_rep_30', 'seg_rep_45', 'seg_rep_60', 'time_comp',
       'time_played', 'replay_length', 'skipped_length'],
      dtype='object')

In [60]:
pt = PowerTransformer("yeo-johnson")

In [61]:
%%time
train_scaled = train_norm.copy()
train_scaled.loc[:,numeric_cols] = pt.fit_transform(train_norm[numeric_cols])
val_scaled = val_loo_norm.copy()
val_scaled.loc[:,numeric_cols]  = pt.transform(val_loo_norm[numeric_cols])
test_scaled = test_loo_norm.copy()
test_scaled.loc[:,numeric_cols]  = pt.transform(test_loo_norm[numeric_cols])

CPU times: user 3.27 s, sys: 93.1 ms, total: 3.37 s
Wall time: 3.5 s


In [62]:
%%time
# For Sequential dataset down stream
val_full_scaled = val_norm.copy()
val_full_scaled.loc[:,numeric_cols] = pt.transform(val_full_scaled[numeric_cols])
test_full_scaled = test_norm.copy()
test_full_scaled.loc[:,numeric_cols] = pt.transform(test_full_scaled[numeric_cols])

CPU times: user 1.29 s, sys: 215 ms, total: 1.5 s
Wall time: 1.51 s


In [63]:
conv_scaled_path = conv_path / "all_scaled"

In [64]:
train_scaled.to_parquet(conv_scaled_path / "train.parquet")
val_scaled.to_parquet(conv_scaled_path / "val.parquet")
test_scaled.to_parquet(conv_scaled_path / "test.parquet")

In [65]:
# For Sequential datasets downstream
val_full_scaled.to_parquet(conv_scaled_path / "val_full.parquet")
test_full_scaled.to_parquet(conv_scaled_path / "test_full.parquet")

In [66]:
# storing min and mix of each continuous feature
test_full_scaled.describe().to_parquet(conv_scaled_path / "feature_stats.parquet")

#### Scaled and bias adjusted
Re-done - Adjust bias before scaling, not after.
Video normalization doesn't scale/change it

In [67]:
from utils.pre_processing import adaptive_bias

In [68]:
# Load existing datasets
if conv_vid_norm_path.exists():
    train_norm = pd.read_parquet(conv_vid_norm_path / "train.parquet")
    val_full_norm = pd.read_parquet(conv_vid_norm_path / "val_full.parquet")
    test_full_norm = pd.read_parquet(conv_vid_norm_path / "test_full.parquet")
    numeric_cols = train_norm.select_dtypes(include="number").drop(columns=[ITEM_COL, TIME_COL, "tags"]).columns
else:
    raise RuntimeError(f"Need to create {conv_vid_norm_path.name} datasets first")

In [69]:
pt2 = PowerTransformer("yeo-johnson")

In [70]:
%%time
# Since it is an expanding bias -> Doesn't make sense to apply it to the LOO splits
# -> Must apply it to the full splits and then save it as LOO
train_bias_adj = train_norm.copy()
train_bias_adj.loc[:,numeric_cols] = pt2.fit_transform(get_apply_op(train_bias_adj[numeric_cols]
                                                                    .groupby(USER_COL, group_keys=False, sort=False)
                                                                   )(lambda col: col.transform(adaptive_bias))
                                                      )

100%|███████████████████████████████████████████| 18194/18194 [01:16<00:00, 237.18it/s]


CPU times: user 1min 18s, sys: 393 ms, total: 1min 19s
Wall time: 1min 21s


In [71]:
train_bias_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent,num_forward,num_backward,num_pause,median_pause,seg_rep_0,seg_rep_1,seg_rep_2,seg_rep_3,seg_rep_4,...,seg_rep_30,seg_rep_45,seg_rep_60,time_comp,time_played,replay_length,skipped_length,tags,item_id,timestamp
user_id,item_consecutive_id,session_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0,0.063300,0.078834,0.097627,0.072830,0.007534,0.063478,0.070438,0.075614,0.078449,0.080292,...,0.067506,0.055008,0.044493,-0.084609,0.005759,0.087217,0.085685,94,1,1565096637922
1,2,0,0.101227,-0.100228,0.097627,0.072830,0.008709,0.063478,0.070438,0.075614,0.078449,0.080292,...,0.067506,0.055008,0.044493,-0.083012,-0.281423,0.087217,0.081267,43,2,1568120237376
1,3,0,0.137784,0.190092,0.097627,0.072830,0.006792,0.063478,0.070438,0.075614,0.078449,0.080292,...,0.067506,0.055008,0.044493,-0.094449,-0.421574,0.087217,0.134819,44,3,1568120575929
1,4,0,0.135844,-0.100228,0.097627,0.072830,0.007675,0.063478,0.070438,0.075614,0.078449,0.080292,...,0.067506,0.055008,0.044493,-0.080746,-0.048379,0.087217,0.063809,46,4,1568120885352
1,5,0,0.211688,-0.063346,0.097627,0.072830,0.011089,0.063478,0.070438,0.075614,0.078449,0.080292,...,0.067506,0.055008,0.044493,-0.089877,-0.389254,0.087217,0.068192,0,5,1568121279019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831764,8,2,-0.183279,-0.039035,1.297894,0.132674,0.007444,0.639629,0.767032,0.890615,1.071205,0.535843,...,-2.922814,-1.957621,-2.530637,0.375767,-0.047743,-0.949430,0.025146,124,139,1574764191758
831764,9,0,0.803611,-0.026965,3.015015,0.126745,0.007445,2.322427,2.737729,3.122304,3.503102,3.421359,...,-0.156445,-1.736013,-2.247220,0.328549,2.055936,3.606615,0.031232,110,135,1574764713578
831764,10,0,-0.391562,-0.017134,-0.962232,0.576904,0.007518,-1.005021,-1.277674,-1.571933,-1.726969,-1.657983,...,-2.579004,-1.557885,-2.019391,0.288065,-1.703018,-2.700738,0.036207,94,1,1574765372527
831764,11,0,-0.300393,-0.008973,-1.693709,0.072830,0.007483,-0.908112,-1.153823,-1.418587,-1.557684,-1.494826,...,-2.327086,-1.411671,-1.832366,0.256676,-0.396641,-2.352806,0.040349,91,127,1574765764861


In [72]:
%%time
# For Sequential dataset down stream and to be saved as LOO
val_full_bias_adj = val_full_norm.copy()
val_full_bias_adj.loc[:,numeric_cols] = pt2.transform(get_apply_op(val_full_bias_adj[numeric_cols]
                                                                   .groupby(USER_COL, group_keys=False, sort=False)
                                                                  )(lambda col: col.transform(adaptive_bias))
                                                     )
test_full_bias_adj = test_full_norm.copy()
test_full_bias_adj.loc[:,numeric_cols] = pt2.transform(get_apply_op(test_full_bias_adj[numeric_cols]
                                                                    .groupby(USER_COL, group_keys=False, sort=False)
                                                                   )(lambda col: col.transform(adaptive_bias))
                                                      )

100%|███████████████████████████████████████████| 18194/18194 [01:36<00:00, 188.13it/s]
100%|███████████████████████████████████████████| 18194/18194 [01:36<00:00, 187.61it/s]


CPU times: user 2min 34s, sys: 740 ms, total: 2min 35s
Wall time: 3min 16s


In [73]:
conv_scaled_bias_path = conv_path / "bias_adj_all_scaled"

In [74]:
train_bias_adj.to_parquet(conv_scaled_bias_path / "train.parquet")
# Had to calculate the bias based on the complete row -> Save only last (Includes features in the case of Simulation/feature prediction)
val_full_bias_adj.groupby(USER_COL, sort=False, group_keys=False).progress_apply(lambda group: group.tail(1)).to_parquet(conv_scaled_bias_path / "val.parquet")
test_full_bias_adj.groupby(USER_COL, sort=False, group_keys=False).progress_apply(lambda group: group.tail(1)).to_parquet(conv_scaled_bias_path / "test.parquet")

100%|██████████████████████████████████████████| 18194/18194 [00:03<00:00, 5493.82it/s]
100%|██████████████████████████████████████████| 18194/18194 [00:03<00:00, 5326.39it/s]


In [75]:
# For Sequential datasets downstream
val_full_bias_adj.to_parquet(conv_scaled_bias_path / "val_full.parquet")
test_full_bias_adj.to_parquet(conv_scaled_bias_path / "test_full.parquet")

In [76]:
# storing min and mix of each continuous feature
test_full_bias_adj.describe().to_parquet(conv_scaled_bias_path / "feature_stats.parquet")

### Listify By user
**Raw, without video_norm**

In [77]:
LIST_COLS = []
REG_COLS = list(set(test_full_bias_adj.reset_index().drop(columns=["user_id"]).columns) - set(LIST_COLS))

In [78]:
REG_COLS, LIST_COLS

(['num_backward',
  'skipped_length',
  'seg_rep_10',
  'seg_rep_30',
  'seg_rep_1',
  'seg_rep_2',
  'timestamp',
  'seg_rep_60',
  'tags',
  'seg_rep_0',
  'item_consecutive_id',
  'seg_rep_4',
  'seg_rep_5',
  'item_id',
  'time_played',
  'session_id',
  'seg_rep_45',
  'seg_rep_15',
  'seg_rep_25',
  'seg_rep_20',
  'replay_length',
  'num_pause',
  'time_spent',
  'seg_rep_3',
  'num_forward',
  'median_pause',
  'time_comp'],
 [])

In [79]:
import importlib
utils = importlib.import_module("utils")
importlib.reload(utils)
from utils.pre_processing import bulk_listify, bulk_pad_split, save_splits

In [80]:
PAD_TOKEN = 0

**Raw sequential dataset**

In [81]:
%%time
train_raw_seq_pad, val_raw_seq_pad, test_raw_seq_pad =  bulk_pad_split(
                                                            *bulk_listify(train_raw, val_raw, test_raw), 
                                                            reg_cols=REG_COLS,
                                                        )

CPU times: user 10.6 s, sys: 394 ms, total: 11 s
Wall time: 11.2 s


In [82]:
%%time
raw_seq_splits = {
    "train": train_raw_seq_pad,
    "val": val_raw_seq_pad,
    "test": test_raw_seq_pad
}
seq_raw_path = seq_path / "raw_dataset"
save_splits(seq_raw_path, raw_seq_splits)
test_raw.describe().to_parquet(seq_raw_path / "feature_stats.parquet")

CPU times: user 2.61 s, sys: 92.6 ms, total: 2.71 s
Wall time: 2.77 s


###

**Sequential Video Normalized**

In [83]:
%%time
train_norm_seq_pad, val_norm_seq_pad, test_norm_seq_pad =  bulk_pad_split(
                                                            *bulk_listify(train_norm, val_norm, test_norm), 
                                                            reg_cols=REG_COLS
                                                        )

CPU times: user 10.5 s, sys: 408 ms, total: 10.9 s
Wall time: 11.1 s


In [84]:
%%time
norm_seq_splits = {
    "train": train_norm_seq_pad,
    "val": val_norm_seq_pad,
    "test": test_norm_seq_pad
}
seq_video_norm_path = seq_path / "video_normalized"
save_splits(seq_video_norm_path, norm_seq_splits)
test_norm.describe().to_parquet(seq_video_norm_path / "feature_stats.parquet")

CPU times: user 2.58 s, sys: 52.8 ms, total: 2.64 s
Wall time: 2.7 s


###

**Sequential All features scaled**

In [85]:
%%time
train_scaled_seq, val_scaled_seq, test_scaled_seq =  bulk_pad_split(
                                                            *bulk_listify(train_scaled, val_full_scaled, test_full_scaled), 
                                                            reg_cols=REG_COLS
                                                        )

CPU times: user 11 s, sys: 392 ms, total: 11.4 s
Wall time: 11.6 s


In [86]:
%%time
scaled_seq_splits = {
    "train": train_scaled_seq,
    "val": val_scaled_seq,
    "test": test_scaled_seq
}
seq_scaled_path = seq_path / "all_scaled"
save_splits(seq_scaled_path, scaled_seq_splits)
test_full_scaled.describe().to_parquet(seq_scaled_path / "feature_stats.parquet")

CPU times: user 2.61 s, sys: 62.8 ms, total: 2.67 s
Wall time: 2.74 s


####

**Sequential Expanding bias adjustment**

In [87]:
%%time
train_scaled_bias_adj_seq, val_scaled_bias_adj_seq, test_scaled_bias_adj_seq =  bulk_pad_split(
                                                            *bulk_listify(train_bias_adj, val_full_bias_adj, test_full_bias_adj), 
                                                            reg_cols=REG_COLS
                                                        )

CPU times: user 10.7 s, sys: 419 ms, total: 11.1 s
Wall time: 11.5 s


In [88]:
%%time
scaled_bias_seq_splits = {
    "train": train_scaled_bias_adj_seq,
    "val": val_scaled_bias_adj_seq,
    "test": test_scaled_bias_adj_seq
}
seq_scaled_bias_adj_path = seq_path / "bias_adj_all_scaled"
save_splits(seq_scaled_bias_adj_path, scaled_bias_seq_splits)
test_full_bias_adj.describe().to_parquet(seq_scaled_bias_adj_path / "feature_stats.parquet")

CPU times: user 2.68 s, sys: 56.7 ms, total: 2.74 s
Wall time: 2.8 s
