# Attempting XLNet with side information
Testing to see if it can work with continous features as well. 
Segment repetition with concepts

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
base_path = Path("~/fall_project/MOOCCubeX/")
results_path = Path("./results")
relations_path = base_path / "relations"
session2video_id_path = relations_path / "sessions_w_concepts"

In [3]:
%%time
sessions = pd.read_parquet(session2video_id_path)

CPU times: user 20.3 s, sys: 1.43 s, total: 21.7 s
Wall time: 21.9 s


In [4]:
sessions

Unnamed: 0,user_id,video_consecutive_id,session_id,seg_rep_count,video_id,local_start_time,ccid,concept_ids,concepts,fields,interaction_session
0,U_1002476,1,0,0,V_1353266,1598971366,D8EC8658CD3FC45D9C33DC5901307461,"[K_边界的受力_物理学, K_变量_物理学, K_单形_物理学, K_动量矩方程_物理学,...","[边界的受力, 变量, 单形, 动量矩方程, 观察, 轨道, 阶, 理论力学, 力的平衡，,...","[物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, ...",U_1002476-1-0
1,U_1002476,2,0,0,V_6258948,1601736654,1D4D20B05FD0D8B99C33DC5901307461,"[K_包含已知直线_机械工程, K_辅助平面法_机械工程, K_辅助平面法求直线和平面交点_...","[包含已知直线, 辅助平面法, 辅助平面法求直线和平面交点, 辅助平面, 积聚性, 几何元素...","[机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工...",U_1002476-2-0
2,U_1002476,3,0,0,V_6258951,1601736952,292B73EF985AE70A9C33DC5901307461,"[K_z坐标轴_机械工程, K_包含已知直线_机械工程, K_侧面投影_机械工程, K_辅助...","[z坐标轴, 包含已知直线, 侧面投影, 辅助平面法求直线和平面交点, 辅助平面, 几何学原...","[机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工...",U_1002476-3-0
3,U_10027765,1,0,1,V_7704153,1601717800,2C72189286FF3EF99C33DC5901307461,[],[],[],U_10027765-1-0
4,U_10027765,2,0,1,V_8186213,1604113273,DC81374DA06F06D89C33DC5901307461,[],[],[],U_10027765-2-0
...,...,...,...,...,...,...,...,...,...,...,...
2009151,U_9999820,1,0,0,V_7386531,1601434095,1BAE71E966D980C29C33DC5901307461,[],[],[],U_9999820-1-0
2009152,U_9999820,2,0,0,V_7386532,1601451341,AD99106BD3AD4A789C33DC5901307461,[],[],[],U_9999820-2-0
2009153,U_9999820,3,0,0,V_7386534,1601468512,8D21D8DA741CBD639C33DC5901307461,[],[],[],U_9999820-3-0
2009154,U_9999820,4,0,0,V_7386535,1601474220,81FDDCC818CF26339C33DC5901307461,[],[],[],U_9999820-4-0


In [7]:
ITEM_COL = "video_id"
USER_COL = "user_id"
CONTINUOUS_COL = "seg_rep_count"
CONCEPT_COL = "concepts"
FIELD_COL = "fields"
TIMESTAMP_COL = "local_start_time"

### Multiple concepts
Can't handle ragget lists -> Must pad them slice and pad

In [6]:
sessions[FIELD_COL].str.len().describe(percentiles=np.arange(0.5,1,0.05)).round(3)

count    2009156.000
mean          10.895
std           19.296
min            0.000
50%            1.000
55%            2.000
60%            3.000
65%            4.000
70%            7.000
75%           13.000
80%           21.000
85%           29.000
90%           43.000
95%           55.000
max          183.000
Name: fields, dtype: float64

In [4]:
MAX_CONCEPT_LENGTH = 10

In [8]:
def slice_and_pad(series, pad_token=0, max_length=MAX_CONCEPT_LENGTH):
    return series.transform(lambda lst: (lst + [pad_token] * max_length)[:max_length])

In [9]:
sessions[CONCEPT_COL].explode().dropna().unique().shape#.explode([CONCEPT_COL, FIELD_COL])

(172010,)

In [10]:
%%time
# Concept mapping
concept2int = {val: i for i, val in enumerate(sessions[CONCEPT_COL].explode().dropna().unique(), start=1)}
# Manually map videos without related concepts to 
concept2int[np.nan] = 0
int2concept = {i: concept for concept, i in concept2int.items()}

# Field mapping
field2int = {val: i for i, val in enumerate(sessions[FIELD_COL].explode().dropna().unique(), start=1)}
# Manually map videos without related concepts
field2int[np.nan] = 0
int2field = {i: field for field, i in field2int.items()}

# Video id mapping
video_id2int = {val: i for i, val in enumerate(sessions[ITEM_COL].unique(), start=1)}
int2video_id = {i: video_id for video_id, i in video_id2int.items()}

CPU times: user 5.43 s, sys: 191 ms, total: 5.62 s
Wall time: 5.67 s


In [11]:
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD = max(video_id2int.values()), max(concept2int.values()), max(field2int.values())
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD

(185637, 172010, 74)

In [12]:
%%time
# Basic video_id, encoding, saving 0 for padding
sessions_encoded = sessions.copy()
sessions_encoded.loc[:,ITEM_COL] = sessions_encoded[ITEM_COL].map(video_id2int)
sessions_encoded.loc[:,CONCEPT_COL] = sessions_encoded[CONCEPT_COL].transform(lambda lst: [concept2int[concept] for concept in lst])
sessions_encoded.loc[:,FIELD_COL] = sessions_encoded[FIELD_COL].transform(lambda lst: [field2int[field] for field in lst])

CPU times: user 7.34 s, sys: 197 ms, total: 7.53 s
Wall time: 7.59 s


In [62]:
%%time
# Slice and pad concepts 
sessions_encoded.loc[:, [CONCEPT_COL, FIELD_COL]] = sessions_encoded[[CONCEPT_COL, FIELD_COL]].transform(slice_and_pad, max_length=MAX_CONCEPT_LENGTH, axis=0).transform(lambda series: series.transform(lambda lst: [lst]))

CPU times: user 5.7 s, sys: 204 ms, total: 5.91 s
Wall time: 5.96 s


In [64]:
sessions_encoded[CONCEPT_COL]

0                   [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
1          [[26, 27, 28, 29, 30, 31, 32, 33, 34, 35]]
2          [[70, 26, 71, 28, 29, 72, 73, 74, 75, 76]]
3                    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
4                    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
                              ...                    
2009151              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009152              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009153              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009154              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009155              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Name: concepts, Length: 2009156, dtype: object

In [65]:
sessions_encoded[CONCEPT_COL].str.len().describe()

count    2009156.0
mean           1.0
std            0.0
min            1.0
25%            1.0
50%            1.0
75%            1.0
max            1.0
Name: concepts, dtype: float64

### Single concept
Only grab the first concept

In [None]:
# Get only the first concept and field for each video
sessions_first = sessions.copy()
sessions_first.loc[:, [CONCEPT_COL, FIELD_COL]] = sessions[[CONCEPT_COL, FIELD_COL]].transform(lambda series: series.str[0])

In [None]:
# Concept mapping
concept2int = {val: i for i, val in enumerate(sessions_first[CONCEPT_COL].dropna().unique(), start=1)}
# Manually map videos without related concepts to 
concept2int[np.nan] = 0
int2concept = {i: concept for concept, i in concept2int.items()}

# Field mapping
field2int = {val: i for i, val in enumerate(sessions_first[FIELD_COL].dropna().unique(), start=1)}
# Manually map videos without related concepts
field2int[np.nan] = 0
int2field = {i: field for field, i in field2int.items()}

# Video id mapping
video_id2int = {val: i for i, val in enumerate(sessions_first[ITEM_COL].dropna().unique(), start=1)}
int2video_id = {i: video_id for video_id, i in video_id2int.items()}

In [None]:
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD = max(video_id2int.values()), max(concept2int.values()), max(field2int.values())
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD

In [None]:
# Basic video_id, encoding, saving 0 for padding
sessions_encoded = sessions_first.copy()
sessions_encoded.loc[:,ITEM_COL] = sessions_encoded[ITEM_COL].map(video_id2int)
sessions_encoded.loc[:,CONCEPT_COL] = sessions_encoded[CONCEPT_COL].map(concept2int)
sessions_encoded.loc[:,FIELD_COL] = sessions_encoded[FIELD_COL].map(field2int)

#### Listify and process segment repetition

In [66]:
# Functions for pre processing
def minmax_norm(series):
    if not series.any():
        return series
    return ((series - series.min())/(series.max() - series.min())).fillna(0)
def adaptive_z_score(series):
    """Adaptively standardizes the segment repetition as one learns more about the user's repetition behaviour.
    First repetition is deemed as 0. Alternative, lookinto it as 1"""
    if not series.any():
        return series
    expanding_window = series.expanding(1)
    return ((series - expanding_window.mean())/expanding_window.std()).fillna(0)

In [67]:
%%time
# Pre-process and group to lists
sessions_list = (sessions_encoded.sort_values(TIMESTAMP_COL)
                     .groupby(USER_COL)
                     .agg({ITEM_COL: list, 
                           CONCEPT_COL: list,
                           FIELD_COL: list,
                           CONTINUOUS_COL: lambda series: minmax_norm(series).tolist()}))
sessions_list_hot_5 = sessions_list[sessions_list[ITEM_COL].str.len() >= 5]

CPU times: user 25.3 s, sys: 192 ms, total: 25.5 s
Wall time: 25.7 s


In [68]:
sessions_list_hot_5

Unnamed: 0_level_0,video_id,concepts,fields,seg_rep_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[11923, 11993, 11924, 11924, 11924, 12024]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.0, 1.0, 0.0, 0.0, 0.25, 0.0]"
U_10008027,"[12082, 34059, 12145, 12031, 12036, 11924]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_10012383,"[12147, 12143, 12082, 12080, 34059, 12145, 121...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
U_10018910,"[2766, 2762, 2763, 2764, 2765, 2768]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_100203,"[142499, 142500, 87277, 125176, 125177, 142500...","[[[78778, 2457, 39369, 47191, 4930, 4937, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
U_9979639,"[2769, 2766, 2762, 2763, 2764, 2768]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_998508,"[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7...","[[[2584, 2586, 59744, 59745, 2657, 7850, 35858...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
U_9988528,"[160890, 160891, 160891, 160892, 160892, 16089...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.3333333333333333, 0.0, 0.3333333333333333, ..."
U_9996819,"[2769, 2766, 2762, 2764, 2764, 2765]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"


In [59]:
sessions_list_hot_5[ITEM_COL].str.len().describe(percentiles=np.arange(.5, 1, 0.05))

count    114150.000000
mean         14.279877
std          15.113597
min           5.000000
50%           9.000000
55%          10.000000
60%          11.000000
65%          12.000000
70%          14.000000
75%          16.000000
80%          18.000000
85%          23.000000
90%          30.000000
95%          44.000000
max         570.000000
Name: video_id, dtype: float64

In [6]:
MAX_SEQUENCE_LENGTH = 50
pad_token = 0

In [8]:
LIST_COLS = [FIELD_COL, CONCEPT_COL]
REG_COLS = list(set(sessions_list_hot_5.columns) - set(LIST_COLS))

NameError: name 'sessions_list_hot_5' is not defined

In [82]:
sessions_list_h5_sliced = sessions_list_hot_5.copy()
# Slice and pad with 0
sessions_list_h5_sliced.loc[:, REG_COLS] = sessions_list_h5_sliced[REG_COLS].transform(slice_and_pad, max_length=MAX_SEQUENCE_LENGTH, axis=0)
# Must pad with nested list - as (1, MAX_CONCEPT_LENGTH) attempt
sessions_list_h5_sliced.loc[:, LIST_COLS] = sessions_list_h5_sliced[LIST_COLS].transform(slice_and_pad, 
                                                                                     pad_token=[[pad_token]*MAX_CONCEPT_LENGTH],
                                                                                     max_length=MAX_SEQUENCE_LENGTH, axis=0)

In [86]:
raw_lists = np.array(sessions_list_h5_sliced[ITEM_COL].tolist())
raw_lists

array([[ 11923,  11993,  11924, ...,      0,      0,      0],
       [ 12082,  34059,  12145, ...,      0,      0,      0],
       [ 12147,  12143,  12082, ...,      0,      0,      0],
       ...,
       [160890, 160891, 160891, ...,      0,      0,      0],
       [  2769,   2766,   2762, ...,      0,      0,      0],
       [  2769,   2766,   2763, ...,      0,      0,      0]])

In [87]:
# First encountered zero
target_idx = (raw_lists==0).argmax(1, keepdims=True) - 1 
test_values = raw_lists

In [88]:
# Fill in the value which is hidden
# ->Split into train, validation and test with last and second-to-last items

val_values = test_values.copy()
np.put_along_axis(val_values, target_idx, pad_token, axis=1)
train_values = val_values.copy()
train_mask_idx = target_idx - 1 
np.put_along_axis(train_values, train_mask_idx, pad_token, axis=1)

In [89]:
test = sessions_list_h5_sliced[[ITEM_COL]]#.to_frame()
val = pd.Series(list(val_values), index=sessions_list_h5_sliced.index, name=ITEM_COL)#.to_frame()
train = pd.Series(list(train_values), index=sessions_list_h5_sliced.index, name=ITEM_COL)#.to_frame()

In [90]:
base_df = sessions_list_h5_sliced[[CONTINUOUS_COL, CONCEPT_COL, FIELD_COL]]
test_df = base_df.merge(test, on=USER_COL)
val_df = base_df.merge(val, on=USER_COL)
train_df = base_df.merge(train, on=USER_COL)

In [91]:
val_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 11924, 0, 0, 0, 0..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 12036, 0, 0, 0, 0..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 2765, 0, 0, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[78778, 2457, 39369, 47191, 4930, 4937, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 2764, 0, 0, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2584, 2586, 59744, 59745, 2657, 7850, 35858...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 2764, 0, 0, 0, 0, 0, ..."


In [92]:
train_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 0, 0, 0, 0, 0, 0,..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 0, 0, 0, 0, 0, 0,..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 0, 0, 0, 0, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[78778, 2457, 39369, 47191, 4930, 4937, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 0, 0, 0, 0, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2584, 2586, 59744, 59745, 2657, 7850, 35858...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 0, 0, 0, 0, 0, 0, 0, ..."


In [93]:
test_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 11924, 12024, 0, ..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 12036, 11924, 0, ..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 2765, 2768, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[78778, 2457, 39369, 47191, 4930, 4937, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 2764, 2768, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2584, 2586, 59744, 59745, 2657, 7850, 35858...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 2764, 2765, 0, 0, 0, ..."


In [2]:
out_path = Path("data-concepts-multi")

In [95]:
%%time
test_df.to_parquet(out_path / "test.parquet")
val_df.to_parquet(out_path / "val.parquet")
train_df.to_parquet(out_path / "train.parquet")

CPU times: user 22.1 s, sys: 965 ms, total: 23.1 s
Wall time: 23.3 s


### Create manual schema

In [3]:
from merlin_standard_lib import Schema, Tag, ColumnSchema
from merlin.schema.tags import Tags
from merlin_standard_lib.proto.schema_bp import ValueCount, IntDomain

In [98]:
schema = Schema([
        ColumnSchema.create_categorical(ITEM_COL, MAX_VIDEO_ID, 
                                        value_count= ValueCount(0, MAX_SEQUENCE_LENGTH),
                                        min_index=0,
                                        tags=[Tag.CATEGORICAL, Tag.LIST, Tag.ITEM, Tag.ITEM_ID]
                                       ),
        ColumnSchema.create_continuous(CONTINUOUS_COL, 
                                       min_value=0, max_value=1, 
                                       value_count=ValueCount(0, MAX_SEQUENCE_LENGTH), 
                                       tags=[Tag.CONTINUOUS, Tag.LIST]),
        ColumnSchema.create_categorical(CONCEPT_COL, MAX_CONCEPT, 
                                    #value_count=ValueCount(0, MAX_SEQUENCE_LENGTH),
                                    min_index=0,
                                    shape=(MAX_SEQUENCE_LENGTH, 1, MAX_CONCEPT_LENGTH),
                                    tags=[Tag.CATEGORICAL, Tag.LIST]
                                   ),
        ColumnSchema.create_categorical(FIELD_COL, MAX_FIELD, 
                                    #value_count= ValueCount(0, MAX_SEQUENCE_LENGTH),
                                    min_index=0,
                                    shape=(MAX_SEQUENCE_LENGTH, 1, MAX_CONCEPT_LENGTH),
                                    tags=[Tag.CATEGORICAL, Tag.LIST]
                                   ),
    ])

In [99]:
schema.select_by_name(CONCEPT_COL)

[{'name': 'concepts', 'shape': {'dim': [{'size': '50'}, {'size': '1'}, {'size': '10'}]}, 'type': 'INT', 'int_domain': {'name': 'concepts', 'max': '172010', 'is_categorical': True}, 'annotation': {'tag': ['categorical', 'list']}}]

In [33]:
with open(out_path/"schema.pb", "w") as file:
    file.write(schema.to_proto_text())

# XLNet Config and such

In [24]:
import os

#os.environ["CUDA_VISIBLE_DEVICES"]="0"
import glob
import torch 

from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory
from transformers4rec.config.transformer import BertConfig
import random as pyrandom

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
INTERACTION_EMB=64
seed = 53

In [180]:
test_df.head(1)

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ...","[11923, 11993, 11924, 11924, 11924, 12024, 0, ..."


In [40]:
test = {k: torch.tensor(v, device=0).reshape(1, -1) for k, v in test_df.loc["U_10001587",:].to_dict().items()}

NameError: name 'test_df' is not defined

In [100]:
#Setting all seeds, maybe redundant
torch.manual_seed(seed)
np.random.default_rng(seed)
pyrandom.seed(seed)
# Define input module - No continous features -> No need for projection 
input_module = tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        #continuous_projection=16, #Project continous values before merging
        masking="mlm",
        d_output=INTERACTION_EMB, # Make output embedding match the embedding used in the transformer arch
)
input_module

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (continuous_module): ContinuousFeatures(
      (filter_features): FilterFeatures()
    )
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (video_id): Embedding(185638, 64, padding_idx=0)
        (concepts): Embedding(172011, 64, padding_idx=0)
        (fields): Embedding(75, 64, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=193, out_features=64, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

In [101]:
#Setting all seeds, maybe redundant
torch.manual_seed(seed)
np.random.default_rng(seed)
pyrandom.seed(seed)
# Define XLNetConfig class and set default parameters for HF XLNet config  
transformer_config = tr.XLNetConfig.build(
    d_model=INTERACTION_EMB, n_head=4, n_layer=2, total_seq_length=MAX_SEQUENCE_LENGTH
)

metrics = [NDCGAt(top_ks=[5, 10], labels_onehot=True),  
           RecallAt(top_ks=[5, 10], labels_onehot=True),
           AvgPrecisionAt(top_ks=[5, 10], labels_onehot=True)
          ]

prediction_task = tr.NextItemPredictionTask(
    weight_tying=True, metrics=metrics, loss=torch.nn.NLLLoss(ignore_index=0)
)
model = transformer_config.to_torch_model(input_module, prediction_task)
model

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): TabularSequenceFeatures(
          (to_merge): ModuleDict(
            (continuous_module): ContinuousFeatures(
              (filter_features): FilterFeatures()
            )
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (video_id): Embedding(185638, 64, padding_idx=0)
                (concepts): Embedding(172011, 64, padding_idx=0)
                (fields): Embedding(75, 64, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (projection_module): SequentialBlock(
            (0): DenseBlock(
              (0): Linear(in_features=193, out_features=64, bias=True)
              (1): ReLU(inplace=True)
            )
          )
          (_masking): MaskedLanguageModeling()
        )
        (1): TansformerBlock(
     

In [37]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer

In [102]:
#Setting all seeds, maybe redundant
seed = 53
torch.manual_seed(seed)
np.random.default_rng(seed)
pyrandom.seed(seed)

# Set hyperparameters for training 
TRAIN_BATCH_SIZE = 512
EVAL_BATCH_SIZE = 512
train_args = T4RecTrainingArguments(data_loader_engine='merlin', 
                                    per_device_train_batch_size = TRAIN_BATCH_SIZE,
                                    per_device_eval_batch_size = EVAL_BATCH_SIZE,
                                    output_dir = "./xlnet-concept-frst", 
                                    save_total_limit=5,
                                    logging_steps=100,
                                    report_to = [],
                                    num_train_epochs=27,
                                    max_sequence_length=MAX_SEQUENCE_LENGTH, 
                                    no_cuda=False,
                                    seed=seed,
                                    #save_strategy="steps",
                                    evaluation_strategy="steps",
                                    eval_steps=250,
                                    save_steps=750,
                                    #load_best_model_at_end=True,
                                    # Optimizer - Adapted from Keras Adam default params
                                    learning_rate=0.001,
                                    lr_scheduler_type='cosine', 
                                    learning_rate_num_cosine_cycles_by_epoch=1.5,
                                   )
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
    train_dataset_or_path = str(out_path / "train.parquet"),
    eval_dataset_or_path = str(out_path / "val.parquet")
)

PyTorch: setting up devices


In [103]:
# INITIAL EVAL
trainer.evaluate()

{'eval_/next-item/ndcg_at_5': 0.0,
 'eval_/next-item/ndcg_at_10': 0.0,
 'eval_/next-item/recall_at_5': 0.0,
 'eval_/next-item/recall_at_10': 0.0,
 'eval_/next-item/avg_precision_at_5': 0.0,
 'eval_/next-item/avg_precision_at_10': 0.0,
 'eval_/loss': 12.171435356140137,
 'eval_runtime': 12.5443,
 'eval_samples_per_second': 9101.806,
 'eval_steps_per_second': 17.777}

In [109]:
trainer.train()

***** Running training *****
  Num examples = 114176
  Num Epochs = 27
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 6021


Step,Training Loss,Validation Loss,/next-item/ndcg At 5,/next-item/ndcg At 10,/next-item/recall At 5,/next-item/recall At 10,/next-item/avg Precision At 5,/next-item/avg Precision At 10,/loss
250,4.8745,No log,0.298362,0.32203,0.382059,0.45512,0.270562,0.280354,6.491741
500,4.7563,No log,0.305893,0.329846,0.393132,0.467026,0.276929,0.286849,6.448344
750,4.7006,No log,0.31113,0.3352,0.397845,0.472168,0.282327,0.292281,6.399476
1000,4.6002,No log,0.313318,0.338387,0.401104,0.478546,0.284184,0.294544,6.356697
1250,4.5796,No log,0.319358,0.344811,0.410346,0.488787,0.289159,0.299715,6.319958
1500,4.5196,No log,0.324372,0.350025,0.416023,0.495164,0.293941,0.304565,6.292979
1750,4.4787,No log,0.325804,0.352483,0.419334,0.501629,0.294755,0.305805,6.252179
2000,4.4199,No log,0.33111,0.35804,0.426903,0.509917,0.299316,0.310479,6.231364
2250,4.3806,No log,0.339269,0.365577,0.434998,0.516032,0.307538,0.318456,6.199189
2500,4.3068,No log,0.33879,0.366347,0.434157,0.519089,0.307196,0.318622,6.176024


Saving model checkpoint to ./xlnet-concept-frst/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [xlnet-concept-frst/checkpoint-3000] due to args.save_total_limit
Saving model checkpoint to ./xlnet-concept-frst/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [xlnet-concept-frst/checkpoint-3750] due to args.save_total_limit
Saving model checkpoint to ./xlnet-concept-frst/checkpoint-2250
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [xlnet-concept-frst/checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to ./xlnet-concept-frst/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [xlnet-concept-frst/checkpoint-5250] due to args.save_total_limit
Saving model checkpoint to ./xlnet-concept-frst/checkpoint-3750
Trainer.model is not a `PreTraine

TrainOutput(global_step=6021, training_loss=4.267088297850427, metrics={'train_runtime': 400.0266, 'train_samples_per_second': 0.067, 'train_steps_per_second': 15.052, 'total_flos': 0.0, 'train_loss': 4.267088297850427})

In [108]:
trainer.eval_dataset_or_path = str(out_path / "val.parquet")
trainer.evaluate()

{'eval_/next-item/ndcg_at_5': 0.2945694327354431,
 'eval_/next-item/ndcg_at_10': 0.31838956475257874,
 'eval_/next-item/recall_at_5': 0.375567227602005,
 'eval_/next-item/recall_at_10': 0.4491546154022217,
 'eval_/next-item/avg_precision_at_5': 0.26765644550323486,
 'eval_/next-item/avg_precision_at_10': 0.2775011658668518,
 'eval_/loss': 6.5359086990356445,
 'eval_runtime': 7.5714,
 'eval_samples_per_second': 15079.953,
 'eval_steps_per_second': 29.453}

In [115]:
trainer.test_dataset_or_path = str(out_path / "test.parquet")
preds = trainer.predict(str(out_path / "test.parquet"))

In [119]:
preds.label_ids