# XLNet with Multi-concepts embedded
Trying to make it work with an embedding module of the fields and concepts

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import fasttext
import pickle

In [2]:
# Root of the MOOCCubeX dataset. expanduser() resolves the leading "~" so the path also
# works with APIs that do not expand it themselves (open(), Path.exists(), ...).
base_path = Path("~/fall_project/MOOCCubeX/").expanduser()
results_path = Path("./results")
relations_path = base_path / "relations"
# Location of the session table that is read with pd.read_parquet below
session2video_id_path = relations_path / "sessions_w_concepts"

In [3]:
%%time
sessions = pd.read_parquet(session2video_id_path)

CPU times: user 28.8 s, sys: 3.36 s, total: 32.2 s
Wall time: 32.5 s


In [4]:
sessions

Unnamed: 0,user_id,video_consecutive_id,session_id,seg_rep_count,video_id,local_start_time,ccid,concept_ids,concepts,fields,interaction_session
0,U_1002476,1,0,0,V_1353266,1598971366,D8EC8658CD3FC45D9C33DC5901307461,"[K_边界的受力_物理学, K_变量_物理学, K_单形_物理学, K_动量矩方程_物理学,...","[边界的受力, 变量, 单形, 动量矩方程, 观察, 轨道, 阶, 理论力学, 力的平衡，,...","[物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, 物理学, ...",U_1002476-1-0
1,U_1002476,2,0,0,V_6258948,1601736654,1D4D20B05FD0D8B99C33DC5901307461,"[K_包含已知直线_机械工程, K_辅助平面法_机械工程, K_辅助平面法求直线和平面交点_...","[包含已知直线, 辅助平面法, 辅助平面法求直线和平面交点, 辅助平面, 积聚性, 几何元素...","[机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工...",U_1002476-2-0
2,U_1002476,3,0,0,V_6258951,1601736952,292B73EF985AE70A9C33DC5901307461,"[K_z坐标轴_机械工程, K_包含已知直线_机械工程, K_侧面投影_机械工程, K_辅助...","[z坐标轴, 包含已知直线, 侧面投影, 辅助平面法求直线和平面交点, 辅助平面, 几何学原...","[机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工程, 机械工...",U_1002476-3-0
3,U_10027765,1,0,1,V_7704153,1601717800,2C72189286FF3EF99C33DC5901307461,[],[],[],U_10027765-1-0
4,U_10027765,2,0,1,V_8186213,1604113273,DC81374DA06F06D89C33DC5901307461,[],[],[],U_10027765-2-0
...,...,...,...,...,...,...,...,...,...,...,...
2009151,U_9999820,1,0,0,V_7386531,1601434095,1BAE71E966D980C29C33DC5901307461,[],[],[],U_9999820-1-0
2009152,U_9999820,2,0,0,V_7386532,1601451341,AD99106BD3AD4A789C33DC5901307461,[],[],[],U_9999820-2-0
2009153,U_9999820,3,0,0,V_7386534,1601468512,8D21D8DA741CBD639C33DC5901307461,[],[],[],U_9999820-3-0
2009154,U_9999820,4,0,0,V_7386535,1601474220,81FDDCC818CF26339C33DC5901307461,[],[],[],U_9999820-4-0


In [3]:
# Column names of the session table that are referenced throughout the notebook
ITEM_COL = "video_id"  # item identifier; encoded to integers further down
USER_COL = "user_id"  # grouping key for building per-user sequences
CONTINUOUS_COL = "seg_rep_count"  # numeric feature, min-max normalised per user before export
CONCEPT_COL = "concepts"  # list-valued column of concept names
FIELD_COL = "fields"  # list-valued column of field names
TIMESTAMP_COL = "local_start_time"  # used to order each user's sessions

### Embedding fields and concepts
* Must aggregate concepts and fields -> Simple per-dim average
* Some fields might be the same -> Must use the set first
* **Currently** - `not-field` fields are embedded the same way as regular fields

In [6]:
# Later - download the tokenizer bin and nltk separately to avoid installing them in the container
# Pre-trained fastText model; the file name suggests Chinese 300-dimensional vectors
# (TXT_EMBED_DIM is set to 300 further down) - TODO confirm
emb_table = fasttext.load_model("cc.zh.300.bin")



In [None]:
# Deprecated - Only encoding, embeddings come later on
def embed_lst(lst):
    """Embed every distinct item of ``lst`` and average the vectors per dimension.

    Deprecated: the notebook now stores integer encodings and looks the vectors up
    in an embedding table instead.

    Parameters
    ----------
    lst : sequence of str
        Items to embed; duplicates are collapsed before averaging.

    Returns
    -------
    list of list of float
        A nested list holding a single row (the per-dimension mean). An empty
        ``lst`` yields one all-zero row of width ``TXT_EMBED_DIM``.
    """
    # Always return plain nested lists (never an ndarray): nested arrays caused
    # errors with PyArrow, so the empty case must use the same type as the rest.
    if len(lst) == 0:
        return [[0.0] * TXT_EMBED_DIM]
    unique_items = set(lst)
    vectors = np.array([emb_table.get_word_vector(item) for item in unique_items])
    return vectors.mean(axis=0, keepdims=True).tolist()

`print-sentence-vectors` does not output the given sentence -> the order is given by the lines above

### Create embedding tables
Have to use the encoded strings as integer lookups in a manually created lookup table

#### Encoding of fields

In [8]:
# Flatten the per-session lists, drop the NaN entries produced by empty lists and strip
# surrounding whitespace, leaving every distinct concept/field name exactly once
unique_concepts = sessions[CONCEPT_COL].explode().dropna().str.strip().unique()
unique_fields = sessions[FIELD_COL].explode().dropna().str.strip().unique()

In [9]:
%%time
# Concept mapping, saving 0 for padding
concept2int = {val: i for i, val in enumerate(unique_concepts, start=1)}
# Manually map videos without related concepts (NaN) to the padding index 0
concept2int[np.nan] = 0
# Reverse lookup: integer encoding -> concept (index 0 maps back to NaN)
int2concept = {i: concept for concept, i in concept2int.items()}

# Field mapping, saving 0 for padding
field2int = {val: i for i, val in enumerate(unique_fields, start=1)}
# Manually map videos without related fields (NaN) to the padding index 0
field2int[np.nan] = 0
# Reverse lookup: integer encoding -> field (index 0 maps back to NaN)
int2field = {i: field for field, i in field2int.items()}

# Video id mapping, saving 0 for padding
video_id2int = {val: i for i, val in enumerate(sessions[ITEM_COL].unique(), start=1)}
# Reverse lookup: integer encoding -> video id (no entry for the padding index)
int2video_id = {i: video_id for video_id, i in video_id2int.items()}

CPU times: user 199 ms, sys: 9.73 ms, total: 209 ms
Wall time: 213 ms


In [10]:
print(*list(concept2int.items())[:10], sep="\n")

('边界的受力', 1)
('变量', 2)
('单形', 3)
('动量矩方程', 4)
('观察', 5)
('轨道', 6)
('阶', 7)
('理论力学', 8)
('力的平衡，', 9)
('力学', 10)


In [11]:
# Largest encoded index per feature. Excludes np.nan, which is mapped to 0 for concepts and fields
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD = max(video_id2int.values()), max(concept2int.values()), max(field2int.values())
MAX_VIDEO_ID, MAX_CONCEPT, MAX_FIELD

(185637, 171411, 74)

In [4]:
TXT_EMBED_DIM = 300
PAD_TOKEN = 0
MAX_CONCEPT_LENGTH = 10

There were some problems storing concepts as strings in a file. The cause is unknown, but it is likely an encoding issue of some sort.

In [13]:
%%time
embedded_concepts = np.array([emb_table.get_sentence_vector(concept) for concept in unique_concepts])

CPU times: user 1.57 s, sys: 140 ms, total: 1.71 s
Wall time: 1.72 s


In [14]:
embedded_fields = np.array([emb_table.get_sentence_vector(field) for field in unique_fields])

In [15]:
# Create explicit mapping between feature and embedding
field2float = dict(zip(unique_fields, embedded_fields))
concept2float = dict(zip(unique_concepts, embedded_concepts))

In [5]:
SAVE = False

In [28]:
# Verify that the embeddings stored on disk equal the freshly generated ones.
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
stored_embeddings_ok = False
try:
    with open("nlp/concept-embedding-sentence.pickle", "rb") as f:
        test_concept_dict = pickle.load(f)
    assert all([(concept2float[concept] == embed).all() for concept, embed in test_concept_dict.items()]), "The stored concept embeddings do not match the generated ones"
    with open("nlp/field-embedding-sentence.pickle", "rb") as f:
        test_field_dict = pickle.load(f)
    assert all([(field2float[field] == embed).all() for field, embed in test_field_dict.items()]), "The stored field embeddings do not match the generated ones"
    stored_embeddings_ok = True
except FileNotFoundError:
    # First run: nothing has been stored yet, so there is nothing to compare against
    print("No stored embeddings found")
except (AssertionError, KeyError):
    # KeyError: a stored concept/field is missing from the freshly generated mapping
    print("Stored and newly created embeddings do not match")

if SAVE and not stored_embeddings_ok:
    print("Overriding with new embeddings")
    # Each file receives the mapping named in its path (concepts -> concept file, fields -> field file)
    with open("nlp/concept-embedding-sentence.pickle", "wb") as f:
        pickle.dump(concept2float, f)
    with open("nlp/field-embedding-sentence.pickle", "wb") as f:
        pickle.dump(field2float, f)

In [23]:
# Create explicit mapping between encoding and embedding + add pad token
# Keys are the integer encodings from concept2int / field2int, values the embedding vectors
concept_enc2float = {concept2int[concept]: embed for concept, embed in concept2float.items()}
# The padding index is represented by an all-zero vector
concept_enc2float[PAD_TOKEN] = np.zeros(TXT_EMBED_DIM)
field_enc2float = {field2int[field]: embed for field, embed in field2float.items()}
field_enc2float[PAD_TOKEN] = np.zeros(TXT_EMBED_DIM)

In [29]:
if SAVE:
    with open("nlp/concept-enc-embedding-sentence.pickle", "wb") as f:
        pickle.dump(concept_enc2float, f)
    with open("nlp/field-enc-embedding-sentence.pickle", "wb") as f:
        pickle.dump(field_enc2float, f)

In [24]:
# Order the (encoding, embedding) pairs by encoding so that the list position can be
# used as the row index of the embedding table
fields_as_list = sorted(field_enc2float.items(), key=lambda x: x[0])
concepts_as_list = sorted(concept_enc2float.items(), key=lambda x: x[0])
# Verify that the order is as expected: the encodings must be exactly 0..N-1 (padding
# first), otherwise row i of the table would not belong to encoding i
assert [idx for idx, _ in fields_as_list] == list(range(len(fields_as_list))), "Field encodings are not contiguous from 0"
assert [idx for idx, _ in concepts_as_list] == list(range(len(concepts_as_list))), "Concept encodings are not contiguous from 0"

In [13]:
def save_embedding_table(feature_name, embedding_table):
    """Write ``embedding_table`` to ``embeddings/session_<feature_name>.npy`` using ``np.save``."""
    with open(f"embeddings/session_{feature_name}.npy", "wb") as handle:
        np.save(handle, embedding_table)
        
def load_embedding_table(feature_name):
    """Load the embedding table stored at ``embeddings/session_<feature_name>.npy``.

    Returns the array read by ``np.load``.
    """
    with open(f"embeddings/session_{feature_name}.npy", "rb") as handle:
        return np.load(handle)

In [31]:
# Stack the ordered embedding vectors into 2D tables, one row per encoding
# (redundant with the dictionaries above, but this is the form that gets stored)
field_embed_table = np.stack([embedding for _, embedding in fields_as_list])
concept_embed_table = np.stack([embedding for _, embedding in concepts_as_list])

In [32]:
field_embed_table.shape, concept_embed_table.shape

((75, 300), (171412, 300))

In [33]:
# Store the weight matrices themselves to be loaded - Each row corresponding to the encoded index
if SAVE:
    save_embedding_table(CONCEPT_COL, concept_embed_table)
    save_embedding_table(FIELD_COL, field_embed_table)

In [385]:
# Deprecated due to text problems with Chinese
def create_embedding_table(feature_name, pad_value=0, embed_dim=TXT_EMBED_DIM):
    """Load an embedding table generated by `print-sentence-vectors` and prepend a padding row.

    Parameters
    ----------
    feature_name : str
        Feature whose table is read from ``nlp/session_<feature_name>_embed_sentence.txt``.
    pad_value : float, default 0
        Value that fills the padding row.
    embed_dim : int, default TXT_EMBED_DIM
        Width of the padding row; it has to equal the number of columns in the file,
        otherwise the concatenation fails.

    Returns
    -------
    numpy.ndarray
        The loaded table with the padding row inserted at index 0
        (TODO: change to pad token index).
    """
    in_name = f"nlp/session_{feature_name}_embed_sentence.txt"
    loaded_table = np.loadtxt(in_name)
    # Legacy approach (disabled): parse "<feature> <vector>" records, encode the feature
    # through feature2int, overwrite out-of-vocabulary rows with pad_value and sort the
    # rows by their encoded index before dropping the index column.
    # Add the manual padding entry as the first row
    return np.concatenate(([[pad_value] * embed_dim], loaded_table))


### Encode table

In [34]:
%%time
# Basic video_id encoding; concept/field names are stripped because the lookup keys were stripped when the mappings were built
# Work on a copy so the raw `sessions` frame stays untouched
sessions_encoded = sessions.copy()
sessions_encoded.loc[:,ITEM_COL] = sessions_encoded[ITEM_COL].map(video_id2int)
# Replace every name in the per-session lists by its integer encoding (empty lists stay empty)
sessions_encoded.loc[:,CONCEPT_COL] = sessions_encoded[CONCEPT_COL].transform(lambda lst: [concept2int[concept.strip()] for concept in lst])
sessions_encoded.loc[:,FIELD_COL] = sessions_encoded[FIELD_COL].transform(lambda lst: [field2int[field.strip()] for field in lst])

CPU times: user 11.3 s, sys: 383 ms, total: 11.7 s
Wall time: 11.7 s


In [35]:
def slice_and_pad(series, pad_token=0, max_length=MAX_CONCEPT_LENGTH):
    """Bring every list in ``series`` to exactly ``max_length`` entries.

    Longer lists keep their first ``max_length`` entries; shorter ones are filled
    up at the end with ``pad_token``.
    """
    def _fit(lst):
        padded = lst + [pad_token] * max_length
        return padded[:max_length]

    return series.transform(_fit)

In [36]:
%%time
# Slice and pad concepts to avoid ragged list 
# Every session ends up with exactly MAX_CONCEPT_LENGTH concept and field ids
sessions_encoded.loc[:, [CONCEPT_COL, FIELD_COL]] = (sessions_encoded[[CONCEPT_COL, FIELD_COL]]
                                                     .transform(slice_and_pad, max_length=MAX_CONCEPT_LENGTH, axis=0)
                                                     # Convert to 2D array which is needed for embedding
                                                     # (each list is wrapped in an outer list, giving shape (1, MAX_CONCEPT_LENGTH))
                                                     .transform(lambda series: series.transform(lambda lst: [lst])))

CPU times: user 8.27 s, sys: 400 ms, total: 8.67 s
Wall time: 8.73 s


In [38]:
sessions_encoded[CONCEPT_COL]

0                   [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
1          [[26, 27, 28, 29, 30, 31, 32, 33, 34, 35]]
2          [[70, 26, 71, 28, 29, 72, 73, 74, 75, 76]]
3                    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
4                    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
                              ...                    
2009151              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009152              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009153              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009154              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2009155              [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Name: concepts, Length: 2009156, dtype: object

#### Listify and process segment repetition

In [39]:
# Functions for pre processing
def minmax_norm(series):
    """Scale ``series`` linearly so that its minimum maps to 0 and its maximum to 1.

    A series without any truthy value is returned unchanged. Undefined results
    (all values identical, i.e. a zero range) are replaced by 0.
    """
    if series.any():
        lowest, highest = series.min(), series.max()
        scaled = (series - lowest) / (highest - lowest)
        return scaled.fillna(0)
    return series
def adaptive_z_score(series):
    """Standardise each value against the mean and std of all values seen so far.

    The statistics come from an expanding window, so the standardisation adapts as
    more of the user's repetition behaviour becomes known. The first value has no
    defined std and is therefore set to 0 (alternative: look into using 1).
    A series without any truthy value is returned unchanged.
    """
    if series.any():
        running = series.expanding(1)
        centered = series - running.mean()
        return (centered / running.std()).fillna(0)
    return series

In [40]:
%%time
# Pre-process and group to lists
# Sorting by timestamp first makes every per-user list chronological
sessions_list = (sessions_encoded.sort_values(TIMESTAMP_COL)
                     .groupby(USER_COL)
                     .agg({ITEM_COL: list, 
                           CONCEPT_COL: list,
                           FIELD_COL: list,
                           # Normalised over the user's complete history, then converted to a plain list
                           CONTINUOUS_COL: lambda series: minmax_norm(series).tolist()}))
# Keep only users with at least 5 interactions
sessions_list_hot_5 = sessions_list[sessions_list[ITEM_COL].str.len() >= 5]

CPU times: user 29.2 s, sys: 148 ms, total: 29.4 s
Wall time: 29.6 s


In [41]:
np.array(sessions_list_hot_5[CONCEPT_COL][0][0]).shape

(1, 10)

In [42]:
sessions_list_hot_5

Unnamed: 0_level_0,video_id,concepts,fields,seg_rep_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[11923, 11993, 11924, 11924, 11924, 12024]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.0, 1.0, 0.0, 0.0, 0.25, 0.0]"
U_10008027,"[12082, 34059, 12145, 12031, 12036, 11924]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_10012383,"[12147, 12143, 12082, 12080, 34059, 12145, 121...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
U_10018910,"[2766, 2762, 2763, 2764, 2765, 2768]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_100203,"[142499, 142500, 87277, 125176, 125177, 142500...","[[[22186, 2450, 39251, 47053, 4908, 4915, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
U_9979639,"[2769, 2766, 2762, 2763, 2764, 2768]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"
U_998508,"[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7...","[[[2577, 2579, 59554, 59555, 2649, 7814, 35747...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
U_9988528,"[160890, 160891, 160891, 160892, 160892, 16089...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0.3333333333333333, 0.0, 0.3333333333333333, ..."
U_9996819,"[2769, 2766, 2762, 2764, 2764, 2765]","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[0, 0, 0, 0, 0, 0]"


In [43]:
sessions_list_hot_5[ITEM_COL].str.len().describe(percentiles=np.arange(.5, 1, 0.05))

count    114150.000000
mean         14.279877
std          15.113597
min           5.000000
50%           9.000000
55%          10.000000
60%          11.000000
65%          12.000000
70%          14.000000
75%          16.000000
80%          18.000000
85%          23.000000
90%          30.000000
95%          44.000000
max         570.000000
Name: video_id, dtype: float64

In [6]:
MAX_SEQUENCE_LENGTH = 50
#pad_token = 0

In [17]:
# Columns whose sequence elements are themselves nested lists
LIST_COLS = [FIELD_COL, CONCEPT_COL]
# Remaining sequence columns. The comprehension keeps the frame's column order;
# a set difference would yield a different order from run to run.
REG_COLS = [col for col in sessions_list_hot_5.columns if col not in LIST_COLS]

NameError: name 'sessions_list_hot_5' is not defined

#### Slice and pad sequences

In [98]:
%%time
# Work on a copy so the unsliced per-user lists remain available
sessions_list_h5_sliced = sessions_list_hot_5.copy()
# Slice and pad with 0
# (slice_and_pad keeps the first MAX_SEQUENCE_LENGTH entries of longer sequences)
sessions_list_h5_sliced.loc[:, REG_COLS] = sessions_list_h5_sliced[REG_COLS].transform(slice_and_pad, max_length=MAX_SEQUENCE_LENGTH, axis=0)
# Must pad with nested list - as (1, MAX_CONCEPT_LENGTH) attempt
# so that padded steps have the same nesting as real steps
sessions_list_h5_sliced.loc[:, LIST_COLS] = sessions_list_h5_sliced[LIST_COLS].transform(slice_and_pad, 
                                                                                     pad_token=[[PAD_TOKEN]*MAX_CONCEPT_LENGTH],
                                                                                     max_length=MAX_SEQUENCE_LENGTH, axis=0)

CPU times: user 2.41 s, sys: 173 ms, total: 2.58 s
Wall time: 2.61 s


In [99]:
raw_lists = np.array(sessions_list_h5_sliced[ITEM_COL].tolist())
raw_lists

array([[ 11923,  11993,  11924, ...,      0,      0,      0],
       [ 12082,  34059,  12145, ...,      0,      0,      0],
       [ 12147,  12143,  12082, ...,      0,      0,      0],
       ...,
       [160890, 160891, 160891, ...,      0,      0,      0],
       [  2769,   2766,   2762, ...,      0,      0,      0],
       [  2769,   2766,   2763, ...,      0,      0,      0]])

In [100]:
# First encountered zero
# Position of the last real item = index of the first padding zero minus one.
# Rows without any padding give argmax == 0, so the index becomes -1, which still
# addresses the last position - NOTE(review): relies on negative indices being accepted downstream
target_idx = (raw_lists==0).argmax(1, keepdims=True) - 1 
# The test split keeps the complete sequences
test_values = raw_lists

In [101]:
# Fill in the value which is hidden
# ->Split into train, validation and test with last and second-to-last items

# Validation: copy of the test sequences with the last real item replaced by padding
val_values = test_values.copy()
np.put_along_axis(val_values, target_idx, PAD_TOKEN, axis=1)
# Train: additionally hide the second-to-last item
train_values = val_values.copy()
train_mask_idx = target_idx - 1 
np.put_along_axis(train_values, train_mask_idx, PAD_TOKEN, axis=1)

In [102]:
test = sessions_list_h5_sliced[[ITEM_COL]]#.to_frame()
val = pd.Series(list(val_values), index=sessions_list_h5_sliced.index, name=ITEM_COL)#.to_frame()
train = pd.Series(list(train_values), index=sessions_list_h5_sliced.index, name=ITEM_COL)#.to_frame()

In [103]:
# Side features shared by all three splits; only the item sequence differs per split
# NOTE(review): these features are not masked at the held-out positions - confirm the
# model cannot read the target step's concepts/fields/repetition count
base_df = sessions_list_h5_sliced[[CONTINUOUS_COL, CONCEPT_COL, FIELD_COL]]
# Attach the split-specific item sequences by user id
test_df = base_df.merge(test, on=USER_COL)
val_df = base_df.merge(val, on=USER_COL)
train_df = base_df.merge(train, on=USER_COL)

In [104]:
val_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 11924, 0, 0, 0, 0..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 12036, 0, 0, 0, 0..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 2765, 0, 0, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[22186, 2450, 39251, 47053, 4908, 4915, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 2764, 0, 0, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2577, 2579, 59554, 59555, 2649, 7814, 35747...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 2764, 0, 0, 0, 0, 0, ..."


In [105]:
train_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 0, 0, 0, 0, 0, 0,..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 0, 0, 0, 0, 0, 0,..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 0, 0, 0, 0, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[22186, 2450, 39251, 47053, 4908, 4915, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 0, 0, 0, 0, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2577, 2579, 59554, 59555, 2649, 7814, 35747...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 0, 0, 0, 0, 0, 0, 0, ..."


In [106]:
test_df

Unnamed: 0_level_0,seg_rep_count,concepts,fields,video_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U_10001587,"[0.0, 1.0, 0.0, 0.0, 0.25, 0.0, 0, 0, 0, 0, 0,...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[11923, 11993, 11924, 11924, 11924, 12024, 0, ..."
U_10008027,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12082, 34059, 12145, 12031, 12036, 11924, 0, ..."
U_10012383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[12147, 12143, 12082, 12080, 34059, 12145, 121..."
U_10018910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2766, 2762, 2763, 2764, 2765, 2768, 0, 0, 0, ..."
U_100203,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[22186, 2450, 39251, 47053, 4908, 4915, 0, 0...","[[[6, 6, 6, 6, 6, 6, 0, 0, 0, 0]], [[6, 6, 6, ...","[142499, 142500, 87277, 125176, 125177, 142500..."
...,...,...,...,...
U_9979639,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2763, 2764, 2768, 0, 0, 0, ..."
U_998508,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[[2577, 2579, 59554, 59555, 2649, 7814, 35747...","[[[18, 18, 18, 18, 18, 18, 18, 18, 19, 18]], [...","[25718, 2221, 2222, 2223, 2953, 2953, 28783, 7..."
U_9988528,"[0.3333333333333333, 0.0, 0.3333333333333333, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[160890, 160891, 160891, 160892, 160892, 16089..."
U_9996819,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, ...","[2769, 2766, 2762, 2764, 2764, 2765, 0, 0, 0, ..."


In [121]:
# Deprecated
def debug(row):
    """Report concept/field cells of ``row`` whose array shape is not (50, 1, 300).

    Prints one message per offending column and returns ``row`` unchanged, so the
    function can be passed to a row-wise ``apply``.
    """
    expected_shape = (50, 1, 300)
    for col in (CONCEPT_COL, FIELD_COL):
        if np.array(row[col]).shape == expected_shape:
            continue
        print(f"something isn't as expected, {col}, {row.name}")
    return row


In [58]:
import pyarrow as pa

In [120]:
# Define a PyArrow schema to improve conversion/storage speed, as well as size
# Concepts and fields share one layout: MAX_SEQUENCE_LENGTH steps, each holding a
# single list of MAX_CONCEPT_LENGTH integer ids
_step_ids_type = pa.list_(pa.int64(), list_size=MAX_CONCEPT_LENGTH)
_nested_ids_type = pa.list_(
    pa.list_(_step_ids_type, list_size=1),
    list_size=MAX_SEQUENCE_LENGTH,
)
pa_fields = [
    pa.field(CONTINUOUS_COL, pa.list_(pa.float32(), list_size=MAX_SEQUENCE_LENGTH)),
    pa.field(ITEM_COL, pa.list_(pa.int64(), list_size=MAX_SEQUENCE_LENGTH)),
    pa.field(CONCEPT_COL, _nested_ids_type),
    pa.field(FIELD_COL, _nested_ids_type),
]
pa_schema = pa.schema(pa_fields)
pa_schema

seg_rep_count: fixed_size_list<item: float>[50]
  child 0, item: float
video_id: fixed_size_list<item: int64>[50]
  child 0, item: int64
concepts: fixed_size_list<item: fixed_size_list<item: fixed_size_list<item: int64>[10]>[1]>[50]
  child 0, item: fixed_size_list<item: fixed_size_list<item: int64>[10]>[1]
      child 0, item: fixed_size_list<item: int64>[10]
          child 0, item: int64
fields: fixed_size_list<item: fixed_size_list<item: fixed_size_list<item: int64>[10]>[1]>[50]
  child 0, item: fixed_size_list<item: fixed_size_list<item: int64>[10]>[1]
      child 0, item: fixed_size_list<item: int64>[10]
          child 0, item: int64

In [114]:
out_name2df =  {"test_pa.parquet": test_df, "val_pa.parquet": val_df, "train_pa.parquet": train_df}

In [117]:
%%timeit
#pa.Table.from_pandas(test_df, schema=pa_schema)
#pa.Table.from_pandas(val_df, schema=pa_schema)
#pa.Table.from_pandas(train_df, schema=pa_schema)    

14.8 s ± 84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [118]:
%%timeit
#pa.Table.from_pandas(test_df)
#pa.Table.from_pandas(val_df)
#pa.Table.from_pandas(train_df)

26.1 s ± 193 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
out_path = Path("data-concepts-embed")

In [258]:
%%time
# fixed_size_list is not recognized further down the pipeline
test_df.to_parquet(out_path / "test.parquet")#, schema=pa_schema)
val_df.to_parquet(out_path / "val.parquet")#, schema=pa_schema)
train_df.to_parquet(out_path / "train.parquet")#, schema=pa_schema)

CPU times: user 31.6 s, sys: 2.58 s, total: 34.2 s
Wall time: 34.4 s


### Create manual schema

In [8]:
from merlin_standard_lib import Schema, Tag, ColumnSchema
from merlin.schema.tags import Tags
from merlin_standard_lib.proto.schema_bp import ValueCount, IntDomain

In [126]:
# Manual schema describing the four exported columns
schema = Schema([
        # Item id sequence: up to MAX_SEQUENCE_LENGTH values, index 0 included for padding
        ColumnSchema.create_categorical(ITEM_COL, MAX_VIDEO_ID, 
                                        value_count= ValueCount(0, MAX_SEQUENCE_LENGTH),
                                        min_index=0,
                                        tags=[Tag.CATEGORICAL, Tag.LIST, Tag.ITEM, Tag.ITEM_ID]
                                       ),
        # Normalised repetition count, declared with the value range [0, 1]
        ColumnSchema.create_continuous(CONTINUOUS_COL, 
                                       min_value=0, max_value=1, 
                                       value_count=ValueCount(0, MAX_SEQUENCE_LENGTH), 
                                       tags=[Tag.CONTINUOUS, Tag.LIST]),
        # Concept ids: fixed shape (sequence step, 1, concept slot) instead of a value count
        ColumnSchema.create_categorical(CONCEPT_COL, MAX_CONCEPT, 
                                    min_index=0,
                                    shape=(MAX_SEQUENCE_LENGTH, 1, MAX_CONCEPT_LENGTH),
                                    tags=[Tag.CATEGORICAL, Tag.LIST]
                                   ),
        # Field ids: same fixed shape as the concepts
        ColumnSchema.create_categorical(FIELD_COL, MAX_FIELD, 
                                    #value_count= ValueCount(0, MAX_SEQUENCE_LENGTH),
                                    min_index=0,
                                    shape=(MAX_SEQUENCE_LENGTH, 1, MAX_CONCEPT_LENGTH),
                                    tags=[Tag.CATEGORICAL, Tag.LIST]
                                   ),
    ])

In [249]:
schema.select_by_name(ITEM_COL).column_schemas

[ColumnSchema(name='video_id', deprecated=False, presence=FeaturePresence(min_fraction=0.0, min_count=0), group_presence=FeaturePresenceWithinGroup(required=False), shape=FixedShape(dim=[]), value_count=ValueCount(min=0, max=50), value_counts=ValueCountList(value_count=[]), type=2, domain='', int_domain=IntDomain(name='video_id', min=0, max=185637, is_categorical=True), float_domain=FloatDomain(name='', min=0.0, max=0.0, disallow_nan=False, disallow_inf=False, is_embedding=False), string_domain=StringDomain(name='', value=[]), bool_domain=BoolDomain(name='', true_value='', false_value=''), struct_domain=StructDomain(feature=[], sparse_feature=[]), natural_language_domain=NaturalLanguageDomain(vocabulary='', coverage=FeatureCoverageConstraints(min_coverage=0.0, min_avg_token_length=0.0, excluded_string_tokens=[], excluded_int_tokens=[], oov_string_tokens=[]), token_constraints=[], sequence_length_constraints=SequenceLengthConstraints(excluded_int_value=[], excluded_string_value=[], min_

In [127]:
schema.select_by_name(CONCEPT_COL)

[{'name': 'concepts', 'shape': {'dim': [{'size': '50'}, {'size': '1'}, {'size': '10'}]}, 'type': 'INT', 'int_domain': {'name': 'concepts', 'max': '171411', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical']}}]

In [128]:
with open(out_path/"schema.pb", "w") as file:
    file.write(schema.to_proto_text())

# XLNet Config and such

In [20]:
import os

#os.environ["CUDA_VISIBLE_DEVICES"]="0"
import glob
import torch 

from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory
from transformers4rec.config.transformer import BertConfig
import random as pyrandom

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Rebuild the schema from the proto-text file "schema.pb" under out_path and display it.
schema_file = out_path / "schema.pb"
schema = Schema().from_proto_text(schema_file)
schema

[{'name': 'video_id', 'value_count': {'max': '50'}, 'type': 'INT', 'int_domain': {'name': 'video_id', 'max': '185637', 'is_categorical': True}, 'annotation': {'tag': ['list', 'item', 'item_id', 'categorical']}}, {'name': 'seg_rep_count', 'value_count': {'max': '50'}, 'type': 'FLOAT', 'float_domain': {'name': 'seg_rep_count', 'max': 1.0}, 'annotation': {'tag': ['list', 'continuous']}}, {'name': 'concepts', 'shape': {'dim': [{'size': '50'}, {'size': '1'}, {'size': '10'}]}, 'type': 'INT', 'int_domain': {'name': 'concepts', 'max': '171411', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical']}}, {'name': 'fields', 'shape': {'dim': [{'size': '50'}, {'size': '1'}, {'size': '10'}]}, 'type': 'INT', 'int_domain': {'name': 'fields', 'max': '74', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical']}}]

#### Creating the embedding module

In [15]:
# Load the pre-trained weight matrices for the concept and field columns.
# NOTE(review): emb_path is not passed to load_embedding_table; presumably that
# helper (defined elsewhere) reads it as a global -- confirm.
emb_path = Path("embeddings")
pre_trained_concepts = load_embedding_table(CONCEPT_COL)
pre_trained_fields = load_embedding_table(FIELD_COL)
if SAVE:
    # Sanity check: each table must have one row per entry of its encoding map
    # and TXT_EMBED_DIM columns. Each message names the table that failed.
    assert pre_trained_concepts.shape == (len(concept_enc2float), TXT_EMBED_DIM), (
        f"Concept embedding mismatch, actual: {pre_trained_concepts.shape}"
    )
    assert pre_trained_fields.shape == (len(field_enc2float), TXT_EMBED_DIM), (
        f"Field embedding mismatch, actual: {pre_trained_fields.shape}"
    )

In [46]:
# Embedding-module settings
seed = 53                        # RNG seed reused by the model and trainer cells
INTERACTION_EMB = 512            # width of the input-module output (also the transformer d_model)
TRAINABLE = False                # forwarded as `trainable=` to the pre-trained initializers
PRE_TRAINED_COLS = LIST_COLS[:]  # slice-copy so LIST_COLS itself is not aliased

In [47]:
# Every pre-trained column gets an embedding of width TXT_EMBED_DIM.
embed_dims = dict.fromkeys(PRE_TRAINED_COLS, TXT_EMBED_DIM)

# Pair each column with its pre-trained weight matrix and wrap it in an
# initializer; TRAINABLE is forwarded as the `trainable` flag.
pretrained_weights = (
    (FIELD_COL, pre_trained_fields),
    (CONCEPT_COL, pre_trained_concepts),
)
embed_init = {
    column: tr.PretrainedEmbeddingsInitializer(weights, trainable=TRAINABLE)
    for column, weights in pretrained_weights
}

In [65]:
# Build the sequence input block from the schema: MLM masking, pre-trained
# embeddings for the concept/field columns, projected to INTERACTION_EMB.
# NOTE(review): the layers created here are initialised before the seeds are set
# in the following cell -- confirm whether that matters for reproducibility.
input_module = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    masking="mlm",
    d_output=INTERACTION_EMB,
    # Categorical embedding kwargs
    embedding_dims=embed_dims,
    embeddings_initializers=embed_init,
)

# HARD CODED FOR NOW - Size of all features after concatenation:
# 64 (video_id embedding) + 2 * TXT_EMBED_DIM (concepts and fields) + 1 (continuous feature).
concat_width = 64 + TXT_EMBED_DIM * 2 + 1
# Swap the first projection layer for one that accepts the concatenated width.
input_module.projection_module[0][0] = torch.nn.Linear(
    in_features=concat_width, out_features=INTERACTION_EMB
)
input_module

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (continuous_module): ContinuousFeatures(
      (filter_features): FilterFeatures()
    )
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (video_id): Embedding(185638, 64, padding_idx=0)
        (concepts): Embedding(171412, 300, padding_idx=0)
        (fields): Embedding(75, 300, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=665, out_features=512, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

In [66]:
# Seed every RNG in use: torch, NumPy's global state, and the stdlib `random` module.
torch.manual_seed(seed)
# np.random.default_rng(seed) only *returns* a new Generator; the result was
# discarded, so NumPy's global RNG stayed unseeded. np.random.seed seeds it.
np.random.seed(seed)
pyrandom.seed(seed)

# Define XLNetConfig class and set default parameters for HF XLNet config
transformer_config = tr.XLNetConfig.build(
    d_model=INTERACTION_EMB, n_head=4, n_layer=2, total_seq_length=MAX_SEQUENCE_LENGTH
)

# Ranking metrics reported at cut-offs 5 and 10.
metrics = [NDCGAt(top_ks=[5, 10], labels_onehot=True),
           RecallAt(top_ks=[5, 10], labels_onehot=True),
           AvgPrecisionAt(top_ks=[5, 10], labels_onehot=True)
          ]

# Next-item prediction head; the loss ignores target index 0
# (the embedding tables use padding_idx=0).
prediction_task = tr.NextItemPredictionTask(
    weight_tying=True, metrics=metrics, loss=torch.nn.NLLLoss(ignore_index=0)
)
# Combine the input block, the XLNet body and the prediction head into one model.
model = transformer_config.to_torch_model(input_module, prediction_task)
model

Projecting inputs of NextItemPredictionTask to'64' As weight tying requires the input dimension '512' to be equal to the item-id embedding dimension '64'


Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): TabularSequenceFeatures(
          (to_merge): ModuleDict(
            (continuous_module): ContinuousFeatures(
              (filter_features): FilterFeatures()
            )
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (video_id): Embedding(185638, 64, padding_idx=0)
                (concepts): Embedding(171412, 300, padding_idx=0)
                (fields): Embedding(75, 300, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (projection_module): SequentialBlock(
            (0): DenseBlock(
              (0): Linear(in_features=665, out_features=512, bias=True)
              (1): ReLU(inplace=True)
            )
          )
          (_masking): MaskedLanguageModeling()
        )
        (1): TansformerBlock(
  

In [24]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer

In [27]:
# Directory handed to the training arguments as `output_dir`.
model_path = Path("./xlnet-concept-embed")

In [67]:
# Seed every RNG in use: torch, NumPy's global state, and the stdlib `random` module.
seed = 53
torch.manual_seed(seed)
# np.random.default_rng(seed) only *returns* a new Generator; the result was
# discarded, so NumPy's global RNG stayed unseeded. np.random.seed seeds it.
np.random.seed(seed)
pyrandom.seed(seed)

# Set hyperparameters for training
TRAIN_BATCH_SIZE = 512
EVAL_BATCH_SIZE = 512
train_args = T4RecTrainingArguments(data_loader_engine='merlin',
                                    per_device_train_batch_size = TRAIN_BATCH_SIZE,
                                    per_device_eval_batch_size = EVAL_BATCH_SIZE,
                                    output_dir = model_path,
                                    save_total_limit=5,
                                    logging_steps=100,
                                    report_to = [],
                                    num_train_epochs=2,
                                    max_sequence_length=MAX_SEQUENCE_LENGTH,
                                    no_cuda=False,
                                    seed=seed,
                                    #save_strategy="steps",
                                    # Evaluate every 250 steps, checkpoint every 500 steps.
                                    evaluation_strategy="steps",
                                    eval_steps=250,
                                    save_steps=500,
                                    # Best model is selected on the "/loss" metric (lower is better).
                                    load_best_model_at_end=True,
                                    metric_for_best_model="/loss",
                                    greater_is_better=False,
                                    # Optimizer - Adapted from Keras Adam default params
                                    learning_rate=0.001,
                                    lr_scheduler_type='cosine',
                                    learning_rate_num_cosine_cycles_by_epoch=1.5,
                                   )
# Trainer reads the train/validation parquet files located under out_path.
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
    train_dataset_or_path = str(out_path / "train.parquet"),
    eval_dataset_or_path = str(out_path / "val.parquet")
)

PyTorch: setting up devices


In [68]:
# Run the training loop configured by train_args; the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 114176
  Num Epochs = 2
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 446


Step,Training Loss,Validation Loss,/next-item/ndcg At 5,/next-item/ndcg At 10,/next-item/recall At 5,/next-item/recall At 10,/next-item/avg Precision At 5,/next-item/avg Precision At 10,/loss
250,8.8015,No log,0.01953,0.04393,0.040648,0.116575,0.012821,0.022791,9.191959




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=446, training_loss=8.643563309057946, metrics={'train_runtime': 95.0891, 'train_samples_per_second': 0.021, 'train_steps_per_second': 4.69, 'total_flos': 0.0, 'train_loss': 8.643563309057946})

In [45]:
# Point the trainer's evaluation data at the training split and score the model
# on it. The trainer keeps this path for any later evaluate() call.
train_split = out_path / "train.parquet"
trainer.eval_dataset_or_path = str(train_split)
train_metrics = trainer.evaluate()
train_metrics

{'eval_/next-item/ndcg_at_5': 0.7819878458976746,
 'eval_/next-item/ndcg_at_10': 0.7975867390632629,
 'eval_/next-item/recall_at_5': 0.8756811022758484,
 'eval_/next-item/recall_at_10': 0.9234866499900818,
 'eval_/next-item/avg_precision_at_5': 0.750555694103241,
 'eval_/next-item/avg_precision_at_10': 0.7570745944976807,
 'eval_/loss': 1.5988435745239258}