In [2]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
import numpy as np
import random
from tqdm import tqdm
from deepmatch.models import ComiRec, NCF
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
import tensorflow as tf
import gc

from deepctr.feature_column import DenseFeat

pad_sequences = tf.keras.utils.pad_sequences
SEQ_LEN = 50

2024-03-29 23:13:15.781153: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-29 23:13:15.844947: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-29 23:13:16.143508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-29 23:13:16.143538: W tensorflow/compiler/xl

In [3]:
def get_train_input_for_user(user_df):
    #user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    tmp = [user_df[:x+1] for x in range(len(user_df))]
    input_data = []
    for record in tmp:
        sample = {
            "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
            "history_genre_id": np.array(record["genres"].tolist()[:-1]),
            "movie_id":np.array(record["movie_id"].tolist()[-1]),
            # np.array(record["genres"].tolist()[:-1]) # history genre list
        }
        input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    result_df[["label", "hist_len"]] = result_df[["label", "hist_len"]].astype("int8")
    return result_df

def get_test_input_for_user(user_df):
    #user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    
    input_data = []
    record = user_df
    sample = {
        "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
        "history_genre_id": np.array(record["genres"].tolist()[:-1]),
        "movie_id":np.array(record["movie_id"].tolist()[-1]),
        # np.array(record["genres"].tolist()[:-1]) # history genre list
    }
    input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    return result_df

In [4]:
df = pd.read_csv("./datasets/ml-25m/ratings.csv")

In [5]:
movies = pd.read_csv("./datasets/ml-25m/movies.csv")

In [6]:
tqdm.pandas()

In [7]:
data = pd.merge(df,movies)

In [8]:
data["genres"] = data["genres"].apply(lambda x:x.split("|")[0])

In [9]:
data.columns = ["user_id", "movie_id", "rating", "timestamp", "title", "genres"]

In [10]:
lbe = LabelEncoder()
data["genres"] = lbe.fit_transform(data["genres"]) + 1

In [11]:
data = data.sort_values("timestamp")

In [12]:
del data["timestamp"]

In [13]:
del df, movies

In [14]:
del data["title"]

In [15]:
gc.collect()

0

In [16]:
user_id_size =  data['user_id'].max()+1
movie_id_size = data['movie_id'].max()+1
genre_id_size = int(data['genres'].max())+1
embedding_dim = 32

In [17]:
user_feature_columns = [
                        # SparseFeat('user_id', user_id_size, 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', movie_id_size, embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]
item_feature_columns = [SparseFeat('movie_id', movie_id_size, embedding_dim),]

In [18]:
#tmp_df = data[:1000000].copy()

In [19]:
train_set = data.groupby("user_id").progress_apply(lambda x:get_train_input_for_user(x[:80]))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162541/162541 [08:17<00:00, 326.53it/s]


In [20]:
test_set = data.groupby("user_id").progress_apply(lambda x:get_test_input_for_user(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162541/162541 [00:51<00:00, 3164.51it/s]


In [21]:
test_set["history_movie_id"] = pad_sequences(test_set["history_movie_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()
test_set["history_genre_id"] = pad_sequences(test_set["history_genre_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()

test_input = test_set.sample(frac=1)

test_model_input = {
    "hist_movie_id":np.array(test_input["history_movie_id"].tolist()),
    "hist_genres":np.array(test_input["history_genre_id"].tolist()),
    "movie_id": np.array(test_input["movie_id"].tolist()),
    "hist_len": np.array(test_input["hist_len"].tolist()),
}
test_label =  test_input["label"].tolist()

train_set["history_movie_id"] = pad_sequences(train_set["history_movie_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()
train_set["history_genre_id"] = pad_sequences(train_set["history_genre_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()

train_input = train_set.sample(frac=1)

train_model_input = {
    "hist_movie_id":np.array(train_input["history_movie_id"].tolist()),
    "hist_genres":np.array(train_input["history_genre_id"].tolist()),
    "movie_id": np.array(train_input["movie_id"].tolist()),
    "hist_len": np.array(train_input["hist_len"].tolist()),
}
train_label =  train_input["label"].tolist()





In [22]:
import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

In [23]:
from collections import Counter
train_counter = Counter(data['movie_id'])
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)] #todo change this later
sampler_config = NegativeSampler('frequency',num_sampled=255,item_name="movie_id",item_count=item_count)
#model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)
model = ComiRec(user_feature_columns,
                item_feature_columns,
                k_max=2, 
                user_dnn_hidden_units=(128,64, embedding_dim),
                sampler_config=sampler_config)


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2024-03-29 23:26:33.600176: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-03-29 23:26:33.600191: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: pop-os
2024-03-29 23:26:33.600193: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: pop-os
2024-03-29 23:26:33.600270: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 525.89.2
2024-03-29 23:26:33.600277: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 525.89.2
2024-03-29 23:26:33.600278: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 525.89.2


In [None]:
model.compile(optimizer="adam", loss=sampledsoftmaxloss)
history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, use_multiprocessing=True)

Train on 9617397 samples
Epoch 1/20


2024-03-29 23:26:37.242782: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-29 23:26:37.269088: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
2024-03-29 23:26:37.332414: W tensorflow/c/c_api.cc:291] Operation '{name:'training/Adam/user_dnn/bias0/v/Assign' id:1298 op device:{requested: '', assigned: ''} def:{{{node training/Adam/user_dnn/bias0/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training/Adam/user_dnn/bias0/v, training/Adam/user_dnn/bias0/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an er

Epoch 13/20
Epoch 14/20
Epoch 15/20

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 18/20
Epoch 19/20

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 109056/9617397 [..............................] - ETA: 51:35 - loss: 3.7958

In [None]:
movies[movies["title"].apply(lambda x:"The Shining" in x)]