In [1]:
import os
import sys

# Restrict CUDA to a single device. This is set before torch is imported
# (next cell) so that the process only ever sees this GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# Drop a site-packages directory that must not be on the import path.
# Guarded so the cell can be re-run (or run on a machine where the entry
# is absent) without `list.remove` raising ValueError.
stale_site_packages = '/home/jovyan/.imgenv-vasilyev-0/lib/python3.7/site-packages'
if stale_site_packages in sys.path:
    sys.path.remove(stale_site_packages)

# Make the project repositories importable; skip entries that are already
# present so re-running the cell does not pile up duplicates.
for repo_path in (
    '/home/jovyan/klenitskiy/repos/seqrec-experiments/',
    '/home/jovyan/klenitskiy/repos/seqrec-datasets',
):
    if repo_path not in sys.path:
        sys.path.append(repo_path)

In [2]:
import json

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ModelSummary, RichProgressBar
from replay.splitters import ColdUserRandomSplitter, NewUsersSplitter
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.utils.data import DataLoader
from transformers import GPT2Config, GPT2Model

from seqrec_experiments.metrics import Evaluator
from seqrec_experiments.lightning.datasets import CausalLMDataset, CausalLMPredictionDataset, PaddingCollateFn
from seqrec_experiments.lightning.models import GPT4Rec
from seqrec_experiments.lightning.modules import SeqRec
from seqrec_experiments.postprocess import preds2recs
from seqrec_experiments.utils import extract_validation_history

from preprocessing.preparation import get_last_item, remove_last_item


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [3]:
# Raw ratings CSV loaded with pd.read_csv in the "Prepare data" section
# (absolute, machine-specific path).
DATA_PATH = '/home/jovyan/klenitskiy/data/ml-20m/ratings.csv'

# Relative output directories; in this notebook they are only referenced by
# the commented-out save cells.
DATA_SAVE_PATH = 'data/ml-20m'
MODEL_SAVE_PATH = 'models/ml-20m'

## Prepare data

In [94]:
# Load the ratings, rename the four columns positionally to the names used
# throughout the notebook, and order every user's interactions by time.
df = (
    pd.read_csv(DATA_PATH)
    .set_axis(['user_id', 'item_id', 'rating', 'timestamp'], axis=1)
    .sort_values(['user_id', 'timestamp'])
)

# Quick sanity check: table size and number of distinct users / items.
print(df.shape)
print(df['user_id'].nunique(), df['item_id'].nunique())
df.head()

(20000263, 4)
138493 26744


Unnamed: 0,user_id,item_id,rating,timestamp
20,1,924,3.5,1094785598
19,1,919,3.5,1094785621
86,1,2683,3.5,1094785650
61,1,1584,3.5,1094785656
23,1,1079,4.0,1094785665


In [45]:
# encoder = LabelEncoder()
# df['item_id'] = encoder.fit_transform(df['item_id'])
# mapping = dict(zip(encoder.classes_.tolist(), encoder.transform(encoder.classes_).tolist()))

In [95]:
# with open(os.path.join(DATA_SAVE_PATH, 'item_mapping.json'), 'w') as file_:
#     json.dump(mapping, file_)

In [96]:
# Test split: 10% of the data goes to `test` via NewUsersSplitter
# (NOTE(review): presumably the users who appear latest - the printed
# timestamp ranges below are consistent with that; confirm in replay docs).
test_splitter = NewUsersSplitter(
    test_size=0.1,
    drop_cold_items=True,
    query_column="user_id",
)
train, test = test_splitter.split(df)

# Keep only users that have more than one interaction left in train.
user_counts = train['user_id'].value_counts()
user_ids = user_counts.index[user_counts.values > 1]
train = train.loc[train['user_id'].isin(user_ids)]

# Validation split: a seeded random user-level hold-out of the remaining
# train data (test_size=0.4).
validation_splitter = ColdUserRandomSplitter(
    test_size=0.4,
    drop_cold_items=True,
    query_column="user_id",
    seed=42,
)
train, validation = validation_splitter.split(train)

In [97]:
# One line per split (train, validation, test): shape, #users, #items.
for part in (train, validation, test):
    print(part.shape, part.user_id.nunique(), part.item_id.nunique())

(10753324, 4) 74777 17376
(7177832, 4) 49852 16084
(1482915, 4) 13850 13999


In [98]:
# One line per split (train, validation, test): earliest and latest
# interaction time, with the epoch-second timestamps rendered as datetimes.
for part in (train, validation, test):
    first_ts, last_ts = part.timestamp.min(), part.timestamp.max()
    print(pd.to_datetime(first_ts, unit='s'), pd.to_datetime(last_ts, unit='s'))

1996-01-29 00:00:00 2012-02-25 23:08:52
1995-01-09 11:46:44 2012-02-25 23:02:42
2012-02-25 23:09:42 2015-03-31 06:03:17


In [99]:
# Derive model inputs and evaluation targets from the test split.
# `test_inputs` feeds the prediction dataset and `test_last_item` is passed
# to the evaluator as ground truth later in the notebook.
# NOTE(review): judging by the helper names, each user's final interaction is
# held out as the target and removed from the inputs - confirm in
# preprocessing.preparation.
test_inputs = remove_last_item(test)
test_last_item = get_last_item(test)

In [100]:
# train.to_csv(os.path.join(DATA_SAVE_PATH, 'train_raw.csv'), index=False)
# validation.to_csv(os.path.join(DATA_SAVE_PATH, 'validation_raw.csv'), index=False)
# test.to_csv(os.path.join(DATA_SAVE_PATH, 'test_raw.csv'), index=False)

## Train

In [109]:
# Maximum sequence length passed as `max_length` to every dataset below.
MAX_LENGTH = 128

# Number of validation users sampled for evaluation during training.
# A falsy value (0 / None) disables sampling and uses all validation users.
VALIDATION_SIZE = 10000

# DataLoader settings (BATCH_SIZE for training, TEST_BATCH_SIZE for
# validation and prediction).
BATCH_SIZE = 256
TEST_BATCH_SIZE = 256
NUM_WORKERS = 8

# Configuration dict handed to GPT4Rec.
GPT_CONFIG = {
    # NOTE(review): looks like a placeholder - the model summary shows the
    # item embeddings living in GPT4Rec.embed_layer, not in GPT-2's own
    # token embedding; confirm in GPT4Rec.
    'vocab_size': 2,
    # Derived from MAX_LENGTH so the position-embedding table can never be
    # shorter than the sequences the datasets produce.
    'n_positions': MAX_LENGTH,
    'n_embd': 256,
    'n_layer': 2,
    'n_head': 2,
}

# Keyword arguments forwarded verbatim to pl.Trainer.
TRAINER_PARAMS = {
    'max_epochs': 100,
    'devices': 1,
    'enable_checkpointing': True,
}
LEARNING_RATE = 1e-3
# EarlyStopping patience (validation checks without val_ndcg improvement).
PATIENCE = 5

In [110]:
train_dataset = CausalLMDataset(train, max_length=MAX_LENGTH, time_col='timestamp')

# Evaluate on at most VALIDATION_SIZE validation users to keep the
# per-epoch validation pass cheap.
validation_users = validation.user_id.unique()
if VALIDATION_SIZE and (VALIDATION_SIZE < len(validation_users)):
    # Seeded, local generator: val_ndcg drives early stopping and checkpoint
    # selection, so the sampled users must be the same on every run
    # (Restart & Run All) for the metric to be comparable.
    validation_sampler = np.random.RandomState(42)
    validation_users = validation_sampler.choice(
        validation_users, size=VALIDATION_SIZE, replace=False)
eval_dataset = CausalLMPredictionDataset(validation[validation.user_id.isin(validation_users)],
                                         max_length=MAX_LENGTH, validation_mode=True,
                                         time_col='timestamp')

# Single collate function shared by both loaders.
collate_fn = PaddingCollateFn()

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE,
    shuffle=True, num_workers=NUM_WORKERS,
    collate_fn=collate_fn)
eval_loader = DataLoader(
    eval_dataset, batch_size=TEST_BATCH_SIZE,
    shuffle=False, num_workers=NUM_WORKERS,
    collate_fn=collate_fn)

# Pull one training batch: it is printed here as a shape check and reused in
# the next cell for a smoke-test forward pass.
batch = next(iter(train_loader))
print(batch['input_ids'].shape)

torch.Size([256, 128])


In [111]:
# Item ids are used as-is (the label-encoding cell above is commented out),
# so the vocabulary must cover every id up to the largest one in the data.
vocab_size = 1 + df['item_id'].max()
model = GPT4Rec(GPT_CONFIG, vocab_size, add_head=True, tie_weights=True)

# Smoke test: run the sample training batch through the untrained model and
# show the shape of what comes back.
input_ids, attention_mask = batch['input_ids'], batch['attention_mask']
outputs = model(input_ids, attention_mask)
outputs.shape

torch.Size([256, 128, 131263])

In [None]:
%%time
seqrec_module = SeqRec(model, lr=LEARNING_RATE, predict_top_k=10)

early_stopping = EarlyStopping(monitor="val_ndcg", mode="max", patience=PATIENCE, verbose=False)
model_summary = ModelSummary(max_depth=4)
checkpoint = ModelCheckpoint(save_top_k=1, monitor="val_ndcg", mode="max", save_weights_only=True)
pbar = RichProgressBar()
callbacks=[early_stopping, model_summary, checkpoint, pbar]

trainer = pl.Trainer(callbacks=callbacks, **TRAINER_PARAMS)

trainer.fit(model=seqrec_module,
            train_dataloaders=train_loader,
            val_dataloaders=eval_loader)

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]

   | Name                         | Type       | Params
-------------------------------------------------------------
0  | model                        | GPT4Rec    | 35.2 M
1  | model.embed_layer            | Embedding  | 33.6 M
2  | model.transformer_model      | GPT2Model  | 1.6 M 
3  | model.transformer_model.wte  | Embedding  | 512   
4  | model.transformer_model.wpe  | Embedding  | 32.8 K
5  | model.transformer_model.drop | Dropout    | 0     
6  | model.transformer_model.h    | ModuleList | 1.6 M 
7  | model.transformer_model.h.0  | GPT2Block  | 789 K 
8  | model.transformer_model.h.1  | GPT2Block  | 789 K 
9  | m

Output()

In [None]:
# Read the validation metrics logged during training from the trainer's
# log directory, show the table, then plot val_ndcg per epoch.
log_dir = trainer.logger.experiment.log_dir
history = extract_validation_history(log_dir)
display(history)

val_ndcg_by_epoch = history.set_index('epoch')['val_ndcg']
val_ndcg_by_epoch.plot(figsize=(10,4), title='GPT4Rec')

## Predict

In [None]:
# Restore the weights of the best checkpoint (selected on val_ndcg by the
# ModelCheckpoint callback) before predicting.
seqrec_module.load_state_dict(
    torch.load(checkpoint.best_model_path)['state_dict']
)

# Test users' input histories, batched in a fixed order.
predict_dataset = CausalLMPredictionDataset(
    test_inputs,
    max_length=MAX_LENGTH,
    time_col='timestamp',
)
predict_loader = DataLoader(
    predict_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=PaddingCollateFn(),
)

preds = trainer.predict(model=seqrec_module, dataloaders=predict_loader)

# Convert the raw predictions into a recommendations table.
recs = preds2recs(preds)
print(recs.shape)
recs.head()

In [None]:
%%time
evaluator = Evaluator()
metrics = evaluator.compute_metrics(test_last_item, recs, train)
metrics

## Save

In [None]:
# torch.save(seqrec_module.model, os.path.join(MODEL_SAVE_PATH, 'gpt_64_1_1.pt'))
# torch.save(seqrec_module.model, os.path.join(MODEL_SAVE_PATH, 'gpt_64_2_2.pt'))
# torch.save(seqrec_module.model, os.path.join(MODEL_SAVE_PATH, 'gpt_256_2_2.pt'))

# torch.save(seqrec_module.model, os.path.join(MODEL_SAVE_PATH, 'gpt_64_2_2_raw.pt'))
# torch.save(seqrec_module.model, os.path.join(MODEL_SAVE_PATH, 'gpt_256_2_2_raw.pt'))

In [24]:
# torch.save(seqrec_module.model.state_dict(), os.path.join(MODEL_SAVE_PATH, 'gpt_64_1_1.pt'))