# Preparation stuff

## Connect to Drive

In [2]:
#Toggle for Google Colab runs: when True, later cells mount Google Drive, install the packages
#and switch project_folder to the Drive location.
connect_to_drive = False

In [3]:
#Mount Google Drive (Colab only): run the cell and authorize through the popup that opens in another window
if connect_to_drive:
    #imported here because google.colab is only needed (and only used) when the flag is set
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)

## Install packages

In [4]:
if connect_to_drive:
    #Install FS code
    #!pip install  --upgrade --no-deps --force-reinstall git+https://github.com/federicosiciliano/easy_lightning.git@fedsic
    !pip install  --upgrade --no-deps --force-reinstall git+https://github.com/PokeResearchLab/easy_lightning.git

    !pip install pytorch_lightning

## IMPORTS

In [5]:
#Put all imports here
import numpy as np
import matplotlib.pyplot as plt
#from copy import deepcopy
#import pickle
import os
import sys
#import cv2
import torch
import csv

## Define paths

In [6]:
#Every path below is derived from the project folder.
if connect_to_drive:
    project_folder = "/content/gdrive/Shareddrives/<SharedDriveName>" #Name of SharedDrive folder
    #project_folder = "/content/gdrive/MyDrive/<MyDriveName>" #Name of MyDrive folder
else:
    project_folder = "../"

#cfg: hyperparameter configurations
cfg_folder = os.path.join(project_folder, "cfg")

#src: all the (essential) source code
source_folder = os.path.join(project_folder, "src")

#data: raw and preprocessed data, each in its own sub-folder
data_folder = os.path.join(project_folder, "data")
raw_data_folder, processed_data_folder = (
    os.path.join(data_folder, sub_folder) for sub_folder in ("raw", "processed")
)

#out: every output (models, results, plots, etc.); images go to out/img
out_folder = os.path.join(project_folder, "out")
img_folder = os.path.join(out_folder, "img")

## Import own code

In [7]:
#To import from src:

#put the project folder (the parent of src) at the start of sys.path so that `src` is importable as a package
sys.path.insert(0, project_folder)

#import from src directory
#NOTE(review): the star import hides where names come from; presumably create_utility_matrix and
#create_embedding_matrix (used in the SVD cell) are defined in src.module -- consider importing them explicitly
from src.module import *

import easy_exp, easy_rec, easy_torch #easy_data

[2024-06-03 10:09:02,895] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to mps (auto detect)




# MAIN

## Train

### Data

In [502]:
#Load the experiment configuration named "config_rec"
cfg = easy_exp.cfg.load_configuration("config_rec")

In [503]:
#Point the data parameters at the raw data folder defined in the paths cell
cfg["data_params"]["data_folder"] = raw_data_folder

In [504]:
#cfg["data_params"]["test_sizes"] = [cfg["data_params.dataset_params.out_seq_len.val"],cfg["data_params.dataset_params.out_seq_len.test"]]

#Preprocess the dataset with every entry of cfg["data_params"] passed as a keyword argument.
#Returns the data and `maps`; the cells below read maps["uid"] and maps["sid"] as dict-like mappings.
data, maps = easy_rec.data_generation_utils.preprocess_dataset(**cfg["data_params"])

True True
../data\raw\ml-1m ml-1m
Ratings data already exists. Skip pre-processing
Filtering by minimum number of users per item: 5
Filtering by minimum number of items per user: 5
Densifying index
Splitting: leave_n_out


In [505]:
#Save user and item mappings as CSV files (one row per key/value pair of each mapping)

#Make sure the target folder exists; open(..., "w") raises FileNotFoundError otherwise
os.makedirs(processed_data_folder, exist_ok=True)

for map_filename, map_key in (("user_map.csv", "uid"), ("item_map.csv", "sid")):
    #newline="" is required by the csv module: without it the "\r\n" row terminator written by
    #csv.writer is translated again on Windows and every row is followed by a blank line
    with open(os.path.join(processed_data_folder, map_filename), "w", newline="") as map_file:
        csv.writer(map_file).writerows(maps[map_key].items())

In [506]:
#Build the datasets from the preprocessed data, using the dataset parameters from the configuration
datasets = easy_rec.rec_torch.prepare_rec_datasets(data,**cfg["data_params"]["dataset_params"])

In [507]:
#Tell the collators how many items exist: the largest value stored in the item mapping
cfg["data_params"]["collator_params"]["num_items"] = np.max(list(maps["sid"].values()))

In [None]:
#Build the collators from the data, using the collator parameters (including num_items set above)
collators = easy_rec.rec_torch.prepare_rec_collators(data, **cfg["data_params"]["collator_params"])

In [508]:
#Build the data loaders from the datasets; loader parameters come from cfg["model"] and the collators are passed as collate_fn.
#`loaders` is indexed by split name below (e.g. loaders['train']).
loaders = easy_rec.rec_torch.prepare_rec_data_loaders(datasets, **cfg["model"]["loader_params"], collate_fn=collators)

In [509]:
# Debug snippet (disabled): fetch the first training batch and print the first entry of a few of its fields.
# for x in loaders["train"]:
#      break
# print(x['uid'][0])
# print(x['in_sid'][0])
# print(x['in_rating'][0])

### MODEL 

In [513]:
#Complete the model configuration: sizes come from the id mappings, lookback is shared with the collators
rec_model_cfg = cfg["model"]["rec_model"]

item_ids = list(maps["sid"].values())
user_ids = list(maps["uid"].values())

rec_model_cfg["num_items"] = np.max(item_ids)
rec_model_cfg["num_users"] = np.max(user_ids)
rec_model_cfg["lookback"] = cfg["data_params"]["collator_params"]["lookback"]

In [514]:
#Create the recommender module from the parameters in cfg["model"]["rec_model"]
#(with the current configuration this is a SASRec module, see the printed architecture below)
main_module = easy_rec.rec_torch.create_rec_model(**cfg["model"]["rec_model"])
#print(main_module)

Seed set to 42


SASRec(
  (item_emb): Embedding(3417, 64, padding_idx=0)
  (pos_emb): Embedding(200, 64)
  (dropout): Dropout(p=0.2, inplace=False)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (activation): GELU(approximate='none')
      )
    )
  )
  (last_layernorm): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
)




In [515]:
#Optionally initialize the item embedding layer with the SVD right matrix
#(if freeze_emb=True the matrix weights will remain fixed)

useSVD = cfg["model"]["useSVD"]
freeze_emb = cfg["model"]["freeze_emb"]

if useSVD:
    num_users = cfg["model"]["rec_model"]["num_users"]
    num_items = cfg["model"]["rec_model"]["num_items"]
    emb_size = cfg["model"]["rec_model"]["emb_size"]

    #Build the utility matrix from the training loader, then reduce it to emb_size dimensions
    utility_matrix = create_utility_matrix(loaders['train'], num_users, num_items)
    embedding_matrix = create_embedding_matrix(utility_matrix, emb_size)

    new_emb_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    #Fail loudly on a shape mismatch: assigning through `.data` would silently replace the weight
    #with a tensor of any shape and only break (or misbehave) later, during the forward pass
    expected_shape = tuple(main_module.item_emb.weight.shape)
    if tuple(new_emb_matrix.shape) != expected_shape:
        raise ValueError(
            f"SVD embedding matrix has shape {tuple(new_emb_matrix.shape)}, "
            f"but item_emb.weight expects {expected_shape}"
        )

    #Initialize the item embedding matrix in place: the parameter object, its device and its dtype
    #are preserved, and no_grad keeps the copy out of the autograd graph
    #NOTE(review): every row is overwritten, including the padding row (padding_idx=0) -- confirm this is intended
    with torch.no_grad():
        main_module.item_emb.weight.copy_(new_emb_matrix)

    if freeze_emb:
        for param in main_module.item_emb.parameters():
            param.requires_grad = False

In [516]:
#Look up the experiment for this configuration: returns whether it was already found and its id
exp_found, experiment_id = easy_exp.exp.get_set_experiment_id(cfg)
print("Experiment already found:", exp_found, "----> The experiment id is:", experiment_id)

Experiment already found: False ----> The experiment id is: nDqVZlUvpTLSJ723


In [517]:
#if exp_found: exit() #TODO: make the notebook/script stop here if the experiment is already found

In [518]:
# Attach the experiment id to the trainer parameters from the configuration
# NOTE(review): every helper in this cell appears to reseed the RNG ("Seed set to 42" is logged repeatedly),
# so keep the statement order unchanged unless that is confirmed to be harmless
trainer_params = easy_torch.preparation.prepare_experiment_id(cfg["model"]["trainer_params"], experiment_id)

# Prepare callbacks and logger using the prepared trainer_params
trainer_params["callbacks"] = easy_torch.preparation.prepare_callbacks(trainer_params)
trainer_params["logger"] = easy_torch.preparation.prepare_logger(trainer_params)

# Prepare the trainer using the prepared trainer_params
trainer = easy_torch.preparation.prepare_trainer(**trainer_params)

# Work on a copy so that the loss/optimizer/metrics entries can be replaced by the prepared objects
# (presumably a shallow copy that leaves cfg["model"] itself untouched -- TODO confirm)
model_params = cfg["model"].copy()

# Prepare the loss from its configuration; easy_rec.losses is passed along as an additional argument
model_params["loss"] = easy_torch.preparation.prepare_loss(cfg["model"]["loss"], easy_rec.losses)

# Prepare the optimizer using configuration from cfg
model_params["optimizer"] = easy_torch.preparation.prepare_optimizer(**cfg["model"]["optimizer"])

# Prepare the metrics using configuration from cfg
model_params["metrics"] = easy_torch.preparation.prepare_metrics(cfg["model"]["metrics"], easy_rec.metrics)

# Create the model from main_module; every entry of model_params (not only loss, optimizer and metrics)
# is passed as a keyword argument
model = easy_torch.process.create_model(main_module, **model_params)

Seed set to 42
Seed set to 42
Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42


In [519]:
# Prepare the emission tracker using configuration from cfg, tagged with the current experiment id
tracker = easy_torch.preparation.prepare_emission_tracker(**cfg["model"]["emission_tracker"], experiment_id=experiment_id)

### Train

In [520]:
# Train the model using the prepared trainer, model, and data loaders
# val_key lists two loader keys; presumably both "val" and "test" are evaluated during validation -- TODO confirm
easy_torch.process.train_model(trainer, model, loaders, tracker=tracker, val_key=["val","test"])

Seed set to 42
Missing logger folder: ../out/log/prova/nDqVZlUvpTLSJ723/lightning_logs

  | Name        | Type                        | Params
------------------------------------------------------------
0 | main_module | SASRec                      | 331 K 
1 | loss        | SequentialBCEWithLogitsLoss | 0     
2 | metrics     | ModuleDict                  | 0     
------------------------------------------------------------
112 K     Trainable params
218 K     Non-trainable params
331 K     Total params
1.326     Total estimated model params size (MB)


Epoch 9: 100%|██████████| 68/68 [03:03<00:00,  0.37it/s, v_num=0]          

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 68/68 [03:03<00:00,  0.37it/s, v_num=0]


  df = pd.concat([df, pd.DataFrame.from_records([dict(data.values)])])


### TEST

In [521]:
# Evaluate the trained model with the same trainer, loaders and emission tracker; the metric table is printed below
easy_torch.process.test_model(trainer, model, loaders, tracker=tracker)

Seed set to 42


Testing DataLoader 0: 100%|██████████| 48/48 [00:36<00:00,  1.30it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_F1_@10          0.18078884482383728
       test_F1_@20           0.09476338326931
       test_F1_@5           0.3313210904598236
      test_MAP_@10          0.01253512967377901
      test_MAP_@20         0.011880980804562569
       test_MAP_@5         0.013013244606554508
      test_MRR_@10          0.03684346750378609
      test_MRR_@20          0.04383499547839165
       test_MRR_@5          0.0298730731010437
      test_NDCG_@10        0.055362313985824585
      test_NDCG_@20         0.08169471472501755
      test_NDCG_@5          0.03814282268285751
   test_Precision_@10      0.011771521531045437
   test_Precision_@20   

In [522]:
# Save the experiment (the configuration is passed to save_experiment; nothing is printed here)
#save_experiment_and_print_config(cfg)
easy_exp.exp.save_experiment(cfg)

# Print completion message
print("Execution completed.")
print("######################################################################")
print()

Execution completed.
######################################################################

