# Preparation stuff

## Connect to Drive

In [1]:
# Set to True when running on Google Colab: this flag gates the Drive mount,
# the pip installs and the Drive-based project folder in the cells below.
connect_to_drive = False

In [2]:
# Mount Google Drive at /content/gdrive (only when connect_to_drive is True).
# Running this cell asks for authorization through a pop-up in another window.
if connect_to_drive:
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)

## Install packages

In [3]:
if connect_to_drive:
    #Install FS code
    !pip install  --upgrade --force-reinstall git+https://github.com/federicosiciliano/easy_lightning.git

    !pip install pytorch_lightning

## IMPORTS

In [None]:
#Put all imports here
import numpy as np
from copy import deepcopy
import os
import sys

## Define paths

In [5]:
# All locations below are derived from a single project root.
if connect_to_drive:
    # Colab: the root lives on the mounted Drive (Shared Drive by default)
    project_folder = "/content/gdrive/Shareddrives/<SharedDriveName>" #Name of SharedDrive folder
    #project_folder = "/content/gdrive/MyDrive/<MyDriveName>" #Name of MyDrive folder
else:
    # Local run: the root is one level above the working directory
    project_folder = "../"

# Hyperparameter configurations
cfg_folder = os.path.join(project_folder, "cfg")

# Data: raw inputs and preprocessed artifacts
data_folder = os.path.join(project_folder, "data")
raw_data_folder = os.path.join(project_folder, "data", "raw")
processed_data_folder = os.path.join(project_folder, "data", "processed")

# All the (essential) source code
source_folder = os.path.join(project_folder, "src")

# Every output goes here: models, results, plots, etc.
out_folder = os.path.join(project_folder, "out")
img_folder = os.path.join(project_folder, "out", "img")

## Import own code

In [6]:
# Make project code importable as `from src import ...`:

# put the project folder (the parent of src/, not src/ itself) at the front of sys.path
sys.path.insert(0, project_folder)

# import the project-specific module from the src directory
# from src import ??? as additional_module
import easy_rec as additional_module # fallback alias: remove this line when importing your own additional module

# easy_rec is imported again here under its own name; both names are used below
import easy_exp, easy_rec, easy_torch #easy_data

# MAIN

## Train

### Data

In [7]:
# Load the experiment configuration named "config_rec".
# NOTE(review): cfg_folder is defined in the paths cell but is not passed here, so the
# loader presumably uses its own default config location — confirm this still resolves
# correctly when connect_to_drive is True.
cfg = easy_exp.cfg.load_configuration("config_rec")

In [None]:
# Work on a copy so the override below does not modify cfg itself
# (cfg is later used to identify and save the experiment)
data_params = deepcopy(cfg["data_params"])
# Read the raw data from the folder defined in the paths cell
data_params["data_folder"] = raw_data_folder

# Returns the data plus the id mappings; maps["sid"] and maps["uid"] are read later
data, maps = easy_rec.data_generation_utils.preprocess_dataset(**data_params)

Ratings data already exists. Skip pre-processing
Filtering by minimum number of users per item: 5
Filtering by minimum number of items per user: 5
Densifying index
Splitting: leave_n_out


In [None]:
# #Save user and item mappings
# # NOTE: before uncommenting, add `import csv` to the imports cell (csv is not imported there);
# # the csv module documentation also recommends opening writer files with newline="".
# with open(os.path.join(processed_data_folder,"user_map.csv"), "w") as f_user:
#     w = csv.writer(f_user)
#     w.writerows(maps['uid'].items())

# with open(os.path.join(processed_data_folder,"item_map.csv"), "w") as f_item:
#     w = csv.writer(f_item)
#     w.writerows(maps['sid'].items())

In [10]:
# Build the datasets from the preprocessed data using the "dataset_params" section;
# the result is handed to prepare_rec_data_loaders further down
datasets = easy_rec.rec_torch.prepare_rec_datasets(data,**data_params["dataset_params"])

In [None]:
# Copy the collator settings so the override below leaves cfg untouched
collator_params = deepcopy(cfg["data_params"]["collator_params"])
# The largest item id in the mapping is used as the item count.
# NOTE(review): this equals the number of items only if ids are dense and start at 1 —
# confirm against preprocess_dataset.
collator_params["num_items"] = np.max(list(maps["sid"].values()))

In [12]:
# app = collator_params.get("negatives_distribution",None)
# if app is not None:
#     if app == "popularity":
#         collator_params["negatives_distribution"] = easy_rec.data_generation_utils.get_popularity_items(datasets["train"], collator_params["num_items"])
#     elif app not in ["uniform","dynamic"]:
#         raise ValueError("Invalid negatives distribution")

In [None]:
# Build the collators from collator_params; they are passed as collate_fn to the data loaders
collators = easy_rec.rec_torch.prepare_rec_collators(**collator_params)

In [None]:
# Copy the loader settings from the "model" section of cfg
loader_params = deepcopy(cfg["model"]["loader_params"])
# Create the loaders from the datasets; the collators built earlier are supplied as collate_fn
loaders = easy_rec.rec_torch.prepare_rec_data_loaders(datasets, **loader_params, collate_fn=collators)

In [None]:
# Copy the network settings, then fill in the values that depend on the data
rec_model_params = deepcopy(cfg["model"]["rec_model"])
# Largest item / user id found in the mappings (same computation as collator_params["num_items"])
rec_model_params["num_items"] = np.max(list(maps["sid"].values()))
rec_model_params["num_users"] = np.max(list(maps["uid"].values()))
# Take lookback from the collator settings instead of duplicating it in the model config
rec_model_params["lookback"] = data_params["collator_params"]["lookback"]

In [16]:
# Instantiate the recommender network described by rec_model_params
# (the recorded training summary below lists it as GRU4Rec for this configuration)
main_module = easy_rec.rec_torch.create_rec_model(**rec_model_params)#, graph=easy_rec.data_generation_utils.get_graph_representation(data["train_sid"]))

Seed set to 42


In [17]:
# Look up the experiment id for this configuration; exp_found tells whether it already existed
# (presumably a new id is assigned when it did not — confirm in easy_exp)
exp_found, experiment_id = easy_exp.exp.get_set_experiment_id(cfg)
print("Experiment already found:", exp_found, "----> The experiment id is:", experiment_id)

Experiment already found: True ----> The experiment id is: VBdAdVbFguHeB1Ic


In [None]:
# # Find "original" implementation:
# # ...

# keys_to_change = {"model.rec_model.seed": 42}
# orig_cfg = deepcopy(cfg)
# for k,v in keys_to_change.items():
#     orig_cfg[k] = v  # use the value from keys_to_change (was a hard-coded 42)

# orig_exp_found, orig_experiment_id = easy_exp.exp.get_experiment_id(orig_cfg)
# print("Experiment already found:", orig_exp_found, "----> The experiment id is:", orig_experiment_id)

In [19]:
# TODO: make the notebook/script stop here if the experiment is already found.
# Until this is implemented, the cells below re-run training even when exp_found is True.
#if exp_found: exit()

In [None]:
# Copy the whole "model" section so the replacements below do not modify cfg
model_params = deepcopy(cfg["model"])

# Attach the experiment id to the trainer settings
trainer_params = easy_torch.preparation.prepare_experiment_id(model_params["trainer_params"], experiment_id)

#dynamic_negatives_index = [i for i, x in enumerate(trainer_params["callbacks"]) if "DynamicNegatives" in x][0]
#trainer_params["callbacks"][dynamic_negatives_index]["DynamicNegatives"]["dataloader"] = loaders["train"]

# Replace the callbacks and logger entries with the prepared objects;
# additional_module.callbacks is passed along as an extra source of callbacks
trainer_params["callbacks"] = easy_torch.preparation.prepare_callbacks(trainer_params, additional_module.callbacks)
trainer_params["logger"] = easy_torch.preparation.prepare_logger(trainer_params)

# Build the trainer from the prepared trainer_params
trainer = easy_torch.preparation.prepare_trainer(**trainer_params)

# Replace the loss entry with the prepared loss (additional_module.losses is passed along)
model_params["loss"] = easy_torch.preparation.prepare_loss(model_params["loss"], additional_module.losses)

# Replace the optimizer entry with the prepared optimizer
model_params["optimizer"] = easy_torch.preparation.prepare_optimizer(**model_params["optimizer"])

# Replace the metrics entry with the prepared metrics (rank-correction variant left disabled below)
# num_negatives = {split_name:[x] for split_name,x in data_params["collator_params"]["num_negatives"].items()}
# num_negatives["val"] += num_negatives["test"] #cause using test as val just to get metrics
# model_params["metrics"] = additional_module.metrics.prepare_rank_corrections(model_params["metrics"], num_negatives = num_negatives, num_items = rec_model_params["num_items"])
model_params["metrics"] = easy_torch.preparation.prepare_metrics(model_params["metrics"], additional_module.metrics)

# Create the trainable model around main_module; every key of model_params
# (not only loss, optimizer and metrics) is forwarded as a keyword argument
model = easy_torch.process.create_model(main_module, **model_params)

Seed set to 42
Seed set to 42
Seed set to 42
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42
Seed set to 42


In [21]:
# Prepare the emission tracker using configuration from cfg
#tracker = easy_torch.preparation.prepare_emission_tracker(**cfg["model"]["emission_tracker"], experiment_id=experiment_id)

In [22]:
# Prepare the flops profiler using configuration from cfg
#profiler = easy_torch.preparation.prepare_flops_profiler(model=model, **cfg["model"]["flops_profiler"], experiment_id=experiment_id)

### Train

In [23]:
#easy_torch.process.test_model(trainer, model, loaders, test_key=["train","val","test"]) #, tracker=tracker, profiler=profiler)

In [24]:
# Train the model using the prepared trainer, model, and data loaders.
# NOTE(review): val_key lists both "val" and "test", so the test loader is also used as a
# validation set during training. If any callback (e.g. checkpointing or early stopping)
# monitors a metric computed on "test", the reported test results are optimistically
# biased — confirm which metric is monitored.
easy_torch.process.train_model(trainer, model, loaders, val_key=["val","test"]) #tracker=tracker, profiler=profiler

Seed set to 42
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/federicosiciliano/Desktop/Py_utils/easy_lightning_development/easy_rec/out/models/prova/VBdAdVbFguHeB1Ic exists and is not empty.

  | Name        | Type                        | Params | Mode 
--------------------------------------------------------------------
0 | main_module | GRU4Rec                     | 249 K  | train
1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
2 | metrics     | RobustModuleDict            | 0      | train
--------------------------------------------------------------------
249 K     Trainable params
0         Non-trainable params
249 K     Total params
0.998     Total estimated model params size (MB)
39        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (8) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 19: 100%|██████████| 8/8 [00:01<00:00,  5.26it/s, v_num=2]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:01<00:00,  5.24it/s, v_num=2]


In [25]:
# Evaluate the trained model with the same trainer and loaders
# NOTE(review): not visible from here whether the last or the best checkpoint weights are used — confirm
easy_torch.process.test_model(trainer, model, loaders) #, tracker=tracker, profiler=profiler)

Seed set to 42
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 8/8 [00:00<00:00, 14.63it/s]


In [None]:
# Record the finished run through easy_exp (save_experiment receives the full cfg)
easy_exp.exp.save_experiment(cfg)

# Completion banner: message, separator rule and one trailing blank line
print(
    "Execution completed.",
    "######################################################################",
    "",
    sep="\n",
)

Execution completed.
######################################################################

