# Preparation stuff

## Connect to Drive

In [1]:
# Set to True when running on Google Colab: it enables mounting Drive,
# installing the packages and switching project_folder to the Drive path.
connect_to_drive = False

In [2]:
# Mount Google Drive at /content/gdrive (only when connect_to_drive is True).
# Running this cell asks for authorization through a pop-up in another window.
if connect_to_drive:
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)

## Install packages

In [3]:
if connect_to_drive:
    #Install FS code
    !pip install  --upgrade --force-reinstall git+https://github.com/federicosiciliano/easy_lightning.git

    !pip install pytorch_lightning

## IMPORTS

In [4]:
#Put all imports here
import numpy as np
from copy import deepcopy
import os
import sys

## Define paths

In [5]:
# Every path is built starting from the project folder.
# On Colab (connect_to_drive=True) the project lives on the mounted Drive,
# otherwise it is the parent directory of the notebook.
#   MyDrive alternative: "/content/gdrive/MyDrive/<MyDriveName>"
project_folder = (
    "/content/gdrive/Shareddrives/<SharedDriveName>"  # name of the SharedDrive folder
    if connect_to_drive
    else "../"
)

# Hyperparameter configurations
cfg_folder = os.path.join(project_folder, "cfg")

# Raw and preprocessed data
data_folder = os.path.join(project_folder, "data")
raw_data_folder, processed_data_folder = (
    os.path.join(data_folder, subfolder) for subfolder in ("raw", "processed")
)

# All the (essential) source code
source_folder = os.path.join(project_folder, "src")

# All outputs: models, results, plots, etc.
out_folder = os.path.join(project_folder, "out")
img_folder = os.path.join(out_folder, "img")

## Import own code

In [6]:
#To import from src:

#put the project folder (the parent of src/) at the start of sys.path,
#so that `from src import ...` can be resolved
sys.path.insert(0, project_folder)

#import from src directory
# from src import ??? as additional_module
import easy_rec as additional_module #REMOVE THIS LINE IF IMPORTING OWN ADDITIONAL MODULE

#easy_rec is imported again here under its own name, next to the other easy_* packages
import easy_exp, easy_rec, easy_torch #easy_data

# MAIN

## Train

### Data

In [7]:
# Load the experiment configuration named "config_rec" through easy_exp
# (presumably looked up in the cfg folder — verify against easy_exp).
cfg = easy_exp.cfg.load_configuration("config_rec")

In [8]:
from ray import tune

In [9]:
def prepare_raytune_config(cfg):
    """Build the Ray Tune search space from the sweep section of ``cfg``.

    Every entry of ``cfg["__exp__"]["__sweep__"]["parameters"]`` that contains
    a ``"tune"`` key is converted by calling the ``tune`` function named
    ``entry["tune"]["name"]`` with ``entry["tune"]["params"]`` as keyword
    arguments. Entries without a ``"tune"`` key are left out.

    Args:
        cfg: configuration holding the ``__exp__ -> __sweep__ -> parameters``
            mapping.

    Returns:
        dict: parameter name -> object returned by the selected ``tune`` function.
    """
    sweep_parameters = cfg["__exp__"]["__sweep__"]["parameters"]
    return {
        name: getattr(tune, spec["tune"]["name"])(**spec["tune"]["params"])
        for name, spec in sweep_parameters.items()
        if "tune" in spec
    }

In [10]:
# Search space built from the sweep parameters of cfg; it is later handed to
# the Tuner as param_space["train_loop_config"].
raytune_cfg = prepare_raytune_config(cfg)

In [11]:
#from ray.train.lightning import RayDDPStrategy, RayLightningEnvironment, RayTrainReportCallback, prepare_trainer

In [12]:
# cfg["model"]["trainer_params"]["strategy"] = RayDDPStrategy()
# #cfg["model"]["trainer_params"]["callbacks"].append(lambda : RayTrainReportCallback())
# cfg["model"]["trainer_params"]["plugins"] = cfg["model"]["trainer_params"].get("plugins", []) + [RayLightningEnvironment()]

In [13]:
def run_config(cfg, if_exp_found=None, raytune=False):
    """Prepare and run a single experiment described by ``cfg``.

    Args:
        cfg: experiment configuration.
        if_exp_found: what to do when the experiment is reported as already
            existing by ``easy_exp.exp.get_set_experiment_id``:
            ``"skip"`` -> return immediately without running anything;
            ``"load"`` -> load the stored model before testing/training;
            anything else -> rerun the experiment completely.
        raytune: forwarded as-is to
            ``easy_torch.preparation.complete_prepare_trainer``.

    Returns:
        None.
    """
    already_run, exp_id = easy_exp.exp.get_set_experiment_id(cfg)

    if already_run and if_exp_found == "skip":
        return

    # The experiment is saved up front: with Tune schedulers, early stopping
    # may prevent anything placed after training from being executed.
    easy_exp.exp.save_experiment(cfg)

    # Data -> dataloaders -> recommender module
    rec_data, rec_maps = easy_rec.preparation.prepare_rec_data(cfg)
    dataloaders = easy_rec.preparation.prepare_rec_dataloaders(cfg, rec_data, rec_maps)
    rec_module = easy_rec.preparation.prepare_rec_model(cfg, rec_maps)

    # Trainer and model wrapper
    trainer = easy_torch.preparation.complete_prepare_trainer(
        cfg,
        exp_id,
        additional_module=easy_rec,
        raytune=raytune,
    )
    model = easy_torch.preparation.complete_prepare_model(
        cfg,
        rec_module,
        additional_module=easy_rec,
    )

    if already_run and if_exp_found == "load":
        easy_torch.process.load_model(trainer, model, exp_id)

    # Evaluate first, then train.
    easy_torch.process.test_model(trainer, model, dataloaders, test_key=["val","test","train"])
    easy_torch.process.train_model(trainer, model, dataloaders, val_key=["val","test"])

    # Nothing is placed after training: early stopping by Tune schedulers may
    # not run anything beyond this point.

In [28]:
def run_raytune_cfg(raytune_cfg, cfg, if_exp_found=None):
    """Run one Ray Tune trial: overlay the sampled values on ``cfg`` and run it.

    Args:
        raytune_cfg: hyper-parameter values sampled for this trial.
        cfg: base configuration; it is deep-copied, so the caller's object is
            never modified.
        if_exp_found: forwarded to ``run_config``.

    Returns:
        None.
    """
    # Private copy, so that every trial starts from the same base configuration.
    trial_cfg = deepcopy(cfg)

    # NOTE(review): the sampled keys look dotted (e.g. "model.optimizer.params.lr");
    # this relies on cfg.update() resolving them into nested entries — confirm.
    trial_cfg.update(raytune_cfg)

    # TODO: save trial_cfg to a file

    run_config(trial_cfg, if_exp_found, raytune=True)

In [15]:
# checkpoint_data = {
#     "epoch": epoch,
#     "net_state_dict": net.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
# }
# with tempfile.TemporaryDirectory() as checkpoint_dir:
#     data_path = Path(checkpoint_dir) / "data.pkl"
#     with open(data_path, "wb") as fp:
#         pickle.dump(checkpoint_data, fp)

#     checkpoint = Checkpoint.from_directory(checkpoint_dir)
#     train.report(
#         {"loss": val_loss / val_steps, "accuracy": correct / total},
#         checkpoint=checkpoint,
#     )

In [16]:
# Upper bound on the training length of a trial, taken from the trainer settings
max_num_epochs = cfg["model"]["trainer_params"]["max_epochs"]

# ASHA stops under-performing trials early.
# NDCG is a higher-is-better metric, so the scheduler has to maximize it;
# with mode="min" the best trials would be the ones stopped first. This also
# matches results.get_best_result(..., mode="max") used on the same metric.
scheduler = tune.schedulers.ASHAScheduler(
    metric="val_NDCG_@10/dataloader_idx_0",
    mode="max",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
)

In [17]:
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig#, RunConfig, CheckpointConfig

# Resources requested for every trial: 2 workers, each with 8 CPUs and 1 GPU
scaling_config = ScalingConfig(
    num_workers=2,
    use_gpu=True,
    resources_per_worker={"CPU": 8, "GPU": 1},
)

# Optional checkpoint settings (currently disabled).
# NOTE: if this is re-enabled, store it under a different name: a variable
# called `run_config` would shadow the run_config() function defined above.
# run_config = RunConfig(
#     checkpoint_config=CheckpointConfig(
#         num_to_keep=2,
#         checkpoint_score_attribute="ptl/val_accuracy",
#         checkpoint_score_order="max",
#     ),
# )


def _train_loop_per_worker(train_loop_config):
    """Entry point executed by each worker: run one trial with the sampled values."""
    return run_raytune_cfg(train_loop_config, cfg)


# TorchTrainer defined without hyper-parameters: the Tuner supplies them
ray_trainer = TorchTrainer(
    _train_loop_per_worker,
    scaling_config=scaling_config,
    # run_config=run_config,
)

In [18]:
# Ask Ray not to switch the working directory to the trial directory
# (presumably so that relative paths such as project_folder = "../" keep
# resolving from the notebook location — confirm). Set before tuner.fit().
os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0" #To avoid changing working directory

In [19]:
# Search settings: number of sampled configurations, scheduler and time budget
tune_config = tune.TuneConfig(
    # metric/mode are not set here because they are given to the scheduler
    # metric="val_NDCG_@10/dataloader_idx_0",
    # mode="max",
    num_samples=100,
    scheduler=scheduler,
    time_budget_s=40,  # seconds; may raise "WARNING Failed to fetch metrics for"
)

# The sampled values reach the training function as its train_loop_config argument
tuner = tune.Tuner(
    ray_trainer,
    param_space={"train_loop_config": raytune_cfg},
    tune_config=tune_config,
)

results = tuner.fit()

0,1
Current time:,2024-12-24 17:35:44
Running for:,00:00:10.86
Memory:,6.6/124.9 GiB

Trial name,# failures,error file
TorchTrainer_18ceb_00000,1,"/tmp/ray/session_2024-12-24_17-35-32_668617_2176674/artifacts/2024-12-24_17-35-33/TorchTrainer_2024-12-24_17-35-32/driver_artifacts/TorchTrainer_18ceb_00000_0_model_optimizer_params_lr=0.0491,model_rec_model_dropout_rate=0.3416,model_rec_model_num_blocks=1_2024-12-24_17-35-33/error.txt"

Trial name,status,loc,...loop_config/model .optimizer.params.lr,...p_config/model.re c_model.dropout_rate,...oop_config/model. rec_model.num_blocks
TorchTrainer_18ceb_00001,PENDING,,0.0242003,0.195515,1
TorchTrainer_18ceb_00002,PENDING,,0.0770719,0.175685,1
TorchTrainer_18ceb_00003,PENDING,,0.0499875,0.418424,1
TorchTrainer_18ceb_00004,PENDING,,0.0686163,0.427993,2
TorchTrainer_18ceb_00005,PENDING,,0.0611673,0.221484,2
TorchTrainer_18ceb_00006,PENDING,,0.095605,0.321202,2
TorchTrainer_18ceb_00007,PENDING,,0.00159042,0.371017,2
TorchTrainer_18ceb_00008,PENDING,,0.00965245,0.22153,2
TorchTrainer_18ceb_00009,PENDING,,0.0865893,0.304719,1
TorchTrainer_18ceb_00010,PENDING,,0.0256837,0.284954,2


[36m(TorchTrainer pid=2178899)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2178899)[0m - (node_id=693b4ef5f32c281865f33712464cc92cfa8273adb9eac8865854f014, ip=192.168.1.94, pid=2179139) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=2178899)[0m - (node_id=693b4ef5f32c281865f33712464cc92cfa8273adb9eac8865854f014, ip=192.168.1.94, pid=2179138) world_rank=1, local_rank=1, node_rank=0
[36m(RayTrainWorker pid=2179139)[0m Setting up process group for: env:// [rank=0, world_size=2]


[36m(RayTrainWorker pid=2179139)[0m Ratings data already exists. Skip pre-processing
[36m(RayTrainWorker pid=2179138)[0m Filtering by minimum number of users per item: 5
[36m(RayTrainWorker pid=2179138)[0m Filtering by minimum number of items per user: 5
[36m(RayTrainWorker pid=2179138)[0m Densifying index
[36m(RayTrainWorker pid=2179138)[0m Splitting: leave_n_out


[36m(RayTrainWorker pid=2179138)[0m [rank: 1] Seed set to 42
2024-12-24 17:35:43,754	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_18ceb_00000
Traceback (most recent call last):
  File "/home/caldia/recsys-svd/rec_svd/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/caldia/recsys-svd/rec_svd/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/caldia/recsys-svd/rec_svd/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/caldia/recsys-svd/rec_svd/lib/python3.12/site-packages/ray/_private/worker.py", line 2755, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
          

In [20]:
# Display the ResultGrid returned by tuner.fit()
results

ResultGrid<[
  Result(
    metrics={'val_loss/dataloader_idx_0': 0.3252166211605072, 'val_Precision_@5/dataloader_idx_0': 0.20000000298023224, 'val_Precision_@10/dataloader_idx_0': 0.10000000149011612, 'val_Precision_@20/dataloader_idx_0': 0.05000000074505806, 'val_Recall_@5/dataloader_idx_0': 1.0, 'val_Recall_@10/dataloader_idx_0': 1.0, 'val_Recall_@20/dataloader_idx_0': 1.0, 'val_F1_@5/dataloader_idx_0': 0.33333075046539307, 'val_F1_@10/dataloader_idx_0': 0.18181663751602173, 'val_F1_@20/dataloader_idx_0': 0.09523725509643555, 'val_MAP_@5/dataloader_idx_0': 0.44430258870124817, 'val_MAP_@10/dataloader_idx_0': 0.2867147624492645, 'val_MAP_@20/dataloader_idx_0': 0.17679595947265625, 'val_NDCG_@5/dataloader_idx_0': 0.9771838784217834, 'val_NDCG_@10/dataloader_idx_0': 0.9771838784217834, 'val_NDCG_@20/dataloader_idx_0': 0.9771838784217834, 'val_MRR_@5/dataloader_idx_0': 0.969089686870575, 'val_MRR_@10/dataloader_idx_0': 0.969089686870575, 'val_MRR_@20/dataloader_idx_0': 0.969089686870575

In [21]:
# Best trial according to validation NDCG@10 on dataloader 0 (higher is better)
results.get_best_result(metric="val_NDCG_@10/dataloader_idx_0", mode="max")

Result(
  metrics={'val_loss/dataloader_idx_0': 0.3252166211605072, 'val_Precision_@5/dataloader_idx_0': 0.20000000298023224, 'val_Precision_@10/dataloader_idx_0': 0.10000000149011612, 'val_Precision_@20/dataloader_idx_0': 0.05000000074505806, 'val_Recall_@5/dataloader_idx_0': 1.0, 'val_Recall_@10/dataloader_idx_0': 1.0, 'val_Recall_@20/dataloader_idx_0': 1.0, 'val_F1_@5/dataloader_idx_0': 0.33333075046539307, 'val_F1_@10/dataloader_idx_0': 0.18181663751602173, 'val_F1_@20/dataloader_idx_0': 0.09523725509643555, 'val_MAP_@5/dataloader_idx_0': 0.44430258870124817, 'val_MAP_@10/dataloader_idx_0': 0.2867147624492645, 'val_MAP_@20/dataloader_idx_0': 0.17679595947265625, 'val_NDCG_@5/dataloader_idx_0': 0.9771838784217834, 'val_NDCG_@10/dataloader_idx_0': 0.9771838784217834, 'val_NDCG_@20/dataloader_idx_0': 0.9771838784217834, 'val_MRR_@5/dataloader_idx_0': 0.969089686870575, 'val_MRR_@10/dataloader_idx_0': 0.969089686870575, 'val_MRR_@20/dataloader_idx_0': 0.969089686870575, 'val_loss/datal

In [22]:
#Problems with parallel execution:
# May generate the same experiment id (very unlikely)
### By default, there are N = 16^62 possible experiment ids
### L = already run experiments
### M = N - L possible experiment ids
### If generating K experiment ids concurrently, probability is sti
# TODO: reuse already-run experiments; when a time budget is used, sleep for the previous run time (for a fair comparison)

[36m(RayTrainWorker pid=2061042)[0m 
Validation DataLoader 1:  91%|█████████▏| 21/23 [00:00<00:00, 71.93it/s][A
Validation DataLoader 1:  96%|█████████▌| 22/23 [00:00<00:00, 72.33it/s][A
Validation DataLoader 1: 100%|██████████| 23/23 [00:00<00:00, 72.72it/s][A
Epoch 22:   0%|          | 0/23 [00:00<?, ?it/s, v_num=0]               [A


[36m(RayTrainWorker pid=2061041)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user/ray_results/TorchTrainer_2024-12-24_14-59-29/TorchTrainer_4c2f9_00000_0_model_optimizer_params_lr=0.0550,model_rec_model_dropout_input=0.2770,model_rec_model_num_layers=1_2024-12-24_14-59-31/checkpoint_000021)
