In [1]:
from typing import List, Dict, Any

from pathlib import Path
import os
import time
import datetime as dt

import pandas as pd
from ray import train, tune

In [2]:
# General config
SEED: int = 57  # shared random seed for the data loader and the models
RAY_RESULTS_PATH: Path = Path.home() / 'ray_results'  # where Ray Tune experiments are stored

# Dataset splits config
SPLITS_FREQ: str = 'W-THU'  # Split weekly
LAST_SPLITS: int = 10  # Use just last 10 splits
SPLITS_NORMALIZE: bool = True

# Training config
SMALL_EXPERIMENT: bool = os.uname().nodename != 'lamarck'  # True on every host except 'lamarck'
MAX_EPOCHS: int = 200
EPOCHS_PER_ITER: int = 5  # epochs trained per Ray Tune step
SAMPLES_PER_SPLIT: int = 100
OPTIM_METRIC: str = 'map@10'

# Search space config
MAX_EMBEDDING_DIM: int = 1024
MAX_BATCH_SIZE: int = 10  # exponent: the trainable uses a batch size of 2**batch_size (2**10 = 1024)
MIN_LR: float = 1e-4
GPUS: int = 16  # each trial on the big machine requests 1/GPUS of a GPU

# Eval config
TOP_K: List[int] = [5, 10]
METRICS: List[str] = ["recall", "ndcg", "precision", "map"]

## Obtain dataset

In [3]:
# Print the working directory: the relative "../data/..." paths used to load the CSVs are resolved against it
!pwd

/home/davo/Documents/GRASIA/recsys24-daos/notebooks


In [4]:
# Load the Decentraland proposals and votes, parsing their timestamp columns as datetimes
dfp = pd.read_csv("../data/decentraland/proposals.csv", parse_dates=['date', 'start', 'end'])
dfv = pd.read_csv("../data/decentraland/votes.csv", parse_dates=['date'])

# DataFrame.info() writes its summary to stdout and returns None, so wrapping it
# in print() only appends a stray "None" line after each summary.
dfp.info()
dfv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1942 entries, 0 to 1941
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      1942 non-null   object        
 1   author  1942 non-null   object        
 2   date    1942 non-null   datetime64[ns]
 3   start   1942 non-null   datetime64[ns]
 4   end     1942 non-null   datetime64[ns]
dtypes: datetime64[ns](3), object(2)
memory usage: 76.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        116560 non-null  object        
 1   proposal  116560 non-null  object        
 2   voter     116560 non-null  object        
 3   date      116560 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 3.6+ MB
None


## Split data

In [5]:
from recsys24_daos.model_selection import time_freq_split_current
from recsys24_daos.datasets import to_microsoft

In [6]:
# Convert the votes into the userID / itemID / timestamp / rating layout
# (see the displayed frame) that the rest of the notebook works with
df = to_microsoft(dfv)
df

Unnamed: 0,userID,itemID,timestamp,rating
0,0xe7af1c70f8f089c4c3bd71999692c6c5a15d9e2a,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 12:28:01,1
1,0xc54a6c3778016b06cbd126ccc3b5bc06c5f666fb,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 02:16:23,1
2,0xd82d005e8f8d5385db40ba23884a5c967bb1e8af,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 00:38:22,1
3,0xf4c64db66ffb301985f5ecd85c8f3f9c02f2659d,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:47:08,1
4,0xd5e9ef1cedad0d135d543d286a2c190b16cbb89e,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:32:15,1
...,...,...,...,...
116555,0x1156bf625b37623a86d004e66e0a01ec4b17e051,d083109e-4819-54b9-a01c-67bd5a770f65,2022-09-06 18:47:54,1
116556,0xbd03add5da0e173c67c9c1073ffba017147c42d4,d083109e-4819-54b9-a01c-67bd5a770f65,2022-09-06 18:44:04,1
116557,0x4da03f669dd9609dc49ca6f3451ba22d3b792395,d083109e-4819-54b9-a01c-67bd5a770f65,2022-09-06 18:21:14,1
116558,0xd6e62a97a55537cd04847bb73e22208bd20106aa,d083109e-4819-54b9-a01c-67bd5a770f65,2022-09-06 18:14:10,1


In [7]:
from recommenders.evaluation.python_evaluation import metrics as metrics_dict

# Build every time-based fold, then keep only the most recent LAST_SPLITS of them.
# Each fold is a (train, test, split time, open proposals) tuple.
folds = list(time_freq_split_current(df, SPLITS_FREQ, dfp, return_open=True, remove_not_in_train_col='userID'))
print(len(folds), "folds")
folds = folds[-LAST_SPLITS:]
for i, (dftrain, dftest, t, open_proposals) in enumerate(folds):
    min_train = dftrain['timestamp'].min().date()
    max_train = dftrain['timestamp'].max().date()
    min_test  = dftest['timestamp'].min().date()
    max_test  = dftest['timestamp'].max().date()

    train_users = len(set(dftrain['userID']))
    test_users = len(set(dftest['userID']))

    print(f"Split {i}, train from: {min_train} to {max_train}, test from: {min_test} to {max_test}")
    print(f"  t: {t}")
    print(f"  open proposals: {len(open_proposals)}")
    print(f"  len(train): {len(dftrain)}, len(test): {len(dftest)}")
    print(f"  users(train): {train_users}, users(test): {test_users}")

    print()
    # Score the test set against itself to get the best value each metric can reach.
    # The constant 'prediction' column is added to a copy: the frames stored in
    # `folds` are reused later for training/evaluation and must not be mutated here.
    dftest_pred = dftest.assign(prediction=1)
    for m in METRICS:
        f = metrics_dict[f'{m}_at_k']
        print(f"  highest possible {m}@{TOP_K[0]}:\t{f(dftest_pred, dftest_pred, k=TOP_K[0], relevancy_method='top_k'):.4f}")

    print("-"*30)

112 folds
Split 0, train from: 2021-05-24 to 2023-05-10, test from: 2023-05-11 to 2023-05-22
  t: 2023-05-11 00:00:00
  open proposals: 18
  len(train): 106129, len(test): 354
  users(train): 6863, users(test): 139

  highest possible recall@5:	0.9631
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.4432
  highest possible map@5:	0.9631
------------------------------
Split 1, train from: 2021-05-24 to 2023-05-17, test from: 2023-05-18 to 2023-05-31
  t: 2023-05-18 00:00:00
  open proposals: 25
  len(train): 107387, len(test): 811
  users(train): 6886, users(test): 169

  highest possible recall@5:	0.8565
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.5953
  highest possible map@5:	0.8565
------------------------------
Split 2, train from: 2021-05-24 to 2023-05-24, test from: 2023-05-25 to 2023-06-07
  t: 2023-05-25 00:00:00
  open proposals: 19
  len(train): 108802, len(test): 332
  users(train): 6910, users(test): 122

  highest possible recall@

## Testing model

In [8]:
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

from recsys24_daos.models import LightGCNCustom

2024-04-24 15:11:11.129634: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-24 15:11:11.132858: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 15:11:11.209197: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 15:11:11.454719: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Small smoke-test configuration: two epochs on the last fold, evaluating after the second one
hparams = prepare_hparams(
    model_type='lightgcn',
    n_layers=3,
    batch_size=512,
    embed_size=64,
    epochs=2,
    learning_rate=0.001,
    decay=0.001,
    metrics=METRICS,  # same metric list as the tuned trainable
    eval_epoch=2,
    top_k=TOP_K[0],
    save_model=False,
    MODEL_DIR='./data/model/lightgcn/',
)
# folds[-1] is the most recent fold; items 0 and 1 of a fold are its train and test frames
dataloader = ImplicitCF(train=folds[-1][0], test=folds[-1][1], seed=SEED)
print("items:", dataloader.n_items, "user:", dataloader.n_users)
# Seed the model as well (as TrainLightGCN.setup does) so this smoke test is reproducible
model = LightGCNCustom(data=dataloader, hparams=hparams, seed=SEED)

  df = train if test is None else train.append(test)


items: 1941 user: 7234
Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2024-04-24 15:11:14.530750: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled


In [10]:
# Train with the smoke-test hparams, then compute the configured metrics
# (the list returned by run_eval() is displayed as the cell output)
model.fit()
model.run_eval()

Epoch 1 (train)5.0s: train loss = 0.59172 = (mf)0.59087 + (embed)0.00085
Epoch 2 (train)4.8s + (eval)0.1s: train loss = 0.34394 = (mf)0.33969 + (embed)0.00425, recall = 0.00000, ndcg = 0.00000, precision = 0.00000, map = 0.00000


[0.0, 0.0, 0.0, 0.0]

In [11]:
# Top-3 recommendations for the users of the test split
model.recommend_k_items(
    dataloader.test, 
    top_k=3, 
    use_id=True, 
    remove_seen=True, 
    recommend_from=folds[-1][3]  # 4th element of a fold: its open proposals
)

Unnamed: 0,userID,itemID,prediction
0,405,1789,2.781798
1,405,1920,2.547448
2,405,483,2.506250
3,266,1920,11.225468
4,266,926,11.072497
...,...,...,...
421,3973,1920,1.238768
422,3973,926,1.205764
423,3203,425,1.991336
424,3203,1920,1.783863


## Defining trainable

In [12]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

In [13]:
class TrainLightGCN(tune.Trainable):
    """Ray Tune trainable that trains and evaluates a LightGCN model on one fold.

    Each call to ``step`` trains ``EPOCHS_PER_ITER`` epochs and then reports the
    losses of the last epoch, the model's own metrics and precision/ndcg/recall/map
    at every cutoff in ``TOP_K``, together with timing information.
    """

    def setup(
        self,
        config: Dict[str, Any],
        data,
    ):
        """Build the data loader and the model for one trial.

        Args:
            config: sampled hyperparameters. Reads ``conv_layers``, ``batch_size``
                (used as an exponent: the real batch size is ``2**batch_size``),
                ``embedding_dim``, ``learning_rate`` and ``l2``.
            data: a fold, i.e. a ``(train, test, t, open_proposals)`` tuple.
        """
        self.config = config

        self.hparams = prepare_hparams(
            model_type='lightgcn',
            n_layers=config['conv_layers'],
            batch_size=2**config['batch_size'],
            embed_size=config['embedding_dim'],
            epochs=EPOCHS_PER_ITER,
            learning_rate=config['learning_rate'],
            decay=config['l2'],
            metrics=METRICS,
            eval_epoch=-1,
            top_k=TOP_K[0],
            save_model=False,
            MODEL_DIR='./data/model/lightgcn/',
        )

        train, test, self.t, self.open_proposals = data
        self.dataloader = ImplicitCF(train=train, test=test, seed=SEED)
        self.model = LightGCNCustom(self.hparams, self.dataloader, seed=SEED)
        # Accumulated wall-clock seconds spent training / evaluating across steps
        self.total_train = 0
        self.total_eval = 0

    @property
    def iteration(self):
        """Number of epochs the model has trained so far."""
        return self.model.epochs_done

    @property
    def training_iteration(self):
        """Same value as ``iteration``: the number of epochs trained so far."""
        return self.model.epochs_done

    def step(self):
        """
        Train ``EPOCHS_PER_ITER`` epochs, evaluate, and return the metrics dict.

        As a rule of thumb, the execution time of step should be large enough to avoid overheads 
        (i.e. more than a few seconds), but short enough to report progress periodically 
        (i.e. at most a few minutes).
        """
        # Guarantees the loop below runs at least once, so `ret` is always bound
        assert EPOCHS_PER_ITER > 0

        train_start = time.time()
        for _ in range(EPOCHS_PER_ITER):
            ret = self.model.fit_epoch()
        eval_start = train_end = time.time()

        # Metrics computed by the model itself, prefixed to tell them apart from the ones below
        eval_dict = {'model_'+k:v for k,v in zip(self.model.metrics, self.model.run_eval())}
        for k in TOP_K:
            # Recommend only among the proposals that are open in this fold
            recs = self.model.recommend_k_items(
                self.dataloader.test, 
                top_k=k,
                use_id=True, 
                remove_seen=True, 
                recommend_from=self.open_proposals,
            )

            eval_dict[f'precision@{k}'] = precision_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'ndcg@{k}'] = ndcg_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'recall@{k}'] = recall_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'map@{k}'] = map_at_k(self.dataloader.test, recs, k=k)

        eval_end = time.time()

        self.total_train += train_end - train_start
        self.total_eval += eval_end - eval_start

        return {
            'iteration': self.iteration,
            # Losses of the last epoch trained in this step
            'loss': ret[0],
            'mf_loss': ret[1],
            'emb_loss': ret[2],
            **eval_dict,
            'time_train': train_end-train_start,
            'time_test': eval_end-eval_start,
            'time_total_train': self.total_train,
            'time_total_test': self.total_eval,
        }

    def save_checkpoint(self, checkpoint_dir):
        """Save the model's session under the ``model`` prefix inside ``checkpoint_dir``."""
        checkpoint_path = os.path.join(checkpoint_dir, "model")
        self.model.saver.save(
            sess=self.model.sess,
            save_path=checkpoint_path,
        )
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        """Restore the model from a checkpoint written by ``save_checkpoint``.

        ``save_checkpoint`` returns the checkpoint directory but stores the model
        under the ``model`` prefix inside it, so a directory is resolved to that
        same prefix before loading. Any other value is passed through unchanged.
        """
        # NOTE(review): only the model weights are restored; total_train/total_eval
        # restart from the values set in setup().
        if os.path.isdir(checkpoint_path):
            checkpoint_path = os.path.join(checkpoint_path, "model")
        self.model.load(checkpoint_path)

## Big experiment

In [14]:
# Directory where experiments are stored and where previous ones are looked up
RAY_RESULTS_PATH

PosixPath('/home/davo/ray_results')

In [15]:
print(os.uname().nodename)

### SET TRAINING RESOURCES
# SMALL_EXPERIMENT (config cell) is True on every host except 'lamarck'; using the
# flag instead of repeating the hostname check keeps a single switch for the run size.
if not SMALL_EXPERIMENT:
    # assert torch.cuda.is_available()

    NUM_SAMPLES = SAMPLES_PER_SPLIT
    # Every run takes approx half a gig of vram (no optimizations), so the 24GB of
    # the RTX 4090 would fit more, but each trial requests 1/GPUS of the GPU and
    # therefore at most GPUS trials share it at the same time.
    resources_per_trial={
        'cpu': 1,
        'gpu': 1 / GPUS,
    }
else:
    # Single sample per fold on small machines
    NUM_SAMPLES = 1
    resources_per_trial={
        'cpu': 1,
        # It takes about 1.5 GiB with full training data, but I put a bit more because
        # this notebook also takes a bit of memory
        'memory': 2e9,
    }
print(resources_per_trial)

aerial
{'cpu': 1, 'memory': 2000000000.0}


In [16]:
from ray.tune.search.hyperopt import HyperOptSearch

In [17]:
def getTunerOnFold(f, points_to_evaluate = None):
    """Return the Ray Tune ``Tuner`` for fold ``f``, restoring a previous run when possible.

    The experiment name encodes the optimization metric, the organization, the
    split settings and the fold. If a directory matching that name exists under
    ``RAY_RESULTS_PATH`` and can be restored, the most recently created one is
    restored (errored trials are restarted); otherwise a new HyperOpt-driven
    tuner with ``NUM_SAMPLES`` samples is created.

    Args:
        f: index into ``folds`` of the fold to tune on.
        points_to_evaluate: optional list of configurations handed to
            ``HyperOptSearch``. Only used when a new tuner is created.

    Returns:
        The ``tune.Tuner``, not yet fitted.
    """
    ORG_NAME = "Decentraland"
    CUTOFF_DATE = dt.datetime.fromisoformat("2023-07-29")

    name = f'LightGCN_optim={OPTIM_METRIC},dao={ORG_NAME},freq={SPLITS_FREQ},normalize={SPLITS_NORMALIZE},cutoff_date={CUTOFF_DATE.isoformat()},fold={f}'
    paths = list(RAY_RESULTS_PATH.glob(f'{name}_*'))
    # Most recently created experiment directory for this fold, if any
    last_experiment = max(paths, key=lambda x: x.stat().st_ctime) if paths else None

    param_space = dict(
        fold=f,
        # Exponent of the batch size (the trainable uses 2**batch_size).
        # NOTE(review): tune.randint excludes its upper bound, so with
        # MAX_BATCH_SIZE = 10 the exponent ranges over 6..9 (batch sizes 64 - 512),
        # not up to 1024. Left unchanged so existing experiments stay comparable;
        # use MAX_BATCH_SIZE + 1 if 1024 was intended.
        batch_size=tune.randint(6, MAX_BATCH_SIZE),
        embedding_dim=tune.lograndint(1, MAX_EMBEDDING_DIM, base=2),
        conv_layers=tune.randint(1,6),
        learning_rate=tune.qloguniform(MIN_LR, 1, 1e-4),
        # tune.loguniform's third parameter is the logarithm base, not a
        # quantization step, so the stray 1e-7 that used to be passed was dropped.
        l2=tune.loguniform(1e-7, 1e-2),
    )

    # The trainable receives the whole fold tuple and the per-trial resources
    trainable = tune.with_resources(
        tune.with_parameters(TrainLightGCN, data=folds[f]),
        resources_per_trial,
    )

    ### RESTORE EXPERIMENT OR CREATE A NEW ONE
    if last_experiment and tune.Tuner.can_restore(last_experiment):
        print(f"Restoring last experiment: {last_experiment}")
        tuner = tune.Tuner.restore(
            str(last_experiment),
            trainable=trainable,
            restart_errored=True,
            param_space=param_space,
        )
    else:
        print(f"No experiment found for fold {f}, creating new tuner with {NUM_SAMPLES} samples")
        search_alg = HyperOptSearch(
            points_to_evaluate = points_to_evaluate,
            random_state_seed=SEED,
        )

        tuner = tune.Tuner(
            trainable,
            run_config=train.RunConfig(
                # A trial stops when either criterion is reached
                stop={'training_iteration': MAX_EPOCHS/EPOCHS_PER_ITER, 'time_total_train': 300},
                # The timestamp suffix makes every new experiment directory unique
                name=name + f'_{dt.datetime.now().isoformat()}',
                storage_path=str(RAY_RESULTS_PATH),
                failure_config=train.FailureConfig(max_failures=3),
            ),
            param_space=param_space,
            tune_config=tune.TuneConfig(
                search_alg=search_alg,
                num_samples=NUM_SAMPLES,
                metric=OPTIM_METRIC,
                mode='max',
            )
        )

    return tuner

In [18]:
import logging

def findConfig(rg, reference_config=None):
    """Return the first result in ``rg`` whose config matches a reference config.

    Two configs match when every key of the reference other than ``'fold'`` is
    present in the result's config with an equal value. Results with an empty
    or missing config are skipped.

    Args:
        rg: iterable of results, each exposing a ``config`` mapping (or ``None``).
        reference_config: mapping to look for. When omitted, the config of the
            global ``last_best_result`` is used, as callers passing only ``rg`` expect.

    Returns:
        The first matching result, or ``None`` when there is none.
    """
    for r in rg:
        if not r.config:
            continue
        # Resolved lazily so that an empty `rg` never touches the global fallback
        wanted = last_best_result.config if reference_config is None else reference_config
        # A key missing from the candidate config counts as a mismatch
        if all(k in r.config and r.config[k] == v for k, v in wanted.items() if k != 'fold'):
            return r

    return None

tuners = []
results = []
last_best_result = None
# Iterate over the folds that actually exist: `folds` holds at most LAST_SPLITS
# entries, and fewer when the dataset yields fewer splits.
for f in range(len(folds)):
    # Warm-start the search with the best configuration of the previous fold
    best_prev_config = None
    if last_best_result is not None:
        best_prev_config = last_best_result.config.copy()
        best_prev_config['fold'] += 1
        best_prev_config = [best_prev_config]

    t = getTunerOnFold(f, best_prev_config)
    tuners.append(t)

    rg = t.fit()
    assert rg.num_errors == 0, f"There are {rg.num_errors} errors"
    assert rg.num_terminated >= NUM_SAMPLES, f'Some samples are not terminated ({rg.num_terminated} < {NUM_SAMPLES})'
    results.append(rg)

    # Assert that the prev config has been tried
    if last_best_result is not None:
        prev_best_trial = findConfig(rg)
        if prev_best_trial is None:
            print("Best config:", last_best_result.config)
            raise AssertionError(f"The best config from previous fold has not been tested in fold {f}")
        logging.info(f'Fold {f}. Best prev result was {last_best_result.path} and config has been found {prev_best_trial.path}')

    last_best_result = rg.get_best_result()

    print(f"Finished training for fold {f}")

0,1
Current time:,2024-04-24 15:11:37
Running for:,00:00:00.27
Memory:,13.4/15.3 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,fold,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_f6e980e2,TERMINATED,147.96.81.131:3529215,7,3,151,9,0.000483403,0.0001,3,307.004,15,0.185933,0.180231
TrainLightGCN_1172fc5a,TERMINATED,147.96.81.131:3526930,6,4,851,9,0.00430162,0.0193,1,594.903,5,0.231485,0.0583442
TrainLightGCN_df63bfd6,TERMINATED,147.96.81.131:3527897,7,5,505,9,0.000667768,0.0127,2,473.29,10,0.0688388,0.0222416
TrainLightGCN_3852f46f,TERMINATED,147.96.81.131:3528600,7,5,950,9,0.00235779,0.0004,1,376.208,5,0.153857,0.11597
TrainLightGCN_39126830,TERMINATED,147.96.81.131:3528776,7,2,480,9,2.26589e-05,0.0002,2,343.453,10,0.0667196,0.0662592
TrainLightGCN_2b3ae0d5,TERMINATED,147.96.81.131:3528445,7,5,731,9,0.000299631,0.0054,1,347.246,5,0.0428984,0.0219095
TrainLightGCN_2c7772f1,TERMINATED,147.96.81.131:3525090,6,4,116,9,0.00126259,0.0265,2,661.504,10,0.0959942,0.0288229
TrainLightGCN_08540956,TERMINATED,147.96.81.131:3528192,9,5,380,9,0.000160068,0.0001,5,352.063,25,0.193556,0.191074
TrainLightGCN_78c23fd2,TERMINATED,147.96.81.131:3525985,6,3,644,9,0.00727965,0.3812,1,530.294,5,59.9729,3.57242
TrainLightGCN_259c47e7,TERMINATED,147.96.81.131:3527262,7,3,24,9,3.99676e-05,0.0032,2,387.814,10,0.043164,0.0419086


2024-04-24 15:11:36,768	INFO experiment_state.py:404 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.
2024-04-24 15:11:36,771	INFO tune_controller.py:404 -- Using the newest experiment state file found within the experiment directory: experiment_state-2024-03-21_20-59-20.json
2024-04-24 15:11:37,245	INFO tune.py:1042 -- Total run time: 0.49 seconds (0.00 seconds for the tuning loop).


Finished training for fold 9


[33m(raylet)[0m [2024-04-24 15:20:26,862 E 170365 170365] (raylet) node_manager.cc:3024: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 0bbb06aad22aab3e3e47c1df6669f9e2730ceacf0a8b4fa6ccdb91e3, IP: 147.96.25.138) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 147.96.25.138`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
