In [1]:
from typing import Dict, List, Tuple, Union, Any, Optional

import os
import sys
print("Python version", sys.version)

# Ignore pandas warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path

import pandas as pd
import numpy as np
import tensorflow as tf

import ray
from ray import train, tune

from src.datasets import daocensus

%load_ext autoreload
%autoreload 2

# Only let TensorFlow log records at WARNING level or above through.
tf.get_logger().setLevel('WARNING')

# NOTE(review): the list returned here is discarded (it is neither printed nor
# the last expression of the cell) — confirm whether it was meant to be shown.
tf.config.list_physical_devices("GPU")
# Print the CUDA / cuDNN versions recorded in TensorFlow's build info;
# -1 is printed for a key that is missing from that mapping.
sys_details = tf.sysconfig.get_build_info()
cuda = sys_details.get("cuda_version", -1)
cudnn = sys_details.get("cudnn_version", -1)
print(cuda, cudnn)

Python version 3.9.18 (main, Oct 24 2023, 09:18:18) 
[GCC 11.4.0]


2023-10-25 15:39:13.841834: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-25 15:39:13.841855: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-25 15:39:13.841866: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


11.8 8


In [2]:
# Other config
# Seed handed to the data loader and to the model.
SEED: int = 57
# Directory searched for previous experiments and used as Ray Tune storage.
RAY_RESULTS_PATH: Path = Path('~/ray_results').expanduser()

# Dataset splits config
# Number of time-interval splits requested from timeIntervalSplit.
N_SPLITS: int = 5
# Number of leading splits skipped, so N_SPLITS - SKIP_SPLIT folds are produced.
SKIP_SPLIT: int = 1

# Training config
# Used together with EPOCHS_PER_ITER to derive the Tune stop criterion.
MAX_EPOCHS: int = 50
# Epochs trained inside every Tune step() call.
EPOCHS_PER_ITER: int = 5

# Eval config
# Cut-off k used for every ranking metric.
TOP_K: int = 5
# Metric names; each one is looked up as f'{name}_at_k' when printing the
# per-fold upper bounds.
METRICS: List[str] = ["recall", "ndcg", "precision", "map"]

# Load data

In [3]:
# Load the two frames returned by the project loader for Decentraland
# (dfv holds the votes used below; dfp is presumably the proposals — confirm).
dfv, dfp = daocensus.get("./data/daos-census", 'Decentraland', 'snapshot')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each summary.
dfv.info()
dfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116533 non-null  category      
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  category      
 7   date           116560 non-null  datetime64[ns]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(1), object(6)
memory usage: 7.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1915 entries, 0 to 1914
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------  

## Transform data

In [4]:
def to_microsoft(dfv):
    """Convert a votes frame to the userID / itemID / timestamp / rating layout.

    Only the 'voter', 'proposal' and 'date' columns of ``dfv`` are kept and
    renamed; ``dfv`` itself is left untouched.

    Args:
        dfv: DataFrame with at least the columns 'voter', 'proposal', 'date'.

    Returns:
        A new DataFrame with columns userID, itemID (cast to str), timestamp
        and a constant ``rating`` column equal to 1 (every vote is treated as
        one implicit positive interaction).
    """
    # NOTE(review): rows with a missing proposal are not filtered out here;
    # verify how the str cast represents them downstream.
    column_map = {'voter': 'userID', 'proposal': 'itemID', 'date': 'timestamp'}
    interactions = dfv[list(column_map)].rename(columns=column_map)
    return interactions.assign(
        itemID=interactions['itemID'].astype('str'),
        rating=1,
    )

# Build the interactions frame used by the following cells and preview it.
df = to_microsoft(dfv)
df.head()

Unnamed: 0,userID,itemID,timestamp,rating
0,0xe7af1c70f8f089c4c3bd71999692c6c5a15d9e2a,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 12:28:01,1
1,0xc54a6c3778016b06cbd126ccc3b5bc06c5f666fb,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 02:16:23,1
2,0xd82d005e8f8d5385db40ba23884a5c967bb1e8af,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 00:38:22,1
3,0xf4c64db66ffb301985f5ecd85c8f3f9c02f2659d,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:47:08,1
4,0xd5e9ef1cedad0d135d543d286a2c190b16cbb89e,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:32:15,1


# Split data

Each proposal remains open for only a few days, so our environment is different from that of a movie recommender system. For this reason, we will use a TimeSeriesSplit instead of a K-Fold to cross-validate the model.

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png)

scikit-learn's TimeSeriesSplit does not work for us because every split contains the same number of elements, but the length of the time interval each split covers is not the same. Since we want to simulate realistic behaviour, we will split the data into intervals of equal length instead.

In [5]:
from recommenders.evaluation.python_evaluation import metrics as metrics_dict

In [6]:
def timeIntervalSplit(df: pd.DataFrame, splits: int, timestamp_col: str = 'timestamp', skip: int = 0, remove_not_in_train: Optional[str] = None):
    """Yield expanding-window (train, test) folds over equal-length time intervals.

    The span between the oldest and newest timestamp is cut into
    ``splits + 1`` intervals of equal duration. The first ``1 + skip``
    intervals form the first training set; every fold then tests on the next
    interval and absorbs it into the training set of the following fold.

    Args:
        df: Frame holding the interactions.
        splits: Number of intervals that can act as a test set.
        timestamp_col: Name of the column with the timestamps.
        skip: Number of leading folds to leave out (their data still ends up
            in the first training set).
        remove_not_in_train: Optional column name; when given, test rows whose
            value in that column does not appear in the fold's training set
            are dropped.

    Yields:
        ``splits - skip`` tuples ``(train, test)``. ``train`` holds every row
        with a timestamp up to and including the fold boundary; ``test`` holds
        the rows after that boundary up to and including the next one.
    """
    timestamps = df[timestamp_col]
    first, last = timestamps.min(), timestamps.max()
    k_time_diff = (last - first) / (splits + 1)

    n_folds = splits - skip
    acc_time = first + (1 + skip) * k_time_diff
    for i in range(n_folds):
        # The last boundary is pinned to the newest timestamp so rounding in
        # the Timedelta division can never push the final records out.
        end_time = last if i == n_folds - 1 else acc_time + k_time_diff

        train_df = df[timestamps <= acc_time]
        # The upper bound is inclusive: a record sitting exactly on a boundary
        # used to fall in no test set at all (and the newest record of the
        # dataset was always dropped from the last fold).
        test_df = df[(acc_time < timestamps) & (timestamps <= end_time)]

        if remove_not_in_train is not None:
            known_values = set(train_df[remove_not_in_train])
            test_df = test_df[test_df[remove_not_in_train].isin(known_values)]

        acc_time = end_time
        yield train_df, test_df

# Build the folds once and print, for each of them, its size and the best
# score a recommender could reach on its test set.
# `max_train_prev` tracks where the previous training set ended, so the
# "train from" date printed below is the start of the newly added window
# (the training sets themselves are cumulative).
max_train_prev = df['timestamp'].min().date()
folds = list(timeIntervalSplit(df, N_SPLITS, skip=SKIP_SPLIT, remove_not_in_train='userID'))
for i, (dftrain, dftest) in enumerate(folds):
    max_train = dftrain['timestamp'].max().date()
    min_test  = dftest['timestamp'].min().date()
    max_test  = dftest['timestamp'].max().date()

    train_users = len(set(dftrain['userID']))
    test_users = len(set(dftest['userID']))
    
    print(f"Split {i}, train from: {max_train_prev} to {max_train}, test from: {min_test} to {max_test}")
    print(f"  len(train): {len(dftrain)}, len(test): {len(dftest)}")
    print(f"  users(train): {train_users}, users(test): {test_users}")

    print()
    # Score the test set against itself to get the upper bound of each metric.
    # A copy is used so the frames stored in `folds` (which are later handed
    # to the model) are not modified and keep their original columns.
    perfect = dftest.assign(prediction=1)
    for m in METRICS:
        f = metrics_dict[f'{m}_at_k']
        print(f"  highest possible {m}@{TOP_K}:\t{f(perfect, perfect, k=TOP_K, relevancy_method='top_k'):.4f}")

    print("-"*30)

    max_train_prev = max_train

Split 0, train from: 2021-05-24 to 2022-02-10, test from: 2022-02-11 to 2022-06-22
  len(train): 21485, len(test): 7268
  users(train): 3040, users(test): 807

  highest possible recall@5:	0.7991
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.6072
  highest possible map@5:	0.7991
------------------------------
Split 1, train from: 2022-02-10 to 2022-06-22, test from: 2022-06-22 to 2022-10-31
  len(train): 32472, len(test): 8780
  users(train): 4031, users(test): 850

  highest possible recall@5:	0.8052
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.6169
  highest possible map@5:	0.8052
------------------------------
Split 2, train from: 2022-06-22 to 2022-10-31, test from: 2022-10-31 to 2023-03-11
  len(train): 49437, len(test): 15135
  users(train): 4870, users(test): 1089

  highest possible recall@5:	0.7839
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.6264
  highest possible map@5:	0.7839
------------------------------

# Defining training

In [7]:
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN

class LightGCNCustom(LightGCN):
    """LightGCN variant that can be trained one epoch at a time.

    ``fit_epoch`` is adapted from ``LightGCN.fit`` but runs a single epoch,
    returns the losses instead of printing them, and performs no evaluation
    or model saving.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Number of epochs completed through fit_epoch() on this instance.
        self.epochs_done = 0
    
    def fit_epoch(self):
        """Train for one epoch on batches drawn from ``self.data``.

        Returns:
            Tuple ``(loss, mf_loss, emb_loss)``, each averaged over the
            batches of the epoch.

        Raises:
            FloatingPointError: If the averaged loss is NaN. An exception is
                raised (instead of calling ``sys.exit()``) so the failure
                carries a message and traceback to whoever drives the
                training.
        """
        loss, mf_loss, emb_loss = 0.0, 0.0, 0.0
        n_batch = self.data.train.shape[0] // self.batch_size + 1
        for _ in range(n_batch):
            users, pos_items, neg_items = self.data.train_loader(self.batch_size)
            _, batch_loss, batch_mf_loss, batch_emb_loss = self.sess.run(
                [self.opt, self.loss, self.mf_loss, self.emb_loss],
                feed_dict={
                    self.users: users,
                    self.pos_items: pos_items,
                    self.neg_items: neg_items,
                },
            )
            loss += batch_loss / n_batch
            mf_loss += batch_mf_loss / n_batch
            emb_loss += batch_emb_loss / n_batch

        if np.isnan(loss):
            raise FloatingPointError(
                f"loss is nan (after {self.epochs_done} completed epochs)."
            )

        self.epochs_done += 1

        return loss, mf_loss, emb_loss

In [8]:
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF

class TrainLightGCN(tune.Trainable):
    """Ray Tune trainable that trains a LightGCNCustom model on one fold."""

    # File name prefix handed to the TensorFlow saver inside a checkpoint dir.
    _MODEL_PREFIX = "model"
    # Small text file storing the epoch counter next to the model files.
    _EPOCHS_FILE = "epochs_done.txt"

    def setup(
        self,
        config: Dict[str, Any],
        folds: List[Tuple[pd.DataFrame, pd.DataFrame]],
    ):
        """Build the data loader and the model for this trial.

        Args:
            config: Trial hyperparameters. Keys read: 'batch_size' (an
                exponent; it is replaced in place by ``2**batch_size``),
                'conv_layers', 'embedding_dim', 'learning_rate', 'l2' and
                '__trial_index__' (presumably injected by the Repeater search
                wrapper — confirm), which selects the fold.
            folds: List of (train, test) frames; one of them is used.
        """
        self.config = config
        config['batch_size'] = 2**config['batch_size']
        self.fold = config['__trial_index__']

        self.hparams = prepare_hparams(
            model_type='lightgcn',
            n_layers=config['conv_layers'],
            batch_size=config['batch_size'],
            embed_size=config['embedding_dim'],
            epochs=EPOCHS_PER_ITER,
            learning_rate=config['learning_rate'],
            decay=config['l2'],
            metrics=METRICS,
            eval_epoch=-1,
            top_k=TOP_K,
            save_model=False,
            MODEL_DIR='./data/model/lightgcn/',
        )

        train_df, test_df = folds[self.fold]
        self.dataloader = ImplicitCF(train=train_df, test=test_df, seed=SEED)
        self.model = LightGCNCustom(self.hparams, self.dataloader, seed=SEED)

    # NOTE(review): both properties shadow attributes of tune.Trainable and
    # report epochs rather than step() calls — confirm this is intended.
    @property
    def iteration(self):
        return self.model.epochs_done

    @property
    def training_iteration(self):
        return self.model.epochs_done

    def step(self):
        """
        Train EPOCHS_PER_ITER epochs, then evaluate once.

        As a rule of thumb, the execution time of step should be large enough to avoid overheads 
        (i.e. more than a few seconds), but short enough to report progress periodically 
        (i.e. at most a few minutes).

        Returns a dict with the epoch counter, the three losses of the last
        epoch, and one entry per metric of the model.
        """
        assert EPOCHS_PER_ITER > 0
        
        for _ in range(EPOCHS_PER_ITER):
            loss, mf_loss, emb_loss = self.model.fit_epoch()

        return {
            'iteration': self.iteration,
            'loss': loss,
            'mf_loss': mf_loss,
            'emb_loss': emb_loss,
            **dict(zip(self.model.metrics, self.model.run_eval())),
        }

    def save_checkpoint(self, checkpoint_dir):
        """Save the model variables and the epoch counter in checkpoint_dir."""
        checkpoint_path = os.path.join(checkpoint_dir, self._MODEL_PREFIX)
        self.model.saver.save(
            sess=self.model.sess,
            save_path=checkpoint_path,
        )
        # The counter lives outside the TensorFlow session, so it is stored
        # separately to keep the reported 'iteration' correct after a restore.
        with open(os.path.join(checkpoint_dir, self._EPOCHS_FILE), "w") as fh:
            fh.write(str(self.model.epochs_done))
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        """Restore what save_checkpoint wrote.

        ``checkpoint_path`` is normally the checkpoint directory; the model
        was saved under the ``model`` prefix inside it, so the same prefix is
        applied here. A path that is not a directory is passed on unchanged.
        """
        if os.path.isdir(checkpoint_path):
            checkpoint_dir = checkpoint_path
            model_path = os.path.join(checkpoint_path, self._MODEL_PREFIX)
        else:
            checkpoint_dir = os.path.dirname(checkpoint_path)
            model_path = checkpoint_path
        self.model.load(model_path)

        # Checkpoints written before the counter was stored lack this file.
        epochs_file = os.path.join(checkpoint_dir, self._EPOCHS_FILE)
        if os.path.exists(epochs_file):
            with open(epochs_file) as fh:
                self.model.epochs_done = int(fh.read())

### Small test

In [9]:
# hparams = prepare_hparams(
#     model_type='lightgcn',
#     n_layers=3,
#     batch_size=512,
#     embed_size=64,
#     epochs=3,
#     learning_rate=0.001,
#     decay=0.001,
#     metrics=["recall", "ndcg", "precision", "map"],
#     eval_epoch=2,
#     top_k=TOP_K,
#     save_model=False,
#     MODEL_DIR='./data/model/lightgcn/',
# )
# dataloader = ImplicitCF(train=folds[-1][0], test=folds[-1][1], seed=SEED)
# print("items:", dataloader.n_items, "user:", dataloader.n_users)
# model = LightGCNCustom(
#     hparams,
#     dataloader,
#     seed=SEED,
# )

In [10]:
# model.fit()

### Big experiment

In [11]:
# Show the host name; it decides which resource profile is used further down.
os.uname().nodename

'lamarck'

In [12]:
# Show the directory that is searched for previous experiments.
RAY_RESULTS_PATH

PosixPath('/home/daviddavo/ray_results')

In [13]:
import ray
from ray.tune.search.hyperopt import HyperOptSearch

# Experiment directory with the newest st_ctime, or None when there is none.
# Without `default`, max() raised ValueError on a machine with no previous
# runs, so the "creating new tuner" branch below could never be reached.
last_experiment = max(
    RAY_RESULTS_PATH.glob('TrainLightGCN_*'),
    key=lambda x: x.stat().st_ctime,
    default=None,
)

### SET TRAINING RESOURCES
if os.uname().nodename == 'lamarck':
    NUM_SAMPLES = 500
    resources_per_trial={
        'cpu': 1,
        # Author's notes: the GPU has about 24 GiB and a run might take up to
        # 2 GiB, i.e. up to 1/12th of the GPU. 1/8th is requested so the
        # machine is not fully taken by this experiment.
        'gpu': 1/8,
    }
else:
    NUM_SAMPLES = 1
    resources_per_trial={
        'cpu': 1,
        # It takes about 1.5 GiB with full training data, but I put a bit more because
        # this notebook also takes a bit of memory
        'memory': 2e9,
    }

# Both branches wrap the trainable the same way.
trainable = tune.with_resources(
    tune.with_parameters(TrainLightGCN, folds=folds),
    resources_per_trial,
)

### RESTORE EXPERIMENT OR CREATE A NEW ONE
if last_experiment is not None and tune.Tuner.can_restore(last_experiment):
    print(f"Restoring last experiment: {last_experiment}")
    # NOTE(review): the captured run logs "Trial ... not in group; cannot
    # report score" from repeater.py after restoring — check whether the
    # Repeater state survives a restore before relying on resumed results.
    tuner = tune.Tuner.restore(
        str(last_experiment),
        trainable=trainable,
        restart_errored=True
    )
else:
    print("No experiment found, creating new tuner")
    # Every suggested configuration is repeated once per fold.
    search_alg = HyperOptSearch()
    search_alg = tune.search.Repeater(search_alg, N_SPLITS-SKIP_SPLIT)
    
    tuner = tune.Tuner(
        trainable,
        run_config=train.RunConfig(
            stop={'training_iteration': MAX_EPOCHS/EPOCHS_PER_ITER},
            # Passed as str, like the path given to Tuner.restore above.
            storage_path=str(RAY_RESULTS_PATH),
        ),
        param_space=dict(
            # Exponent of the batch size: the trainable uses 2**batch_size.
            # randint's upper bound is exclusive, so exponents are 4..9,
            # i.e. batch sizes from 2**4 (16) to 2**9 (512).
            batch_size=tune.randint(4,10),
            embedding_dim=tune.qlograndint(10, 500, 5),
            # Upper bound exclusive: 2 to 5 layers.
            conv_layers=tune.randint(2,6),
            learning_rate=tune.qloguniform(1e-5, 1, 1e-5),
            l2=tune.loguniform(1e-9, 1e-1),
        ),
        tune_config=tune.TuneConfig(
            search_alg=search_alg,
            num_samples=(N_SPLITS-SKIP_SPLIT)*NUM_SAMPLES,
            # NOTE(review): the search minimises the training loss, not one of
            # the ranking metrics reported by step() — confirm this is intended.
            metric='loss',
            mode='min',
        )
    )

Restoring last experiment: /home/daviddavo/ray_results/TrainLightGCN_2023-10-25_09-51-27


- TrainLightGCN_ac3e7bfa: FileNotFoundError('Could not fetch metrics for TrainLightGCN_ac3e7bfa: both result.json and progress.csv were not found at /home/daviddavo/ray_results/TrainLightGCN_2023-10-25_09-51-27/TrainLightGCN_ac3e7bfa_282_trial_index=1,batch_size=4,conv_layers=4,embedding_dim=420,l2=0.0000,learning_rate=0.0126_2023-10-25_14-17-19')
- TrainLightGCN_9e075b34: FileNotFoundError('Could not fetch metrics for TrainLightGCN_9e075b34: both result.json and progress.csv were not found at /home/daviddavo/ray_results/TrainLightGCN_2023-10-25_09-51-27/TrainLightGCN_9e075b34_286_trial_index=1,batch_size=4,conv_layers=4,embedding_dim=365,l2=0.0002,learning_rate=0.0141_2023-10-25_15-37-05')
- TrainLightGCN_dcc25f00: FileNotFoundError('Could not fetch metrics for TrainLightGCN_dcc25f00: both result.json and progress.csv were not found at /home/daviddavo/ray_results/TrainLightGCN_2023-10-25_09-51-27/TrainLightGCN_dcc25f00_284_trial_index=3,batch_size=4,conv_layers=4,embedding_dim=420,l2=

In [14]:
# Run (or resume) the hyperparameter search.
# NOTE(review): the value returned by fit() is only displayed, not stored in a
# variable — confirm whether later cells need it.
tuner.fit()

0,1
Current time:,2023-10-25 15:57:24
Running for:,00:17:58.71
Memory:,22.1/125.6 GiB

Trial name,status,loc,__trial_index__,batch_size,conv_layers,embedding_dim,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_05ac43cc,RUNNING,147.96.81.131:1559803,1,4,4,360,0.000121534,0.00067,6.0,973.666,30.0,0.0163164,0.0108057
TrainLightGCN_09381197,RUNNING,147.96.81.131:1559921,1,4,5,335,3.05743e-06,0.00039,6.0,980.072,30.0,0.0230535,0.022879
TrainLightGCN_0ebe1c4f,RUNNING,147.96.81.131:1559836,2,4,4,360,0.000121534,0.00067,4.0,993.328,20.0,0.022126,0.0163082
TrainLightGCN_9328eef1,RUNNING,147.96.81.131:1559892,2,4,5,335,3.05743e-06,0.00039,4.0,994.409,20.0,0.033806,0.0336383
TrainLightGCN_a697bd36,RUNNING,147.96.81.131:1559979,3,4,5,300,0.000317037,0.01357,2.0,901.619,10.0,0.0564719,0.0204409
TrainLightGCN_bdf87cf1,RUNNING,147.96.81.131:1559876,3,4,4,370,7.97928e-06,0.00146,2.0,902.229,10.0,0.0124025,0.01165
TrainLightGCN_c283f7a4,RUNNING,147.96.81.131:1559804,0,4,5,335,3.05743e-06,0.00039,9.0,969.793,45.0,0.0158851,0.0157136
TrainLightGCN_00e77ec3,PENDING,,1,4,4,355,8.25231e-06,0.00142,,,,,
TrainLightGCN_04823b31,PENDING,,2,4,4,365,0.000170348,0.01408,,,,,
TrainLightGCN_0b135533,PENDING,,3,4,4,420,3.6367e-05,0.01264,,,,,


2023-10-25 15:39:25,740	INFO experiment_state.py:529 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.
2023-10-25 15:39:25,745	INFO tune_controller.py:520 -- Using the newest experiment state file found within the experiment directory: experiment_state-2023-10-25_15-36-54.json
[2m[36m(pid=1559803)[0m 2023-10-25 15:39:27.346201: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=1559803)[0m 2023-10-25 15:39:27.346225: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=1559803)[0m 2023-10-25 15:39:27.346237: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for pl

[2m[36m(TrainLightGCN pid=1559803)[0m Already create adjacency matrix.
[2m[36m(TrainLightGCN pid=1559803)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=1559803)[0m Using xavier initialization.


2023-10-25 15:57:26,571	ERROR repeater.py:161 -- Trial c283f7a4 not in group; cannot report score. Seen trials: ['2228f12c', 'd4f96bf0', '4acfeaa2', '2bd41311', '13e7d352', '2cc16d1f', 'd4628b1d', 'ff53063e', '4576e098', 'cb5f160e', '13682ac6', '356e8b1a', 'd2a864a0', 'db8399ad', '564df887', 'c3e05066', '33319571', 'af41f8be', '554d2bc0', 'a81850d3', '205f97fc', 'da376357', '541d6910', 'bb603659', '2660342c', '89b372eb', 'f9f41aff', '5ad36c3a', 'b0236a18', '2f7c81cc', '9e80d932', 'd7ed5e3b', '47e096ae', '1c13902d', '29496b49', 'a11102ea', 'f86fc62b', '21e7169d', '7503d0be', '042c1cfa', '5d9ae200', '98e646c8', '3b9c7935', '56138d88', 'e04ecf73', '7ef68c38', '8a10193c', '07df7241', '92fb3b54', 'bf21b133', 'd7989cc6', '15335e30', 'c42b1110', '3e29ff4e', '0b6a53f7', '167c2664', 'c7f8a3aa', 'd981c9c8', 'f50f7ecc', '7e0c790c', '82bd9f42', 'f7a168bb', 'b8067d68', 'eabd9c03', '22adac29', '8e6ef7e2', '49ed49b8', '15f97e22', '56b2bf49', 'bfd35a6b', 'b9cc2330', '54acd6a6', '661f4780', '429b2e97',

TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("None", trainable=...)`.