# RecpackFusion

In [0]:
!pip install wandb einops &> /dev/null
!rm -r recpackfusion
!git clone https://github.com/gabriben/recfusion.git
!cd recfusion; pip install . &> /dev/null

In [0]:
from recpack.quick import quick_train


# Experiment configuration handed to `quick_train`.

# Which algorithm to train and on which dataset.
model = 'RecFusionMLP'
dataset = 'MovieLens1M'

# Dataset location, filtering thresholds and split settings.
prep_hypers = dict(
    ds_path='/dbfs/datasets/RecSys/',
    min_rating=4,
    min_items_per_user=5,
    min_users_per_item=5,
    generalization='StrongGeneralization',  # alternative: WeakGeneralization
    train_val_test=[0.8, 0.1, 0.1],
    force_even_items=False,
)

# Base training settings; the hyperparameter search overrides some of them.
train_hypers = dict(
    stop_early=True,
    max_iter_no_change=10,
    min_improvement=0.001,
    validation_sample_size=1000,
    batch_size=200,
)
# Optional overrides that are currently disabled (add them to the dict above
# to re-enable):
#   "max_epochs": 1
#   "dim_bottleneck_layer": 128
#   "T": 2            (also tried: "T": 100)
#   "p_dnns_depth": 5 (also tried: "p_dnns_depth": 2)
#   "M": 200
#   "anneal_steps": 20
#   "time_embedding_as_input": True
#   "schedule_type": 'fixed'
#   "jascha_bin_process": True
#   "b_start": 0.1

# Metric (name -> cutoff) used for validation.
val_metric = dict(NDCGK=100)

# Metrics (name -> list of cutoffs) reported on the test split.
test_metrics = dict(
    NDCGK=[10, 20, 50, 100],
    RecallK=[10, 20, 50],
    HitK=[20, 50, 100],
    CalibratedRecallK=[10, 20, 50],
)

In [0]:
from hyperopt import fmin, hp, tpe, atpe
from hyperopt import SparkTrials, STATUS_OK

In [0]:
import os

# Objective function evaluated by Hyperopt for every candidate configuration.
def minimize_hyperopt(params):
  """
  Train one model with the sampled hyperparameters and report its loss.

  The sampled values are layered on top of the notebook-level `train_hypers`
  for this call only; the shared dictionary itself is left untouched so that
  one trial can never leak settings into the next.

  :param params: map specifying the hyperparameter settings to test
  :return: dict with the keys 'loss' (second value returned by `quick_train`)
           and 'status' (always `STATUS_OK`)
  """

  # Provide empty Weights & Biases settings only when none are configured, so
  # that credentials already present in the environment are not wiped out.
  os.environ.setdefault("WANDB_API_KEY", "")
  os.environ.setdefault("WANDB_PROJECT", "")

  # Per-trial copy: base settings first, sampled values take precedence.
  trial_hypers = {**train_hypers, **params}
  _, v = quick_train(model, dataset, prep_hypers, trial_hypers, val_metric, test_metrics)

  # NOTE(review): Hyperopt minimises 'loss'. If `v` is the validation metric
  # (NDCG, higher is better) it presumably has to be negated here -- confirm
  # against what `quick_train` actually returns before changing it.
  return {'loss': v, 'status': STATUS_OK}


In [0]:
# Distributed tuning with Hyperopt on Spark:
# http://hyperopt.github.io/hyperopt/scaleout/spark/

# Candidate values for each tuned hyperparameter.
hypers = dict(
  T=list(range(1, 1000)),
  p_dnns_depth=list(range(1, 10)),
  batch_size=list(range(100, 1050, 50)),
  # time_embedding_as_input=[False, True],
)

# One categorical choice per hyperparameter, labelled with its own name
# (see http://hyperopt.github.io/hyperopt/getting-started/search_spaces/).
search_space = {
  name: hp.choice(name, candidates)
  for name, candidates in hypers.items()
}

# Search algorithm used by Hyperopt.
algo = tpe.suggest

# A `SparkTrials` instance lets `fmin` spread the evaluations over the
# Spark cluster.
spark_trials = SparkTrials()  # parallelism=4
best_hyperparameters = fmin(
  fn=minimize_hyperopt,
  space=search_space,
  algo=algo,
  trials=spark_trials,
  max_evals=100,
)

# For `hp.choice`, `fmin` reports the position of the winning candidate rather
# than the candidate itself, so translate every index back into its value.
# (Library alternative: `hyperopt.space_eval(search_space, best_hyperparameters)`.)
best_hyperparameters = {
  name: hypers[name][index]
  for name, index in best_hyperparameters.items()
}

best_hyperparameters

# train with optimal hypers

In [0]:
# Merge the tuned values into the base settings and retrain with early
# stopping switched on.
train_hypers.update(best_hyperparameters, stop_early=True)
m, v = quick_train(model, dataset, prep_hypers, train_hypers, val_metric, test_metrics)
# Display the first value returned by `quick_train`.
m

In [0]:
# Same tuned settings, this time with early stopping switched off.
train_hypers.update(best_hyperparameters, stop_early=False)
m, v = quick_train(model, dataset, prep_hypers, train_hypers, val_metric, test_metrics)
# Display the first value returned by `quick_train`.
m