In [4]:
import sys
import os
import logging
import papermill as pm
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import tensorflow.compat.v1 as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    prepare_hparams
)

from resources.data_preprocessing import data_preprocessing
# from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing
from recommenders.datasets.download_utils import maybe_download


from recommenders.models.deeprec.models.sequential.sli_rec import SLI_RECModel as SeqModel
#### To use a different model, uncomment one of the following lines instead:
#from recommenders.models.deeprec.models.sequential.asvd import A2SVDModel as SeqModel
# from recommenders.models.deeprec.models.sequential.caser import CaserModel as SeqModel
# from recommenders.models.deeprec.models.sequential.gru4rec import GRU4RecModel as SeqModel
# from recommenders.models.deeprec.models.sequential.sum import SUMModel as SeqModel

#from recommenders.models.deeprec.models.sequential.nextitnet import NextItNetModel

from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
#from recommenders.models.deeprec.io.nextitnet_iterator import NextItNetIterator

# Report the interpreter and TensorFlow versions used for this run.
version_report = [
    "System version: {}".format(sys.version),
    "Tensorflow version: {}".format(tf.__version__),
]
print("\n".join(version_report))

System version: 3.7.13 (default, Mar 16 2022, 17:37:17) 
[GCC 7.5.0]
Tensorflow version: 2.8.0


In [5]:
## ATTENTION: point this at the config file that matches the model imported as SeqModel,
## e.g. caser.yaml for CaserModel or sum.yaml for SUMModel.
# Alternative location:
# yaml_file = '../../recommenders/models/deeprec/config/sli_rec.yaml'
yaml_file = "./sli_rec.yaml"

In [6]:
# Training configuration used by the hyperparameter and model cells.
EPOCHS = 10
BATCH_SIZE = 400
RANDOM_SEED = SEED  # Set None for non-deterministic result

# Base directory that the data, vocabulary, model and summary paths are joined onto.
data_path = "resources/"

In [7]:
# Paths of the dataset splits, the vocabulary pickles and the prediction output,
# all located directly under data_path.
(
    train_file,
    valid_file,
    test_file,
    user_vocab,
    item_vocab,
    cate_vocab,
    output_file,
) = (
    os.path.join(data_path, file_name)
    for file_name in (
        "train_data",
        "valid_data",
        "test_data",
        "user_vocab.pkl",
        "item_vocab.pkl",
        "cate_vocab.pkl",
        "output_sli_rec.txt",
    )
)

# Disabled raw-data download inputs, kept for reference:
# reviews_name = 'json'
# meta_name = 'json'
# reviews_file = os.path.join(data_path, reviews_name)
# meta_file = os.path.join(data_path, meta_name)

# Negative-sampling settings.
train_num_ngs = 4  # number of negative instances with a positive instance for training
valid_num_ngs = 4  # number of negative instances with a positive instance for validation
test_num_ngs = 9  # number of negative instances with a positive instance for testing
sample_rate = 0.01  # sample a small item set for training and testing here for fast example

# Positional arguments forwarded to data_preprocessing, in call order.
input_files = [data_path, train_file, valid_file, test_file, user_vocab, item_vocab, cate_vocab]

# Preprocessing is skipped when the training split already exists on disk.
if not os.path.exists(train_file):
    # download_and_extract(reviews_name, reviews_file)
    # download_and_extract(meta_name, meta_file)
    data_preprocessing(
        *input_files,
        sample_rate=sample_rate,
        valid_num_ngs=valid_num_ngs,
        test_num_ngs=test_num_ngs,
    )
    #### use this call instead for the NextItNet model, because it does not need to unfold the user history
    # data_preprocessing(*input_files, sample_rate=sample_rate, valid_num_ngs=valid_num_ngs, test_num_ngs=test_num_ngs, is_history_expanding=False)


In [8]:
### NOTE:
### If you are using your own dataset rather than our demo Amazon dataset, remember to use
### `_create_vocab(train_file, user_vocab, item_vocab, cate_vocab)` to generate the
### user_vocab, item_vocab and cate_vocab files.
hparams = prepare_hparams(
    yaml_file,
    embed_l2=0.0,
    layer_l2=0.0,
    learning_rate=0.001,  # set to 0.01 if batch normalization is disabled
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    show_step=20,
    MODEL_DIR=os.path.join(data_path, "model", "sli_rec/"),
    SUMMARIES_DIR=os.path.join(data_path, "summary", "sli_rec/"),
    user_vocab=user_vocab,
    item_vocab=item_vocab,
    cate_vocab=cate_vocab,
    need_sample=True,
    # number of negative instances for each positive instance for loss computation
    train_num_ngs=train_num_ngs,
)

In [9]:
# Iterator class handed to the SeqModel constructor (the class itself, not an instance).
input_creator = SequentialIterator
#### uncomment this for the NextItNet model, because it needs a special data iterator for training
#input_creator = NextItNetIterator

In [10]:
# Build the model selected by the SeqModel import from the hyperparameters and the
# iterator class; RANDOM_SEED is passed as the seed.
model = SeqModel(hparams, input_creator, seed=RANDOM_SEED)

## sometimes we don't want to train a model from scratch
## then we can load a pre-trained model like this: 
#model.load_model(r'your_model_path')

  training=self.is_train_stage,
  return layer.apply(inputs, training=training)


In [11]:
# Evaluate on the test split before model.fit has been called.
# test_num_ngs is the number of negative lines after each positive line in your test_file
pretrain_metrics = model.run_eval(test_file, num_ngs=test_num_ngs)
print(pretrain_metrics)

{'auc': 0.5339, 'logloss': 0.6931, 'mean_mrr': 0.2909, 'ndcg@2': 0.158, 'ndcg@4': 0.2545, 'ndcg@6': 0.3326, 'group_auc': 0.5339}


In [12]:
# valid_num_ngs is the number of negative lines after each positive line in your valid_file;
# we will evaluate the performance of model on valid_file every epoch.
with Timer() as train_time:
    model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs)

# Timer reports its interval in seconds here; convert to minutes for the message.
training_minutes = train_time.interval / 60.0
print('Time cost for training is {0:.2f} mins'.format(training_minutes))

step 20 , total_loss: 1.5454, data_loss: 1.5454
step 40 , total_loss: 1.4382, data_loss: 1.4382
step 60 , total_loss: 1.4560, data_loss: 1.4560
step 80 , total_loss: 1.3815, data_loss: 1.3815
step 100 , total_loss: 1.3516, data_loss: 1.3516
step 120 , total_loss: 1.2937, data_loss: 1.2937
step 140 , total_loss: 1.3227, data_loss: 1.3227
step 160 , total_loss: 1.3556, data_loss: 1.3556
step 180 , total_loss: 1.2994, data_loss: 1.2994
step 200 , total_loss: 1.3157, data_loss: 1.3157
step 220 , total_loss: 1.2972, data_loss: 1.2972
step 240 , total_loss: 1.3546, data_loss: 1.3546
step 260 , total_loss: 1.2766, data_loss: 1.2766
step 280 , total_loss: 1.3071, data_loss: 1.3071
step 300 , total_loss: 1.3313, data_loss: 1.3313
step 320 , total_loss: 1.2846, data_loss: 1.2846
step 340 , total_loss: 1.3081, data_loss: 1.3081
step 360 , total_loss: 1.3228, data_loss: 1.3228
step 380 , total_loss: 1.2918, data_loss: 1.2918
step 400 , total_loss: 1.2388, data_loss: 1.2388
step 420 , total_loss: 1

In [13]:
# Evaluate on the test split after training; the result is kept in res_syn so that
# it can be recorded with scrapbook afterwards.
res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_syn)

{'auc': 0.9018, 'logloss': 0.3918, 'mean_mrr': 0.7208, 'ndcg@2': 0.6864, 'ndcg@4': 0.77, 'ndcg@6': 0.7871, 'group_auc': 0.8998}


In [14]:
# Record the test metrics with scrapbook under the name "res_syn".
sb.glue("res_syn", res_syn)

In [15]:
# Run prediction on test_file, passing output_file as the second argument.
# NOTE(review): `model` is rebound to predict()'s return value — presumably the model
# object itself; confirm against the library before relying on `model` afterwards.
model = model.predict(test_file, output_file)

In [16]:
# Build a second model with the same hyperparameters, iterator class and seed, then
# load the saved model found at <MODEL_DIR>/best_model into it.
# NOTE(review): presumably that checkpoint is written during fit — confirm.
model_best_trained = SeqModel(hparams, input_creator, seed=RANDOM_SEED)
path_best_trained = os.path.join(hparams.MODEL_DIR, "best_model")
print('loading saved model in {0}'.format(path_best_trained))
model_best_trained.load_model(path_best_trained)

  training=self.is_train_stage,
  return layer.apply(inputs, training=training)


loading saved model in resources/Recommenders/model/best_model


In [17]:
# Evaluate the reloaded model on the test split; as the last expression of the cell,
# the returned metrics are shown as the cell output.
model_best_trained.run_eval(test_file, num_ngs=test_num_ngs)

{'auc': 0.9018,
 'group_auc': 0.8998,
 'logloss': 0.3918,
 'mean_mrr': 0.7208,
 'ndcg@2': 0.6864,
 'ndcg@4': 0.77,
 'ndcg@6': 0.7871}

In [18]:
# Run prediction with the reloaded model, passing the same output_file as the earlier
# predict call (presumably overwriting that file — confirm).
model_best_trained.predict(test_file, output_file)

<recommenders.models.deeprec.models.sequential.sli_rec.SLI_RECModel at 0x7f2f69884910>