In [1]:
import sys
import os
import logging
import papermill as pm
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import tensorflow.compat.v1 as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    prepare_hparams
)

from resources.data_preprocessing import data_preprocessing
# from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing
from recommenders.datasets.download_utils import maybe_download


# from recommenders.models.deeprec.models.sequential.sli_rec import SLI_RECModel as SeqModel
####  to use the other model, use one of the following lines:
#from recommenders.models.deeprec.models.sequential.asvd import A2SVDModel as SeqModel
# from recommenders.models.deeprec.models.sequential.caser import CaserModel as SeqModel
# from recommenders.models.deeprec.models.sequential.gru4rec import GRU4RecModel as SeqModel
from recommenders.models.deeprec.models.sequential.sum import SUMModel as SeqModel

#from recommenders.models.deeprec.models.sequential.nextitnet import NextItNetModel

from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
#from recommenders.models.deeprec.io.nextitnet_iterator import NextItNetIterator

# Report the interpreter and TensorFlow versions used for this run.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.8.9 (tags/v3.8.9:a743f81, Apr  6 2021, 14:02:34) [MSC v.1928 64 bit (AMD64)]
Tensorflow version: 2.8.0


In [2]:
# ATTENTION: point this at the config file that matches the imported model,
# e.g. caser.yaml for CaserModel or sum.yaml for SUMModel.
# Alternative kept for reference (SLi-Rec config inside the recommenders source tree):
#   yaml_file = '../../recommenders/models/deeprec/config/sli_rec.yaml'
yaml_file = "./sum.yaml"

In [3]:
# Training configuration passed on to prepare_hparams and the model constructor.
EPOCHS, BATCH_SIZE = 10, 400
RANDOM_SEED = SEED  # use None instead for a non-deterministic result

# Directory that holds the dataset files and the outputs written by this notebook.
# (os.path.join with a single argument returns that argument unchanged, so the
# literal is used directly.)
data_path = "resources/"

In [4]:
# Negative-sampling setup: number of negative instances paired with each positive one.
train_num_ngs = 4  # for training
valid_num_ngs = 4  # for validation
test_num_ngs = 9  # for testing
sample_rate = 0.01  # sample a small item set for training and testing here for fast example

# Dataset splits, vocabulary files and the prediction output, all located under data_path.
train_file = os.path.join(data_path, "train_data")
valid_file = os.path.join(data_path, "valid_data")
test_file = os.path.join(data_path, "test_data")
user_vocab = os.path.join(data_path, "user_vocab.pkl")
item_vocab = os.path.join(data_path, "item_vocab.pkl")
cate_vocab = os.path.join(data_path, "cate_vocab.pkl")
output_file = os.path.join(data_path, "output_sum.txt")

# Disabled: raw review/meta inputs for the commented-out download_and_extract path.
# reviews_name = 'json'
# meta_name = 'json'
# reviews_file = os.path.join(data_path, reviews_name)
# meta_file = os.path.join(data_path, meta_name)

# Positional arguments for data_preprocessing, in the order they are unpacked into the call.
input_files = [
    data_path,
    train_file,
    valid_file,
    test_file,
    user_vocab,
    item_vocab,
    cate_vocab,
]

# Preprocessing runs only when train_file is absent.
# NOTE(review): the other split/vocab files are not checked here.
if not os.path.exists(train_file):
    # download_and_extract(reviews_name, reviews_file)
    # download_and_extract(meta_name, meta_file)
    data_preprocessing(
        *input_files,
        sample_rate=sample_rate,
        valid_num_ngs=valid_num_ngs,
        test_num_ngs=test_num_ngs,
    )
    #### uncomment this for the NextItNet model, because it does not need to unfold the user history
    # data_preprocessing(*input_files, sample_rate=sample_rate, valid_num_ngs=valid_num_ngs, test_num_ngs=test_num_ngs, is_history_expanding=False)


In [5]:
### NOTE:
### If you are using your own dataset rather than the demo Amazon dataset, remember to call
### `_create_vocab(train_file, user_vocab, item_vocab, cate_vocab)` to generate the
### user_vocab, item_vocab and cate_vocab files.
hparams = prepare_hparams(
    yaml_file,
    # optimisation settings
    learning_rate=0.001,  # set to 0.01 if batch normalization is disabled
    embed_l2=0.,
    layer_l2=0.,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    show_step=20,
    # where models and summaries are stored
    MODEL_DIR=os.path.join(data_path, "model", "sum/"),
    SUMMARIES_DIR=os.path.join(data_path, "summary", "sum/"),
    # vocabulary files
    user_vocab=user_vocab,
    item_vocab=item_vocab,
    cate_vocab=cate_vocab,
    # negative sampling
    need_sample=True,
    train_num_ngs=train_num_ngs,  # number of negative instances for each positive instance for loss computation
)

In [6]:
# Data iterator class; it is passed to the SeqModel constructor.
input_creator = SequentialIterator
#### Uncomment this for the NextItNet model, because it needs a special data iterator for training
#input_creator = NextItNetIterator

In [7]:
# Build the model class imported as SeqModel from the hyper-parameters and the iterator class.
model = SeqModel(hparams, input_creator, seed=RANDOM_SEED)

## Sometimes we don't want to train a model from scratch;
## in that case a pre-trained model can be loaded like this:
#model.load_model(r'your_model_path')

  self._erase_W = self.add_variable(
  self._erase_b = self.add_variable(
  self._reset_W = self.add_variable(
  self._reset_b = self.add_variable(
  self._add_W = self.add_variable(
  self._add_b = self.add_variable(
  self.heads = self.add_variable(
  self._beta = self.add_variable(
  self._alpha = self.add_variable(
  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


In [8]:
# Evaluate on the test file before fit() has been called, as a reference point.
# test_num_ngs is the number of negative lines after each positive line in your test_file
print(model.run_eval(test_file, num_ngs=test_num_ngs)) 

{'auc': 0.5123, 'logloss': 0.6932, 'mean_mrr': 0.2811, 'ndcg@2': 0.1449, 'ndcg@4': 0.2484, 'ndcg@6': 0.3269, 'group_auc': 0.5127}


In [9]:
# Fit the model while timing the run.
# valid_num_ngs is the number of negative lines after each positive line in valid_file;
# the model's performance on valid_file is evaluated every epoch.
with Timer() as train_time:
    model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs)

print(f'Time cost for training is {train_time.interval / 60.0:.2f} mins')

step 20 , total_loss: 1.4799, data_loss: 1.4799
step 40 , total_loss: 1.3720, data_loss: 1.3720
step 60 , total_loss: 1.2900, data_loss: 1.2900
step 80 , total_loss: 1.3291, data_loss: 1.3291
step 100 , total_loss: 1.2899, data_loss: 1.2899
step 120 , total_loss: 1.2870, data_loss: 1.2870
step 140 , total_loss: 1.2741, data_loss: 1.2741
step 160 , total_loss: 1.3022, data_loss: 1.3022
step 180 , total_loss: 1.2206, data_loss: 1.2206
step 200 , total_loss: 1.2368, data_loss: 1.2368
step 220 , total_loss: 1.2290, data_loss: 1.2290
step 240 , total_loss: 1.2345, data_loss: 1.2345
step 260 , total_loss: 1.2742, data_loss: 1.2742
step 280 , total_loss: 1.1996, data_loss: 1.1996
step 300 , total_loss: 1.2116, data_loss: 1.2116
step 320 , total_loss: 1.2233, data_loss: 1.2233
step 340 , total_loss: 1.1729, data_loss: 1.1729
step 360 , total_loss: 1.2119, data_loss: 1.2119
step 380 , total_loss: 1.2462, data_loss: 1.2462
step 400 , total_loss: 1.2387, data_loss: 1.2387
step 420 , total_loss: 1

In [10]:
# Evaluate the trained model on the test file; the metrics are kept in res_syn for later use.
res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_syn)

{'auc': 0.8255, 'logloss': 0.5659, 'mean_mrr': 0.5999, 'ndcg@2': 0.5289, 'ndcg@4': 0.6456, 'ndcg@6': 0.6863, 'group_auc': 0.831}


In [11]:
# Attach the test metrics to this notebook as a scrapbook scrap named "res_syn".
sb.glue("res_syn", res_syn)

In [12]:
# Run prediction on test_file; presumably the scores are written to output_file.
# NOTE(review): `model` is rebound to predict()'s return value, which relies on
# predict() returning the model object itself - confirm against the library.
model = model.predict(test_file, output_file)

In [13]:
# Create a fresh model instance and load into it the checkpoint stored as
# "best_model" under hparams.MODEL_DIR.
path_best_trained = os.path.join(hparams.MODEL_DIR, "best_model")
model_best_trained = SeqModel(hparams, input_creator, seed=RANDOM_SEED)
print(f'loading saved model in {path_best_trained}')
model_best_trained.load_model(path_best_trained)

  self._erase_W = self.add_variable(
  self._erase_b = self.add_variable(
  self._reset_W = self.add_variable(
  self._reset_b = self.add_variable(
  self._add_W = self.add_variable(
  self._add_b = self.add_variable(
  self.heads = self.add_variable(
  self._beta = self.add_variable(
  self._alpha = self.add_variable(
  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


loading saved model in resources/model\sum/best_model


In [14]:
# Evaluate the reloaded checkpoint on the test file; the cell displays the returned metrics.
model_best_trained.run_eval(test_file, num_ngs=test_num_ngs)

{'auc': 0.8255,
 'logloss': 0.5659,
 'mean_mrr': 0.5999,
 'ndcg@2': 0.5289,
 'ndcg@4': 0.6456,
 'ndcg@6': 0.6863,
 'group_auc': 0.831}

In [15]:
# Run prediction with the reloaded model.
# NOTE(review): this uses the same output_file as the earlier predict() call, so that
# file is presumably overwritten - confirm this is intended.
model_best_trained.predict(test_file, output_file)

<recommenders.models.deeprec.models.sequential.sum.SUMModel at 0x1458576be80>