In [1]:
import sys
import os
import logging
import papermill as pm
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import tensorflow.compat.v1 as tf
# Raise the TensorFlow logger threshold to ERROR so lower-severity messages
# do not clutter the cell outputs.
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    prepare_hparams
)

from resources.data_preprocessing2 import data_preprocessing
# from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing
from recommenders.datasets.download_utils import maybe_download


from recommenders.models.deeprec.models.sequential.sli_rec import SLI_RECModel as SeqModel
#### To use a different model, uncomment one of the following lines instead:
#from recommenders.models.deeprec.models.sequential.asvd import A2SVDModel as SeqModel
# from recommenders.models.deeprec.models.sequential.caser import CaserModel as SeqModel
# from recommenders.models.deeprec.models.sequential.gru4rec import GRU4RecModel as SeqModel
# from recommenders.models.deeprec.models.sequential.sum import SUMModel as SeqModel

#from recommenders.models.deeprec.models.sequential.nextitnet import NextItNetModel

from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
#from recommenders.models.deeprec.io.nextitnet_iterator import NextItNetIterator

# Record the interpreter and TensorFlow versions next to the stored outputs.
print(
    "System version: {}".format(sys.version),
    "Tensorflow version: {}".format(tf.__version__),
    sep="\n",
)

System version: 3.8.9 (tags/v3.8.9:a743f81, Apr  6 2021, 14:02:34) [MSC v.1928 64 bit (AMD64)]
Tensorflow version: 2.8.0


In [2]:
## ATTENTION: point this at the config file that matches the selected model,
## e.g. caser.yaml for CaserModel or sum.yaml for SUMModel.
# Alternative location inside the recommenders source tree:
# yaml_file = '../../recommenders/models/deeprec/config/sli_rec.yaml'
yaml_file = "./sli_rec.yaml"

In [3]:
# Run-level settings. EPOCHS and BATCH_SIZE are forwarded to prepare_hparams,
# RANDOM_SEED to the model constructor.
RANDOM_SEED = SEED  # Set None for non-deterministic result
BATCH_SIZE = 400
EPOCHS = 10

# Directory that holds the prepared data files and, via the cells below, the
# model, summary and prediction artifacts. A single-argument os.path.join
# returns its argument unchanged, so the literal is assigned directly.
data_path = "resources/results/20220504"

In [4]:
# for test
# Dataset splits, vocabularies and the prediction output all live under data_path.
train_file, valid_file, test_file = (
    os.path.join(data_path, split_name)
    for split_name in ("train_data", "valid_data", "test_data")
)
user_vocab, item_vocab, cate_vocab = (
    os.path.join(data_path, vocab_name)
    for vocab_name in ("user_vocab.pkl", "item_vocab.pkl", "cate_vocab.pkl")
)
output_file = os.path.join(data_path, "output_sli_rec.txt")

# Kept from the upstream example (raw review/meta inputs, not used here):
# reviews_name = 'json'
# meta_name = 'json'
# reviews_file = os.path.join(data_path, reviews_name)
# meta_file = os.path.join(data_path, meta_name)

# Number of negative instances paired with each positive instance.
train_num_ngs = 4  # for training
valid_num_ngs = 4  # for validation
test_num_ngs = 9  # for testing
sample_rate = 0.1  # sample a small item set for training and testing here for fast example

# Positional arguments for data_preprocessing, in the order the call below passes them.
input_files = [
    data_path,
    train_file,
    valid_file,
    test_file,
    user_vocab,
    item_vocab,
    cate_vocab,
]

# Run the preprocessing only when the training file is not on disk yet.
if not os.path.exists(train_file):
    # download_and_extract(reviews_name, reviews_file)
    # download_and_extract(meta_name, meta_file)
    data_preprocessing(
        *input_files,
        sample_rate=sample_rate,
        valid_num_ngs=valid_num_ngs,
        test_num_ngs=test_num_ngs,
    )
    #### For the NextItNet model use the call below instead, because it does not need to unfold the user history
    # data_preprocessing(*input_files, sample_rate=sample_rate, valid_num_ngs=valid_num_ngs, test_num_ngs=test_num_ngs, is_history_expanding=False)


In [5]:
### NOTE:
### If you are using your own dataset rather than the demo Amazon dataset, remember to call
### `_create_vocab(train_file, user_vocab, item_vocab, cate_vocab)` to generate the
### user_vocab, item_vocab and cate_vocab files.
hparams = prepare_hparams(
    yaml_file,
    # --- optimisation ---
    embed_l2=0.0,
    layer_l2=0.0,
    learning_rate=0.001,  # set to 0.01 if batch normalization is disabled
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    show_step=20,
    # --- output locations ---
    MODEL_DIR=os.path.join(data_path, "model", "sli_rec/"),
    SUMMARIES_DIR=os.path.join(data_path, "summary", "sli_rec/"),
    # --- vocabularies ---
    user_vocab=user_vocab,
    item_vocab=item_vocab,
    cate_vocab=cate_vocab,
    # --- negative sampling ---
    need_sample=True,
    train_num_ngs=train_num_ngs,  # number of negative instances for each positive instance for loss computation
)

In [6]:
# Iterator class that is handed to the model constructor in the next cell.
input_creator = SequentialIterator
#### Uncomment this for the NextItNet model, because it needs a special data iterator for training
#input_creator = NextItNetIterator

In [7]:
# Instantiate the selected sequential model from the hyper-parameters and the
# iterator class; RANDOM_SEED is passed as the seed.
model = SeqModel(hparams, input_creator, seed=RANDOM_SEED)

## Sometimes we don't want to train a model from scratch;
## in that case a pre-trained model can be loaded like this:
#model.load_model(r'your_model_path')

  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


In [8]:
# Evaluate on the test file before the training cell runs, as a reference point.
# test_num_ngs is the number of negative lines after each positive line in your test_file
print(model.run_eval(test_file, num_ngs=test_num_ngs)) 

{'auc': 0.5504, 'logloss': 0.6931, 'mean_mrr': 0.322, 'ndcg@2': 0.1975, 'ndcg@4': 0.2897, 'ndcg@6': 0.3625, 'group_auc': 0.5484}


In [9]:
# valid_num_ngs is the number of negative lines after each positive line in your valid_file;
# the model's performance on valid_file is evaluated every epoch.
# Timer measures the wall-clock duration of the fit call.
with Timer() as train_time:
    model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs)

print('Time cost for training is {0:.2f} mins'.format(train_time.interval / 60.0))

step 20 , total_loss: 1.5138, data_loss: 1.5138
step 40 , total_loss: 1.3492, data_loss: 1.3492
step 60 , total_loss: 1.3590, data_loss: 1.3590
step 80 , total_loss: 1.2899, data_loss: 1.2899
step 100 , total_loss: 1.3073, data_loss: 1.3073
step 120 , total_loss: 1.2321, data_loss: 1.2321
step 140 , total_loss: 1.2895, data_loss: 1.2895
step 160 , total_loss: 1.3049, data_loss: 1.3049
step 180 , total_loss: 1.2702, data_loss: 1.2702
step 200 , total_loss: 1.2973, data_loss: 1.2973
step 220 , total_loss: 1.2337, data_loss: 1.2337
step 240 , total_loss: 1.2505, data_loss: 1.2505
step 260 , total_loss: 1.2658, data_loss: 1.2658
step 280 , total_loss: 1.2386, data_loss: 1.2386
step 300 , total_loss: 1.2466, data_loss: 1.2466
step 320 , total_loss: 1.1930, data_loss: 1.1930
step 340 , total_loss: 1.2470, data_loss: 1.2470
step 360 , total_loss: 1.2177, data_loss: 1.2177
step 380 , total_loss: 1.1590, data_loss: 1.1590
step 400 , total_loss: 1.2311, data_loss: 1.2311
step 420 , total_loss: 1

In [10]:
# Evaluate the trained model on the test file; the metrics are kept in res_syn
# so the next cell can record them.
res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_syn)

{'auc': 0.8907, 'logloss': 0.4206, 'mean_mrr': 0.6947, 'ndcg@2': 0.654, 'ndcg@4': 0.7474, 'ndcg@6': 0.7679, 'group_auc': 0.8907}


In [11]:
# Attach the evaluation metrics to the notebook under the name "res_syn" via scrapbook
# (presumably so an external runner can read them back — confirm against the caller).
sb.glue("res_syn", res_syn)

In [12]:
# Run prediction over test_file with output_file as the destination argument.
# NOTE(review): `model` is rebound to predict()'s return value; the recorded output of the
# last cell shows predict() returning an SLI_RECModel object, so this appears to be the model itself.
model = model.predict(test_file, output_file)

In [7]:
# Build a second model instance with the same hyper-parameters, iterator and seed,
# then load the weights stored at <MODEL_DIR>/best_model.
# NOTE(review): "best_model" is presumably written by fit() during training — confirm.
model_best_trained = SeqModel(hparams, input_creator, seed=RANDOM_SEED)
path_best_trained = os.path.join(hparams.MODEL_DIR, "best_model")
print('loading saved model in {0}'.format(path_best_trained))
model_best_trained.load_model(path_best_trained)

  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


loading saved model in resources/results/20220504\model\sli_rec/best_model


In [8]:
# Evaluate the reloaded model on the test file; the metrics are shown as the cell's output.
model_best_trained.run_eval(test_file, num_ngs=test_num_ngs)

{'auc': 0.8907,
 'logloss': 0.4206,
 'mean_mrr': 0.6947,
 'ndcg@2': 0.654,
 'ndcg@4': 0.7474,
 'ndcg@6': 0.7679,
 'group_auc': 0.8907}

In [9]:
# Run prediction with the reloaded model, using the same output_file as the earlier predict cell.
# The value displayed as this cell's output is the object returned by predict().
model_best_trained.predict(test_file, output_file)

<recommenders.models.deeprec.models.sequential.sli_rec.SLI_RECModel at 0x2130339aa00>