In [1]:
import sys
import os
import logging
import papermill as pm
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import tensorflow.compat.v1 as tf
# Set the TensorFlow logger's level to ERROR for the rest of the notebook.
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    prepare_hparams
)

from resources.data_preprocessing import data_preprocessing
# from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing
from recommenders.datasets.download_utils import maybe_download


# from recommenders.models.deeprec.models.sequential.sli_rec import SLI_RECModel as SeqModel
#### To use a different model, enable exactly one of the SeqModel imports in this group and comment out the others:
#from recommenders.models.deeprec.models.sequential.asvd import A2SVDModel as SeqModel
from recommenders.models.deeprec.models.sequential.caser import CaserModel as SeqModel
# from recommenders.models.deeprec.models.sequential.gru4rec import GRU4RecModel as SeqModel
# from recommenders.models.deeprec.models.sequential.sum import SUMModel as SeqModel

#from recommenders.models.deeprec.models.sequential.nextitnet import NextItNetModel

from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
#from recommenders.models.deeprec.io.nextitnet_iterator import NextItNetIterator

# Report the Python interpreter and TensorFlow versions used for this run.
for banner, version in (
    ("System version: {}", sys.version),
    ("Tensorflow version: {}", tf.__version__),
):
    print(banner.format(version))

System version: 3.8.9 (tags/v3.8.9:a743f81, Apr  6 2021, 14:02:34) [MSC v.1928 64 bit (AMD64)]
Tensorflow version: 2.8.0


In [2]:
## ATTENTION: point yaml_file at the config file that matches the imported SeqModel,
## e.g., caser.yaml for CaserModel, sum.yaml for SUMModel.
# yaml_file = '../../recommenders/models/deeprec/config/sli_rec.yaml'
yaml_file = './caser.yaml'

In [3]:
# Directory (relative to the working directory) under which this notebook builds
# its dataset, vocabulary, model and output paths.
data_path = "resources/"

# Training settings forwarded to prepare_hparams.
EPOCHS, BATCH_SIZE = 10, 400

# Seed handed to the model constructor; set to None for non-deterministic results.
RANDOM_SEED = SEED

In [4]:
# Dataset splits and vocabulary pickles, all located under data_path.
train_file, valid_file, test_file = (
    os.path.join(data_path, split_name)
    for split_name in ('train_data', 'valid_data', 'test_data')
)
user_vocab, item_vocab, cate_vocab = (
    os.path.join(data_path, vocab_name)
    for vocab_name in ('user_vocab.pkl', 'item_vocab.pkl', 'cate_vocab.pkl')
)
# File that the prediction cells write to.
output_file = os.path.join(data_path, 'output_caser.txt')

# (The download_and_extract step for raw review/meta files is not used in this notebook.)

# Number of negative instances paired with each positive instance, per split.
train_num_ngs = 4  # training
valid_num_ngs = 4  # validation
test_num_ngs = 9  # testing
# Sample a small item set for training and testing here for a fast example.
sample_rate = 0.01

# Positional arguments for data_preprocessing, in the order they are passed.
input_files = [data_path, train_file, valid_file, test_file, user_vocab, item_vocab, cate_vocab]

# Preprocess only when the training split does not exist yet.
train_file_missing = not os.path.exists(train_file)
if train_file_missing:
    preprocessing_options = dict(
        sample_rate=sample_rate,
        valid_num_ngs=valid_num_ngs,
        test_num_ngs=test_num_ngs,
    )
    data_preprocessing(*input_files, **preprocessing_options)
    #### use this call instead for the NextItNet model, because it does not need to unfold the user history
    # data_preprocessing(*input_files, is_history_expanding=False, **preprocessing_options)


In [5]:
### NOTE:
### if you are using your own dataset rather than the demo Amazon dataset, remember to call
### `_create_vocab(train_file, user_vocab, item_vocab, cate_vocab)` to generate the
### user_vocab, item_vocab and cate_vocab files.

# Keyword settings passed to prepare_hparams together with the YAML config file.
hparam_kwargs = {
    # regularization / optimization
    "embed_l2": 0.,
    "layer_l2": 0.,
    "learning_rate": 0.001,  # set to 0.01 if batch normalization is disabled
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "show_step": 20,
    # output directories
    "MODEL_DIR": os.path.join(data_path, "model", "caser/"),
    "SUMMARIES_DIR": os.path.join(data_path, "summary", "caser/"),
    # vocabulary files
    "user_vocab": user_vocab,
    "item_vocab": item_vocab,
    "cate_vocab": cate_vocab,
    # negative sampling
    "need_sample": True,
    # number of negative instances for each positive instance for loss computation
    "train_num_ngs": train_num_ngs,
}
hparams = prepare_hparams(yaml_file, **hparam_kwargs)

In [6]:
# Data iterator class that is passed to the SeqModel constructor.
input_creator = SequentialIterator
#### uncomment this for the NextItNet model, because it needs a special data iterator for training
#input_creator = NextItNetIterator

In [7]:
# Build the model class imported as SeqModel from the hyper-parameters,
# the data iterator class and the random seed.
model = SeqModel(hparams, input_creator, seed=RANDOM_SEED)

## Sometimes we don't want to train a model from scratch;
## in that case a pre-trained model can be loaded like this:
#model.load_model(r'your_model_path')

  return tf.compat.v1.layers.conv1d(
  return layer.apply(inputs)
  out_v = tf.compat.v1.layers.flatten(out_v)
  return layer.apply(inputs)
  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


In [8]:
# Baseline: evaluate the newly constructed model on the test file before fit() is called.
# test_num_ngs is the number of negative lines after each positive line in your test_file.
res_untrained = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_untrained)

{'auc': 0.5032, 'logloss': 0.6932, 'mean_mrr': 0.2804, 'ndcg@2': 0.1496, 'ndcg@4': 0.2359, 'ndcg@6': 0.3131, 'group_auc': 0.5056}


In [9]:
# Fit the model on train_file, timing the whole call.
# valid_num_ngs is the number of negative lines after each positive line in valid_file;
# the model's performance on valid_file is evaluated every epoch.
with Timer() as train_time:
    model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs)

training_minutes = train_time.interval / 60.0
print('Time cost for training is {0:.2f} mins'.format(training_minutes))

step 20 , total_loss: 1.4993, data_loss: 1.4993
step 40 , total_loss: 1.4213, data_loss: 1.4213
step 60 , total_loss: 1.3456, data_loss: 1.3456
step 80 , total_loss: 1.3380, data_loss: 1.3380
step 100 , total_loss: 1.3079, data_loss: 1.3079
step 120 , total_loss: 1.2797, data_loss: 1.2797
step 140 , total_loss: 1.3192, data_loss: 1.3192
step 160 , total_loss: 1.2867, data_loss: 1.2867
step 180 , total_loss: 1.2731, data_loss: 1.2731
step 200 , total_loss: 1.2542, data_loss: 1.2542
step 220 , total_loss: 1.2495, data_loss: 1.2495
step 240 , total_loss: 1.1705, data_loss: 1.1705
step 260 , total_loss: 1.2580, data_loss: 1.2580
step 280 , total_loss: 1.1555, data_loss: 1.1555
step 300 , total_loss: 1.1686, data_loss: 1.1686
step 320 , total_loss: 1.2010, data_loss: 1.2010
step 340 , total_loss: 1.2430, data_loss: 1.2430
step 360 , total_loss: 1.1813, data_loss: 1.1813
step 380 , total_loss: 1.2442, data_loss: 1.2442
step 400 , total_loss: 1.2015, data_loss: 1.2015
step 420 , total_loss: 1

In [10]:
# Evaluate the model returned by fit() on the test file and keep the metrics in
# res_syn, which the next cell passes to sb.glue.
res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_syn)

{'auc': 0.7988, 'logloss': 0.6248, 'mean_mrr': 0.5647, 'ndcg@2': 0.4828, 'ndcg@4': 0.6079, 'ndcg@6': 0.6545, 'group_auc': 0.8083}


In [11]:
# Hand the test metrics to scrapbook under the name "res_syn"
# (presumably so a papermill-driven run can read them back — confirm).
sb.glue("res_syn", res_syn)

In [12]:
# Run prediction on test_file with output_file as the second argument. Assigning the
# result back to `model` keeps the cell from displaying the returned object
# (presumably the model itself — confirm against predict()).
model = model.predict(test_file, output_file)

In [13]:
# Build a second model with the same hyper-parameters, iterator class and seed, then
# load the checkpoint named "best_model" from hparams.MODEL_DIR into it
# (presumably saved there during fit() — confirm).
model_best_trained = SeqModel(hparams, input_creator, seed=RANDOM_SEED)
path_best_trained = os.path.join(hparams.MODEL_DIR, "best_model")
print('loading saved model in {0}'.format(path_best_trained))
model_best_trained.load_model(path_best_trained)

  return tf.compat.v1.layers.conv1d(
  return layer.apply(inputs)
  out_v = tf.compat.v1.layers.flatten(out_v)
  return layer.apply(inputs)
  curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
  return layer.apply(inputs, training=training)


loading saved model in resources/model\caser/best_model


In [14]:
# Evaluate the reloaded best model on the test file; as the last expression of the
# cell, the returned metrics dict is displayed as the cell output.
model_best_trained.run_eval(test_file, num_ngs=test_num_ngs)

{'auc': 0.8284,
 'logloss': 0.5215,
 'mean_mrr': 0.5944,
 'ndcg@2': 0.5212,
 'ndcg@4': 0.6423,
 'ndcg@6': 0.6831,
 'group_auc': 0.83}

In [15]:
# Run prediction on test_file with the reloaded best model.
# The trailing semicolon stops the notebook from displaying the repr of the object
# returned by predict(), which carries no information for the reader.
# NOTE(review): this passes the same output_file as the earlier predict() call on
# `model`, so it presumably overwrites those predictions — confirm this is intended.
model_best_trained.predict(test_file, output_file);

<recommenders.models.deeprec.models.sequential.caser.CaserModel at 0x1c113e832e0>