In [1]:
import os
# Move the working directory two levels up; the relative 'dataset/...' path used
# further down presumably resolves from there (TODO confirm it is the repo root).
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../../')

In [2]:
import data
import preprocess_utils.session2vec as sess2vec
import utils.sparsedf as sparsedf
from utils.df import scale_dataframe
from utils.dataset import SequenceDataset
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from IPython.display import display
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

Using TensorFlow backend.


In [3]:
from recommenders.recommender_base import RecommenderBase
from recommenders.recurrent.gru4rec import GRU4Rec
from recommenders.recurrent.hgru4rec import HGRU4Rec

from sklearn.preprocessing import MinMaxScaler

In [4]:
# Dataset variant: interpolated into the preprocessed-data path and passed to
# data.target_indices() in later cells.
mode = 'small'

In [5]:
class GRU4RecRecommender(RecommenderBase):
    """
    A **simplified** interface to Recurrent Neural Network models for Session-based recommendation.
    Based on the following two papers:

    * Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018
    * Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017

    Depending on ``personalized`` the wrapped model is either ``GRU4Rec`` (session-only)
    or ``HGRU4Rec`` (session + user level). The model is created and trained in ``fit()``.
    """

    def __init__(self,
                 dataset,
                 session_layers,
                 user_layers=None,
                 batch_size=32,
                 learning_rate=0.1,
                 momentum=0.0,
                 dropout=None,
                 epochs=10,
                 personalized=True):
        """
        :param dataset: dataset object; its ``mode`` and ``cluster`` attributes are forwarded to the
            base class and its ``load_train()`` method provides the training data in ``fit()``.
        :param session_layers: number of units per layer used at session level.
            It has to be a list of integers for multi-layer networks, or a integer value for single-layer networks.
        :param user_layers: number of units per layer used at user level. Required only by personalized models.
            It has to be a list of integers for multi-layer networks, or a integer value for single-layer networks.
        :param batch_size: the mini-batch size used in training
        :param learning_rate: the learning rate used in training (Adagrad optimized)
        :param momentum: the momentum coefficient used in training
        :param dropout: dropout coefficients.
            If personalized=False, it's a float value for the hidden-layer(s) dropout.
            If personalized=True, it's a 3-tuple with the values for the dropout of (user hidden, session hidden, user-to-session hidden) layers.
            When None, it defaults to 0.0 (non-personalized) or (0.0, 0.0, 0.0) (personalized).
        :param epochs: number of training epochs
        :param personalized: whether to train a personalized model using the HRNN model.
            It will require user ids at prediction time.
        """
        self.dataset = dataset

        super().__init__(dataset.mode, dataset.cluster, 'RNNRecommender')

        # Accept a bare integer as shorthand for a single-layer network.
        if isinstance(session_layers, int):
            session_layers = [session_layers]
        if isinstance(user_layers, int):
            user_layers = [user_layers]
        self.session_layers = session_layers
        self.user_layers = user_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        if dropout is None:
            dropout = (0.0, 0.0, 0.0) if personalized else 0.0
        self.dropout = dropout
        self.epochs = epochs
        self.personalized = personalized
        # Session id handed to the model at prediction time; bumped after every recommend() call.
        self.pseudo_session_id = 0

    def __str__(self):
        """Return a one-line summary of the hyper-parameters stored on the instance."""
        return 'RNNRecommender(' \
               'session_layers={session_layers}, ' \
               'user_layers={user_layers}, ' \
               'batch_size={batch_size}, ' \
               'learning_rate={learning_rate}, ' \
               'momentum={momentum}, ' \
               'dropout={dropout}, ' \
               'epochs={epochs}, ' \
               'personalized={personalized}, ' \
               ')'.format(**self.__dict__)

    def fit(self):
        """
        Build the underlying model (GRU4Rec or HGRU4Rec) and train it on ``self.dataset.load_train()``.

        :raises ValueError: for personalized models, when ``user_layers`` is missing or
            ``dropout`` is not a sequence of exactly three values.
        """
        # Use the dataset given to the constructor rather than whatever global
        # happens to be called `dataset` in the calling namespace.
        train_data = self.dataset.load_train()

        if not self.personalized:
            # fit GRU4Rec
            self.model = GRU4Rec(layers=self.session_layers,
                                 n_epochs=self.epochs,
                                 batch_size=self.batch_size,
                                 learning_rate=self.learning_rate,
                                 momentum=self.momentum,
                                 dropout_p_hidden=self.dropout,
                                 session_key='session_id',
                                 item_key='reference',
                                 time_key='timestamp')
        else:
            if self.user_layers is None:
                raise ValueError('You should set the value of user_layers before training the personalized model.')

            # A scalar dropout has no len(); report it with the same ValueError
            # as a wrongly sized sequence instead of leaking a TypeError.
            try:
                n_dropout = len(self.dropout)
            except TypeError:
                n_dropout = None
            if n_dropout != 3:
                raise ValueError('dropout should be a 3-tuple with '
                                 '(user hidden, session hidden, user-to-session hidden) dropout values.')

            self.model = HGRU4Rec(session_layers=self.session_layers,
                                  user_layers=self.user_layers,
                                  batch_size=self.batch_size,
                                  n_epochs=self.epochs,
                                  learning_rate=self.learning_rate,
                                  momentum=self.momentum,
                                  dropout_p_hidden_usr=self.dropout[0],
                                  dropout_p_hidden_ses=self.dropout[1],
                                  dropout_p_init=self.dropout[2],
                                  session_key='session_id',
                                  user_key='user_id',
                                  item_key='reference',
                                  time_key='timestamp')

        self.model.fit(train_data)

    def recommend(self, user_profile, user_id=None):
        """
        Feed the items of ``user_profile`` to the model one at a time and rank by the last prediction.

        :param user_profile: iterable of items of the current session, in order.
        :param user_id: id of the user; required when ``personalized=True``.
        :return: list of ``([index_value], score)`` tuples sorted by decreasing score, built from
            the frame returned by the last ``predict_next_batch`` call
            (presumably its index holds the item ids -- TODO confirm).
        :raises ValueError: if ``user_id`` is missing for a personalized model,
            or if ``user_profile`` contains no items.
        """
        if self.personalized and user_id is None:
            raise ValueError('user_id required by personalized models')

        session_ids = np.array([self.pseudo_session_id])
        pred = None
        for item in user_profile:
            if self.personalized:
                pred = self.model.predict_next_batch(session_ids,
                                                     np.array([item]),
                                                     np.array([user_id]),
                                                     batch=1)
            else:
                pred = self.model.predict_next_batch(session_ids,
                                                     np.array([item]),
                                                     batch=1)

        if pred is None:
            # Without this guard an empty profile fails later with an unbound `pred`.
            raise ValueError('user_profile must contain at least one item')

        # sort items by predicted score (on a copy, so the frame returned by the model is left untouched)
        pred = pred.sort_values(by=0, ascending=False)
        # increase the pseudo-session id so that future calls to recommend() won't be connected
        self.pseudo_session_id += 1
        # convert to the required output format
        return [([x.index], x._2) for x in pred.reset_index().itertuples()]

    def recommend_batch(self, x):
        """
        Call ``recommend`` for every element of ``x`` and return the results as a list.

        :param x: iterable of elements supporting ``i[1, :]`` (used as the profile) and ``i.user_id``.
            NOTE(review): the exact type of these elements cannot be told from this file -- confirm
            against the caller.
        :return: list with one ``recommend`` result per element of ``x``.
        """
        return [self.recommend(i[1, :], i.user_id) for i in x]

    def get_scores_batch(self):
        """Not implemented: does nothing and returns None."""
        pass

## Dataset

In [6]:
# Sequence dataset pointing at the preprocessed 'cluster_recurrent' folder for the selected mode.
dataset = SequenceDataset(f'dataset/preprocessed/cluster_recurrent/{mode}')

In [10]:
# Load the training data and preview its first 40 rows.
train_df = dataset.load_train()
train_df.head(40)

train_vec: (128209, 23)




Unnamed: 0_level_0,user_id,session_id,timestamp,step,reference,platform,city,current_filters,impression_price,mobile,desktop,tablet,show_impression,clickout item,interaction item rating,interaction item info,interaction item image,interaction item deals,change of sort order,filter selection,search for item,search for destination,search for poi
orig_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
27774,0004IOZI7CKF,0146f7cb014ba,1541266717,1.0,"Valencia, Spain",DE,"Valencia, Spain",0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
27775,0004IOZI7CKF,0146f7cb014ba,1541266769,2.0,3381482,DE,"Valencia, Spain",0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062525,0004IOZI7CKF,0146f7cb014ba,1541266796,2.04,110976,DE,"Valencia, Spain",0,0.008513,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062526,0004IOZI7CKF,0146f7cb014ba,1541266796,2.0784,88759,DE,"Valencia, Spain",0,0.010072,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062527,0004IOZI7CKF,0146f7cb014ba,1541266796,2.1168,150904,DE,"Valencia, Spain",0,0.018106,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062528,0004IOZI7CKF,0146f7cb014ba,1541266796,2.1552,3549934,DE,"Valencia, Spain",0,0.017266,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062529,0004IOZI7CKF,0146f7cb014ba,1541266796,2.1936,4084196,DE,"Valencia, Spain",0,0.010432,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062530,0004IOZI7CKF,0146f7cb014ba,1541266796,2.232,2094980,DE,"Valencia, Spain",0,0.008993,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062531,0004IOZI7CKF,0146f7cb014ba,1541266796,2.2704,110979,DE,"Valencia, Spain",0,0.009472,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99062532,0004IOZI7CKF,0146f7cb014ba,1541266796,2.3088,110981,DE,"Valencia, Spain",0,0.011631,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train the model

In [8]:
# Personalized recommender (HGRU4Rec is built inside fit()): one 20-unit session layer,
# one 20-unit user layer, and dropout 0.1 for each of
# (user hidden, session hidden, user-to-session hidden).
model = GRU4RecRecommender(dataset,
                            session_layers=[20], 
                            user_layers=[20],
                            batch_size=16,
                            learning_rate=0.5,
                            momentum=0.1,
                            dropout=(0.1,0.1,0.1),
                            epochs=5,
                            personalized=True)

In [9]:
# Load the training data and train the underlying model.
# NOTE(review): the output saved with this cell is a Theano compilation error
# ('stdio.h' file not found), so the recorded run did not complete training.
model.fit()



train_vec: (128209, 23)

You can find the C code in this temporary file: /var/folders/cf/sn3_1n6n2f7f07qb9l48l2vh0000gn/T/theano_compilation_error_5uzzmwes


Exception: ('The following error happened while compiling the node', DotModulo(A, s, m, A2, s2, m2), '\n', "Compilation failed (return status=1): In file included from /Users/federico/.theano/compiledir_Darwin-18.5.0-x86_64-i386-64bit-i386-3.6.8-64/tmpzyg0hgby/mod.cpp:1:. In file included from /Users/federico/miniconda3/envs/recsys/include/python3.6m/Python.h:25:. /Users/federico/miniconda3/envs/recsys/bin/../include/c++/v1/stdio.h:108:15: fatal error: 'stdio.h' file not found. #include_next <stdio.h>.               ^~~~~~~~~. 1 error generated.. ", '[DotModulo(A, s, m, A2, s2, m2)]')

## Debug

In [None]:
# Build the train/validation generators, then fetch the first training batch by subscript.
train_gen, val_gen = dataset.get_train_validation_generator()
batch_x, batch_y = train_gen[0]

In [None]:
# Shape of batch_x taken from the first training batch.
batch_x.shape

## Use the model

In [None]:
# NOTE(review): loading/scaling of X_test_df is commented out here, yet a later cell
# (sess2vec.sessions2tensor) still reads X_test_df -- confirm which one is intended.
#X_test_df = pd.read_csv(f'dataset/preprocessed/cluster_recurrent/{mode}/X_test.csv').set_index('orig_index')
#X_test_df = scale_dataframe(X_test_df, ['impression_price'])
# Target indices for this mode and the 'cluster_recurrent' cluster
# (presumably orig_index values; they are used with pred_df.loc in the last cell).
target_indices = data.target_indices(mode, 'cluster_recurrent')
#X_test_df.head(3)

In [None]:
# Peek at the first 10 target indices.
target_indices[:10]

In [None]:
# Produce batch recommendations and pass them to compute_MRR
# (compute_MRR is not defined in GRU4RecRecommender; presumably inherited from RecommenderBase).
# NOTE(review): GRU4RecRecommender.recommend_batch is declared with a required argument `x`,
# but none is passed here, so this call raises TypeError as written.
recomendations = model.recommend_batch()
model.compute_MRR(recomendations)

### Recommend batch

In [None]:
# Convert the test sessions into a tensor, dropping the listed columns, and also get the row indices back.
# NOTE(review): X_test_df is only assigned in commented-out code in an earlier cell,
# so this raises NameError on a fresh kernel.
tensor, indices = sess2vec.sessions2tensor(X_test_df, drop_cols=['user_id','session_id','step','reference','platform','city','current_filters'], return_index=True)

In [None]:
# Show the tensor shape and the indices shape, one per line.
print(tensor.shape, indices.shape, sep='\n')

In [None]:
# Run the wrapped model on the test input and show the shape of the result.
# NOTE(review): X_test is not defined anywhere in the visible notebook, and whether the
# wrapped model exposes predict() cannot be confirmed from here.
predictions = model.model.predict(X_test)
predictions.shape

In [None]:
# Collapse predictions to 2-D (all leading axes merged, last axis kept) and flatten the
# indices to 1-D; the indices are attached to the prediction rows in a later cell.
predictions = predictions.reshape((-1, predictions.shape[-1]))
indices = indices.flatten()

In [None]:
# Show the shapes of the flattened predictions and indices, one per line.
print(predictions.shape, indices.shape, sep='\n')

In [None]:
# Wrap the flattened predictions in a DataFrame and preview the first rows.
pred_df = pd.DataFrame(predictions)
pred_df.head()

In [None]:
# Attach the flattened indices as an 'orig_index' column and make it the DataFrame index.
pred_df['orig_index'] = indices
pred_df = pred_df.set_index('orig_index')

In [None]:
# Preview the predictions now indexed by orig_index.
pred_df.head()

In [None]:
# Select the prediction rows whose index is in target_indices.
pred_df.loc[target_indices]