In [1]:
import os

# Move the working directory two levels up from the directory the notebook was
# started in (presumably the project root, so that the project-local imports and
# the relative dataset paths used below resolve -- TODO confirm).
# The starting directory is remembered on the first execution so that re-running
# this cell does not climb two more levels every time.
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, '..', '..'))

In [2]:
import data
import preprocess_utils.session2vec as sess2vec
import utils.sparsedf as sparsedf
from utils.df import MinMaxScaler
from utils.dataset import SequenceDatasetForClassification, SequenceDatasetForRegression
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from IPython.display import display
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

Using TensorFlow backend.


In [3]:
from recommenders.recommender_base import RecommenderBase
from recommenders.recurrent.RNNClassificationRecommender import RNNClassificationRecommender

from numpy.linalg import norm as L2Norm
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

import keras
from keras import metrics
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, GRU, Embedding, Dropout
from keras.callbacks import EarlyStopping

In [4]:
# Dataset variant used throughout the notebook: it is interpolated into the
# preprocessed dataset path, passed to data.target_indices, and checked by
# compute_MRR, which accepts only 'local' or 'small'.
mode = 'small'

## Load the dataset

In [5]:
# Handle on the preprocessed classification dataset for the selected mode;
# the train/test arrays are read from it later with load_Xtrain / load_Ytrain / load_Xtest.
dataset = SequenceDatasetForClassification(
    f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification_p6'
)

In [6]:
# Fixed seed: the shuffle decides which samples end up in the validation split
# taken by `fit(validation_split=...)`, so an unseeded shuffle makes every run
# train and validate on different data.
SHUFFLE_SEED = 42

x, y = dataset.load_Xtrain(), dataset.load_Ytrain()
# shuffle the samples and their labels together, keeping them aligned
x, y = shuffle(x, y, random_state=SHUFFLE_SEED)

  return self.partial_fit(X, y)


X_train: (37664, 6, 118)
Y_train: (37664, 25)


## Build the model

In [None]:
import keras.backend as K
import tensorflow as tf

def softmax_loss(y_true, y_pred):
    """
    Debug version of a softmax cross-entropy loss: every intermediate tensor is
    routed through K.print_tensor so that it is printed when the loss is evaluated.

    A softmax is applied to `y_pred` inside this function.
    NOTE(review): the model built in this notebook already ends with a softmax
    layer, so using this loss with it would apply softmax twice -- confirm intent.

    :param y_true: target tensor, multiplied element-wise with the log-probabilities
    :param y_pred: prediction tensor, passed through softmax here
    :return: minus the sum of `y_true * log(softmax(y_pred))` over ALL elements
        (a single value for the whole batch, not one value per sample)
    """
    y_pred = K.print_tensor(y_pred, message='pred = ')

    softmax = K.softmax(y_pred)
    softmax = K.print_tensor(softmax, message='softmax = ')

    # Clip before the log: a probability that underflows to 0 would give
    # log(0) = -inf, and -inf multiplied by a 0 in y_true is NaN, which would
    # poison the whole loss.
    log_soft = K.log(K.clip(softmax, K.epsilon(), 1.0))
    log_soft = K.print_tensor(log_soft, message='log softmax = ')

    # element-wise product; summing it is equivalent to summing the per-sample dot products
    batchd = y_true * log_soft
    batchd = K.print_tensor(batchd, message='batch dot = ')

    return -K.sum(batchd)

def soft_loss2(y_true, y_pred):
    """
    Softmax cross-entropy written directly with TensorFlow ops.

    A softmax is applied to `y_pred` inside this function.
    NOTE(review): the model built in this notebook already ends with a softmax
    layer, so using this loss with it would apply softmax twice -- confirm intent.

    :param y_true: target tensor, multiplied element-wise with the log-probabilities
    :param y_pred: prediction tensor, passed through (log-)softmax here
    :return: minus the sum over axis 1 of `y_true * log_softmax(y_pred)`
    """
    # log_softmax is the numerically stable form of log(softmax(x)): it never
    # evaluates log(0) = -inf, which would turn into NaN once multiplied by a
    # zero entry of y_true.
    log_probs = tf.nn.log_softmax(y_pred)
    # `axis` replaces the deprecated `reduction_indices` argument
    return -tf.reduce_sum(y_true * log_probs, axis=1)

In [9]:
def mrr(y_true, y_pred, max_rank=25):
    """
    Keras metric approximating the mean reciprocal rank through top-k accuracies.

    The top-k accuracy is evaluated for k = 1..max_rank; the increase between
    k-1 and k is weighted by 1/k and accumulated. Assuming
    `metrics.top_k_categorical_accuracy` returns the fraction of samples whose
    true class is among the k best scores, that increase is the fraction of
    samples ranked exactly k-th, so the total is the reciprocal rank averaged
    over the samples (0 for samples ranked below `max_rank`).

    :param y_true: one-hot encoded targets
    :param y_pred: predicted scores, one column per class
    :param max_rank: deepest rank taken into account (default 25, the value
        previously hard-coded)
    :return: the accumulated score
    """
    # accuracy at rank 1 contributes with weight 1/1
    accuracy_so_far = metrics.top_k_categorical_accuracy(y_true, y_pred, k=1)
    score = accuracy_so_far
    for rank in range(2, max_rank + 1):
        accuracy_at_rank = metrics.top_k_categorical_accuracy(y_true, y_pred, k=rank)
        # `score = score + ...` (not `+=`) so that `accuracy_so_far`, which may be
        # the same object on the first iteration, is never modified in place
        score = score + (accuracy_at_rank - accuracy_so_far) * (1 / rank)
        accuracy_so_far = accuracy_at_rank
    return score

In [10]:
# Layer sizes are derived from the loaded training data instead of being
# hard-coded, so the network always matches the arrays it is trained on.
input_shape = x.shape[1:]   # (6, 118) in the recorded run
n_classes = y.shape[1]      # 25 in the recorded run

# Two stacked GRU layers (the first returns the whole sequence to feed the
# second), followed by a dense hidden layer and a softmax over the classes.
m = Sequential()
m.add( GRU(64, input_shape=input_shape, recurrent_dropout=0.2, dropout=0.2, return_sequences=True) )
m.add( GRU(32, recurrent_dropout=0.2, dropout=0.2, return_sequences=False) )
m.add( Dense(32, activation='relu') )
m.add( Dense(n_classes, activation='softmax') )

# NOTE(review): lr=1e-1 is much larger than Adam's usual default -- confirm it is intended.
adam = keras.optimizers.Adam(lr=1e-1)
# Alternative losses tried previously: 'categorical_crossentropy' and soft_loss2.
m.compile(adam, loss='cosine_proximity', metrics=['accuracy', mrr])
m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_5 (GRU)                  (None, 6, 64)             35136     
_________________________________________________________________
gru_6 (GRU)                  (None, 32)                9312      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 25)                825       
Total params: 46,329
Trainable params: 46,329
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [None]:
# Train for 3 epochs in mini-batches of 32 samples, holding out 20% of the
# (already shuffled) training data for validation.
m.fit(
    x=x,
    y=y,
    batch_size=32,
    epochs=3,
    validation_split=0.2,
)

Train on 30131 samples, validate on 7533 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

## Load the model from file

## Test

In [30]:
# Indices of the rows to predict for the selected mode, as returned by data.target_indices
target_indices = data.target_indices(mode)
# preview the first ten
target_indices[:10]

array([54066, 54071, 54108, 54184, 54211, 54343, 54397, 54606, 54609,
       54692])

In [31]:
def recommend_batch(m, dataset, target_indices):
    """
    Re-rank the impressions of every target row using the scores predicted by the model.

    :param m: fitted model; `m.predict(X)` must return one row of scores per test
        sample, column j being the score of the j-th impression of that row
    :param dataset: object whose `load_Xtest()` returns `(X, indices)`, where
        `indices` holds the original dataframe index of each row of `X`
    :param target_indices: original dataframe indices of the rows to predict
    :return: list of `(index, ordered_impressions)` tuples, one per target index,
        with the impressions sorted by decreasing score (ties keep their original order)
    :raises ValueError: if a target row lists more impressions than the model has outputs
    """
    X, indices = dataset.load_Xtest()

    # predict the scores for all the test samples
    predictions = m.predict(X)

    # keep only the predictions of the target rows, in the order of target_indices
    pred_df = pd.DataFrame(predictions)
    pred_df['orig_index'] = indices
    pred_df = pred_df.set_index('orig_index')
    predictions = pred_df.loc[target_indices]
    del pred_df

    assert len(predictions) == len(target_indices)

    # Fetch the impressions of all the targets with a single column lookup and
    # read the scores from one array: building a full row Series and doing
    # per-cell `.at` lookups inside the loop is needlessly slow.
    impressions = data.full_df().loc[target_indices, 'impressions']
    assert len(impressions) == len(target_indices)
    scores = predictions.values
    n_scores = scores.shape[1]

    result_predictions = []
    rows = zip(target_indices, impressions.values, scores)
    for index, impressions_str, row_scores in tqdm(rows, total=len(target_indices)):
        # get the impressions of the clickout to predict
        impr = list(map(int, impressions_str.split('|')))
        if len(impr) > n_scores:
            raise ValueError(
                f'row {index} has {len(impr)} impressions but the model outputs only {n_scores} scores'
            )
        # pair each impression with its score and order by score (greater is better);
        # zip stops at the last impression, ignoring the unused trailing scores
        scored_impressions = sorted(zip(impr, row_scores), key=lambda tup: tup[1], reverse=True)
        # append the couple (index, reranked impression ids)
        result_predictions.append( (index, [impression for impression, _ in scored_impressions]) )

    print('prediction created !!!')

    return result_predictions

In [32]:
def compute_MRR(mode, predictions):
    """
    Compute the MRR (mean reciprocal rank) of some predictions and print it.

    The correct reference of every predicted row is read from the full train
    dataframe (`data.train_df('full')`). A row contributes 1/rank when its
    reference appears within the first 25 recommended impressions, 0 otherwise.

    :param mode: 'local' or 'small'; it is only validated, the ground truth is
        always taken from the full train dataframe
    :param predictions: sequence of (row index, ordered impressions list) tuples
    :return: MRR of the given predictions
    :raises ValueError: if `predictions` is empty
    """
    assert mode in ('local', 'small')

    predictions = list(predictions)
    if not predictions:
        # without this guard the unpacking below fails with an opaque message
        raise ValueError('predictions is empty: cannot compute the MRR')

    train_df = data.train_df('full')

    target_indices, recs = zip(*predictions)
    correct_clickouts = train_df.loc[list(target_indices)].reference.values
    len_rec = len(recs)

    RR = 0
    print("Calculating MRR (hoping for a 0.99)")
    for i in tqdm(range(len_rec)):
        correct_clickout = int(correct_clickouts[i])
        # a single scan of the list: `.index` raises ValueError when the
        # reference is not among the recommended impressions
        try:
            rank_pos = recs[i].index(correct_clickout) + 1
        except ValueError:
            continue
        if rank_pos <= 25:
            RR += 1 / rank_pos

    MRR = RR / len_rec
    print(f'MRR: {MRR}')

    return MRR

In [33]:
# Re-rank the impressions of every target row with the trained model
recommendations = recommend_batch(m, dataset, target_indices)

HBox(children=(IntProgress(value=0, max=9246), HTML(value='')))


prediction created !!!


In [34]:
# Score the re-ranked impressions against the correct references (prints and returns the MRR)
compute_MRR(mode, recommendations)

Calculating MRR (hoping for a 0.99)


HBox(children=(IntProgress(value=0, max=9246), HTML(value='')))


MRR: 0.5099219530770764


0.5099219530770764