In [1]:
import sys
import os
# Make the parent directory the working directory — presumably so that the
# project-local imports and the relative 'dataset/...' path used in the next
# cells resolve from there (TODO confirm the expected launch directory).
# NOTE: this is not idempotent; re-running the cell moves up one more level.
os.chdir("../")

In [2]:
import data
from utils.dataset import DatasetScoresClassification #, SequenceDatasetForClassification
from recommenders.recurrent.RNNClassificationRecommender import RNNClassificationRecommender
#model = RNNClassificationRecommender(dataset, (6,168), 'gru', 2, 64, 2)

import numpy as np
import pandas as pd
import copy

from sklearn.model_selection import KFold

from joblib import Parallel, delayed

Using TensorFlow backend.


In [3]:
# Dataset handle for the preprocessed "small" recurrent-cluster classification
# data; the path is relative to the current working directory.
dataset = DatasetScoresClassification('dataset/preprocessed/cluster_recurrent/small/dataset_classification_p6')

In [4]:
class KFoldScorer(object):
    """
    Compute out-of-fold scores for a dataset through K-fold cross-validation.

    For every fold a fresh model is built with `model_class(**init_params)`,
    fitted on the remaining folds and asked to score the left-out fold.
    The underlying model should implement the following methods:
    - fit_cv(x, y, x_val, y_val, **params)
    - get_scores_cv(x)      : must return a dataframe with columns [ user_id | session_id | item_id | score ]
    """

    def __init__(self, model_class, init_params, k):
        """
        model_class : class (or callable) instantiated once per fold
        init_params : dict of keyword arguments passed to `model_class`
        k           : number of folds
        """
        self.model_class = model_class
        self.init_params = init_params
        self.k = k
        # Filled by fit_predict with the concatenated per-fold scores.
        self.scores = []

    # train a single model on a fold
    def _fit_model(self, x, y, x_val, y_val, fit_params, pool_id=0):
        """
        Build a new model, fit it on (x, y) with (x_val, y_val) as validation
        data and return the model's scores for x_val.

        fit_params : dict of extra keyword arguments forwarded to `fit_cv`
        pool_id    : identifier of the fold, only used in the progress messages
        """
        print(f'start {pool_id}')
        model = self.model_class(**self.init_params)
        model.fit_cv(x, y, x_val, y_val, **fit_params)
        print(f'fit end {pool_id}')
        return model.get_scores_cv(x_val)

    def fit_predict(self, dataset, fit_params=None, n_jobs=None,
                    index_columns=None, save_folder='scores/', max_samples=5):
        """
        Fit and compute the scores for each fold.

        dataset       : object exposing load_Xtrain, load_Ytrain and load_Xtest
        fit_params    : dict of keyword arguments forwarded to the model's
                        `fit_cv`; None (default) means no extra arguments
        n_jobs        : forwarded to joblib.Parallel (one job per fold)
        index_columns : currently unused; accepted for interface compatibility
        save_folder   : currently unused; accepted for interface compatibility
        max_samples   : only the first `max_samples` rows of every loaded array
                        are used. The default (5) reproduces the truncation that
                        used to be hard-coded here; pass None to use all rows.
                        NOTE(review): 5 looks like a debugging limit - confirm
                        before producing real scores.

        Returns the concatenation (pd.concat) of the per-fold results, which is
        also stored in `self.scores`.
        """
        assert hasattr(dataset, 'load_Xtrain') and hasattr(dataset, 'load_Ytrain') and hasattr(dataset, 'load_Xtest'), \
            'Dataset must implement methods: load_Xtrain, load_Ytrain, load_Xtest'

        # A None default avoids sharing one mutable dict across calls.
        if fit_params is None:
            fit_params = {}

        X_train = dataset.load_Xtrain()[:max_samples]
        Y_train = dataset.load_Ytrain()[:max_samples]
        # Loaded (and truncated) like the train arrays, but not used while the
        # test-scoring step mentioned in the TODO below is disabled.
        X_test = dataset.load_Xtest()[:max_samples]

        # kfold
        kf = KFold(n_splits=self.k)

        # fit in each fold: one job per (train, validation) split, each job
        # returns the scores of its held-out part
        fold_scores = Parallel(backend='multiprocessing', n_jobs=n_jobs)(
            delayed(self._fit_model)(
                X_train[train_indices], Y_train[train_indices],
                X_train[test_indices], Y_train[test_indices],
                fit_params, idx)
            for idx, (train_indices, test_indices) in enumerate(kf.split(X_train)))

        # TODO: fit one more model on the whole train set, append its scores
        # for X_test and persist the result under save_folder (not implemented).

        # Assign only once the concatenation succeeded, so self.scores never
        # holds a half-built intermediate list.
        self.scores = pd.concat(fold_scores)
        return self.scores

In [5]:
# Keyword arguments used to build one RNNClassificationRecommender per fold.
# NOTE(review): the log printed by this cell reports X_train with a last
# dimension of 169 while input_shape says 168 — presumably one column is an
# index the model strips off; confirm.
init_params = dict(
    dataset=dataset,
    input_shape=(6, 168),
    cell_type='gru',
    num_recurrent_layers=2,
    num_recurrent_units=64,
    num_dense_layers=2,
)

# Keyword arguments forwarded to fit_cv for every fold.
fit_params = dict(epochs=1, early_stopping_patience=4)

# Two folds, trained by two parallel jobs.
kfscorer = KFoldScorer(
    model_class=RNNClassificationRecommender,
    init_params=init_params,
    k=2,
)

# The returned frame is also kept on kfscorer.scores, so it is discarded here.
_ = kfscorer.fit_predict(
    dataset,
    fit_params=fit_params,
    n_jobs=2,
)

X_train: (37664, 6, 169)
Y_train: (37664, 25)
X_test: (9246, 6, 169)
start 0
start 1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 6, 64)             44736     
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                24768     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1625      
_________________________________________________________________
dropout_2 (Dropout)          (None, 25)                0         
Total params: 75,289
Trainable params: 75,289
Non-trainab

  mask |= (ar1 == a)
  mask |= (ar1 == a)


Done!
Done!


In [6]:
# Number of score rows gathered over all folds by fit_predict.
len(kfscorer.scores)

85

In [7]:
# Keep a shorter handle on the concatenated per-fold scores and display them.
scores = kfscorer.scores
scores

Unnamed: 0,user_id,session_id,item_id,score
27778.0,0004IOZI7CKF,0146f7cb014ba,2632452,0.082701
27778.0,0004IOZI7CKF,0146f7cb014ba,5747484,0.049080
27778.0,0004IOZI7CKF,0146f7cb014ba,110985,0.050279
27778.0,0004IOZI7CKF,0146f7cb014ba,84220,0.031571
27778.0,0004IOZI7CKF,0146f7cb014ba,3752262,0.019891
27778.0,0004IOZI7CKF,0146f7cb014ba,85103,0.038115
27778.0,0004IOZI7CKF,0146f7cb014ba,85285,0.040473
27778.0,0004IOZI7CKF,0146f7cb014ba,1905271,0.030329
27778.0,0004IOZI7CKF,0146f7cb014ba,82038,0.048797
27778.0,0004IOZI7CKF,0146f7cb014ba,85306,0.051958
