In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings

from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model, Sequential, load_model

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras_radam import RAdam

warnings.filterwarnings('ignore')
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dot Product with RMSE loss function

In [12]:
from helpers import load_data

DATA_TRAIN_PATH = "data/data_train.csv"
ratings = load_data(DATA_TRAIN_PATH)

DATA_TEST_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_TEST_PATH)

In [13]:
samples.head()

Unnamed: 0,user_id,movie_id,rating
0,37,1,3
1,73,1,3
2,156,1,3
3,160,1,3
4,248,1,3


In [14]:
n_users = len(ratings.user_id.unique())
n_movies = len(ratings.movie_id.unique())

In [15]:
### normalization  ###

pop_ratings = ratinprefs = pop_ratings['rating']gs[ratings['movie_id'].isin((rating_counts).index[0:500])]
pop_ratings = ratings.set_index(['movie_id', 'user_id'])

prefs = pop_ratings['rating']

mean_0 = prefs.mean() # global mean
prefs = prefs - mean_0

mean_i = prefs.groupby("movie_id").mean() # item mean
prefs = prefs - mean_i

mean_u = prefs.groupby(['user_id']).mean() # user mean
prefs = prefs - mean_u

pref_matrix = prefs.reset_index()[['user_id', 'movie_id', 'rating']].pivot(index='user_id', columns='movie_id', values='rating')

prefs = prefs.reset_index()

In [16]:
prefs

Unnamed: 0,movie_id,user_id,rating
0,1,44,0.441303
1,1,61,-0.828520
2,1,67,1.005327
3,1,72,-0.446089
4,1,86,1.303265
...,...,...,...
1176947,1000,9990,0.292686
1176948,1000,9992,1.274889
1176949,1000,9994,-0.741606
1176950,1000,9997,0.664086


In [17]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(prefs, test_size=0.1, random_state=42)

In [18]:
test.head()

Unnamed: 0,movie_id,user_id,rating
904556,706,5062,-1.61351
171326,148,9044,-0.089214
190893,169,1736,-0.265671
1082109,877,5270,-2.622723
881877,691,6100,0.024668


In [19]:
# https://www.kdnuggets.com/2019/07/building-recommender-system-part-2.html

In [30]:
import tensorflow as tf

from keras.layers import Input, Dense, Lambda, Conv1D
from keras.models import Model, load_model as keras_load_model
from keras import losses, backend
from keras.callbacks import EarlyStopping

ENCODING_DIM = 25
ITEM_COUNT = 1000

# ~~~ build recommender ~~~ #
input_layer = Input(shape=(ITEM_COUNT, ))
# compress to low dimension
encoded = Dense(ENCODING_DIM, kernel_size=2, activation="linear", use_bias=False)(input_layer)
# blow up to large dimension
decoded = Dense(ITEM_COUNT, kernel_size=4, activation="linear", use_bias=False)(encoded)       

# define subsets of the model:
# 1. the recommender itself
recommender = Model(input_layer, decoded)

# 2. the encoder
encoder = Model(input_layer, encoded)

# 3. the decoder
encoded_input = Input(shape=(ENCODING_DIM, ))
decoder = Model(encoded_input, recommender.layers[-1](encoded_input))

ValueError: Input 0 is incompatible with layer conv1d_4: expected ndim=3, found ndim=2

In [23]:
from keras import losses
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model, Sequential, load_model

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras_radam import RAdam

#creating movie embedding path
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(n_movies+1, ENCODING_DIM, name="Movie-Embedding")(movie_input)
movie_vec = Flatten(name="Flatten-Movies")(movie_embedding)

# creating user embedding path
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_users+1, ENCODING_DIM, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

original_inputs = recommender.input
y_true_inputs = Input(shape=(ITEM_COUNT, ))
original_outputs = recommender.output
# give 80% of the weight to guessing the missings, 20% to reproducing the knowns
prod = Dot(name="Dot-Product", axes=1)([movie_vec, user_vec])

wrapper_model = Model(inputs=[original_inputs, y_true_inputs], outputs=prod)
wrapper_model.compile('sgd', loss='mean_squared_error')

ValueError: Graph disconnected: cannot obtain value for tensor Tensor("User-Input:0", shape=(?, 1), dtype=float32) at layer "User-Input". The following previous layers were accessed without issue: []

In [21]:
def generate(pref_matrix, batch_size=64, mask_fraction=0.2):
    """
    Generate training triplets from this dataset.

    :param batch_size: Size of each training data batch.
    :param mask_fraction: Fraction of ratings in training data input to mask. 0.2 = hide 20% of input ratings.
    :param repeat: Steps between shuffles.
    :return: A generator that returns tuples of the form ([X, y], zeros) where X, y, and zeros all have
             shape[0] = batch_size. X, y are training inputs for the recommender.
    """

    def select_and_mask(frac):
        def applier(row):
            row = row.copy()
            idx = np.where(row != 0)[0]
            if len(idx) > 0:
                masked = np.random.choice(idx, size=(int)(frac*len(idx)), replace=False)
                row[masked] = 0
            return row
        return applier
    
    indices = np.arange(pref_matrix.shape[0])
    batches_per_epoch = int(np.floor(len(indices)/batch_size))
    while True:
        np.random.shuffle(indices)

        for batch in range(0, batches_per_epoch):
            idx = indices[batch*batch_size:(batch+1)*batch_size]

            y = np.array(pref_matrix[idx,:])
            X = np.apply_along_axis(select_and_mask(frac=mask_fraction), axis=1, arr=y)

            yield [X, y], np.zeros(batch_size)

In [22]:
def fit(wrapper_model, pref_matrix, batch_size=64, mask_fraction=0.2, epochs=1, verbose=1, patience=0):
    stopper = EarlyStopping(monitor="loss", min_delta=0.00001, patience=patience, verbose=verbose)
    batches_per_epoch = int(np.floor(pref_matrix.shape[0]/batch_size))

    generator = generate(pref_matrix, batch_size, mask_fraction)

    history = wrapper_model.fit_generator(
        generator,
        steps_per_epoch=batches_per_epoch,
        epochs=epochs,
        callbacks = [stopper] if patience > 0 else []
    )

    return history

In [23]:
# stop after 3 epochs with no improvement
fit(wrapper_model, pref_matrix.fillna(0).values, batch_size=100, epochs=100, patience=3)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 00023: early stopping


<keras.callbacks.callbacks.History at 0x7f9608cb8fd0>

In [30]:
def predict(samples, recommender, mean_0, mean_i):

    pref_mat = ratings[['user_id', 'movie_id', 'rating']].pivot(index='user_id', columns='movie_id', values='rating')
    X = pref_mat.fillna(0).values
    y = recommender.predict(X)

    output = pd.DataFrame(y, index=pref_mat.index, columns=pref_mat.columns)
#     output = output.iloc[1:] # drop the bad user

    output = output.add(mean_u, axis=0)
    output = output.add(mean_i, axis=1)
    output = output.add(mean_0)

    return output

In [50]:
y = predict(samples, recommender, mean_0, mean_i).transpose()

In [66]:
y

array([[2.32972075, 0.19826059, 2.47371177, ..., 1.17281638, 2.84938211,
        0.49275711],
       [4.68157039, 5.29992605, 4.83235615, ..., 4.12743551, 3.33042934,
        1.02073001],
       [2.65217522, 1.58907372, 1.68058943, ..., 1.19723547, 3.07446921,
        1.14775626],
       ...,
       [2.60508539, 0.94451486, 1.25441925, ..., 1.11553853, 2.08232397,
        1.02743039],
       [2.83048513, 1.35161429, 2.11480091, ..., 3.31339171, 2.93739731,
        2.08761636],
       [2.55746458, 1.51257677, 1.30760753, ..., 1.85319616, 2.60622361,
        1.37421876]])

In [59]:
y = y.to_numpy()

In [68]:
from helpers import *

data = read_txt(DATA_TEST_PATH)[1:]
data = [deal_line(line) for line in data]
n = len(data)
pred = np.zeros((n,1))
for i in range(n):
    pred[i] = np.clip(round(y[data[i][1]-1][data[i][0]-1]), 1, 5)

In [69]:
pred

array([[1.],
       [1.],
       [2.],
       ...,
       [2.],
       [1.],
       [3.]])

In [70]:
submission = load_data(DATA_TEST_PATH)
submission['rating'] = pred

In [71]:
submission

Unnamed: 0,user_id,movie_id,rating
0,37,1,1.0
1,73,1,1.0
2,156,1,2.0
3,160,1,1.0
4,248,1,1.0
...,...,...,...
1176947,9974,1000,2.0
1176948,9977,1000,1.0
1176949,9978,1000,2.0
1176950,9982,1000,1.0


In [72]:
from helpers import create_csv

DATA_SUBMISSION = "data/submission_dotprodRMSEnorm_tuto.csv"
create_csv(DATA_SUBMISSION, submission)

Dot Product with RMSE using librairies : pandas and keras - AICrowd : 1.3

The score stays the same when we increase 'epoch'.