In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the training ratings. header=None makes pandas treat the first line of
# the file as data and label the columns 0, 1, 2, ...
ratingsTrainLarge = pd.read_csv(
    'comp3208-train.csv',
    header=None,
)
ratingsTrainLarge.head()

In [None]:
# Give the positional columns meaningful names.
# The columns are renamed on a copy instead of rebuilding the frame from
# `.values`: the 2-D array forces every column into one common dtype (integer
# IDs and timestamps would turn into floats). No row is lost either way,
# because the file was read with header=None, so its first line is data.
ratingsTrainLargeDataframe = ratingsTrainLarge.copy()
ratingsTrainLargeDataframe.columns = ["UserID", "ItemID", "Rating", "Timestamp"]
ratingsTrainLargeDataframe.head()

In [None]:
# Cross-tabulate how the most active users rated the most-rated items.
TOP_N = 15

# Ratings per user / per item, largest first.
# .head() is always positional. A bare [:TOP_N] slice is interpreted as a
# *label* slice when the index holds floats, which does not select the first
# TOP_N rows of the sorted Series.
ratings_per_user = ratingsTrainLargeDataframe.groupby('UserID')['Rating'].count()
top_users = ratings_per_user.sort_values(ascending=False).head(TOP_N)

ratings_per_item = ratingsTrainLargeDataframe.groupby('ItemID')['Rating'].count()
top_items = ratings_per_item.sort_values(ascending=False).head(TOP_N)

# Inner joins keep only the rows whose user AND item are both in the top lists.
top_ratings = ratingsTrainLargeDataframe.join(top_users, rsuffix='_ratings', how='inner', on='UserID')
top_ratings = top_ratings.join(top_items, rsuffix='_ratings', how='inner', on='ItemID')

pd.crosstab(top_ratings.UserID, top_ratings.ItemID, top_ratings.Rating, aggfunc='sum')

In [None]:
# NOTE: this cell overwrites UserID / ItemID with their encoded values, so it
# must run exactly once per kernel. Re-running it would fit the encoders on
# already-encoded indices and lose the mapping back to the raw IDs.

# Map raw user IDs to contiguous integer indices (used as embedding row numbers).
user_encoderLarge = LabelEncoder()
ratingsTrainLargeDataframe['UserID'] = user_encoderLarge.fit_transform(ratingsTrainLargeDataframe['UserID'].values)
n_users = ratingsTrainLargeDataframe['UserID'].nunique()

# Same for item IDs.
item_encoderLarge = LabelEncoder()
ratingsTrainLargeDataframe['ItemID'] = item_encoderLarge.fit_transform(ratingsTrainLargeDataframe['ItemID'].values)
n_items = ratingsTrainLargeDataframe['ItemID'].nunique()

# Any literal "rating" text in the column is replaced by 3.5 so that the whole
# column can be parsed as float32.
rating_text = ratingsTrainLargeDataframe['Rating'].astype(str).str.replace("rating", "3.5")
ratingsTrainLargeDataframe['Rating'] = rating_text.values.astype(np.float32)

# Vectorised min/max: the builtin min()/max() would walk the array element by
# element in Python.
rating_values = ratingsTrainLargeDataframe['Rating'].values
min_rating = rating_values.min()
max_rating = rating_values.max()

# (n_users, n_items, min_rating, max_rating) recorded from earlier runs:
#(671, 9066, 0.5, 5.0)
#(274246, 19807, 0.0, 5.0)

In [None]:
# Model inputs are the (UserID, ItemID) pairs; the target is the rating.
feature_columns = ['UserID', 'ItemID']
X = ratingsTrainLargeDataframe[feature_columns].values
y = ratingsTrainLargeDataframe['Rating'].values

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Length of each user / item embedding vector.
n_factors = 50

# The model has two inputs, so each feature matrix is split into its two
# columns: column 0 is the user index, column 1 the item index.
user_col, item_col = 0, 1
X_train_array = [X_train[:, user_col], X_train[:, item_col]]
X_validation_array = [X_validation[:, user_col], X_validation[:, item_col]]

In [None]:
from keras.layers import Add, Activation, Lambda, Input, Reshape, Dot, Embedding
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once val_loss has gone 5 epochs without improving.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)

# Save the model to disk only when val_loss improves, so the file always
# holds the best epoch seen so far.
modelCheckpoint = ModelCheckpoint(
    'bestModelLarge.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)


class EmbeddingLayer:
    """Callable that embeds integer indices and flattens the result.

    Parameters
    ----------
    n_items : int
        Size of the embedding table (number of distinct indices).
    n_factors : int
        Length of each embedding vector.
    """

    def __init__(self, n_items, n_factors):
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        """Apply a new Embedding to ``x`` and reshape it to ``(n_factors,)`` per sample."""
        embed = Embedding(
            input_dim=self.n_items,
            output_dim=self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )
        vectors = embed(x)
        flatten = Reshape((self.n_factors,))
        return flatten(vectors)

def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating):
    """Build and compile a matrix-factorisation rating model with bias terms.

    The predicted rating is ``sigmoid(dot(u, m) + user_bias + movie_bias)``
    rescaled from (0, 1) to (min_rating, max_rating).

    Parameters
    ----------
    n_users : int
        Number of distinct user indices (size of the user embedding tables).
    n_movies : int
        Number of distinct item indices (size of the item embedding tables).
    n_factors : int
        Length of the user and item embedding vectors.
    min_rating, max_rating : float
        Bounds used to rescale the sigmoid output.

    Returns
    -------
    Model
        Two-input model (user index, item index) compiled with mean squared
        error loss and the Adam optimiser.
    """
    user = Input(shape=(1,))
    user_vector = EmbeddingLayer(n_users, n_factors)(user)
    user_bias = EmbeddingLayer(n_users, 1)(user)

    movie = Input(shape=(1,))
    movie_vector = EmbeddingLayer(n_movies, n_factors)(movie)
    movie_bias = EmbeddingLayer(n_movies, 1)(movie)

    score = Dot(axes=1)([user_vector, movie_vector])
    score = Add()([score, user_bias, movie_bias])
    score = Activation('sigmoid')(score)
    # Stretch the (0, 1) sigmoid output onto the rating scale.
    rating = Lambda(lambda t: t * (max_rating - min_rating) + min_rating)(score)

    model = Model(inputs=[user, movie], outputs=rating)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by current Keras releases.
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
    return model


In [None]:
# Build the recommender for the training data's user/item counts and rating
# range, then list its layers.
model = RecommenderV2(
    n_users=n_users,
    n_movies=n_items,
    n_factors=n_factors,
    min_rating=min_rating,
    max_rating=max_rating,
)
model.summary()

In [None]:
# Train for up to 100 epochs, passing the early-stopping and checkpoint
# callbacks to fit().
training_callbacks = [earlyStopping, modelCheckpoint]
history = model.fit(
    x=X_train_array,
    y=y_train,
    validation_data=(X_validation_array, y_validation),
    batch_size=64000,
    epochs=100,
    callbacks=training_callbacks,
    verbose=1,
)

# Log excerpt kept from an earlier run:
#lr=0.001
#Epoch 00011: val_loss improved from 0.70822 to 0.70743, saving model to bestModel.h5
#475/475 [==============================] - 93s 196ms/step - loss: 0.6295 - val_loss: 0.7074

In [None]:
# Read the test file (first line treated as data) and name its three columns.
test_columns = ["UserID", "ItemID", "Timestamp"]
ratingsTestLarge = pd.read_csv('comp3208-test.csv', header=None)
ratingsTestLargeDataframe = pd.DataFrame(data=ratingsTestLarge.values, columns=test_columns)

# Empty placeholder column at position 2, to be filled with the predictions.
ratingsTestLargeDataframe.insert(loc=2, column='Rating', value='')
ratingsTestLargeDataframe.head()

In [None]:
user_encoderLargeTest = LabelEncoder()
ratingsTestLargeDataframe['UserID'] = user_encoderLargeTest.fit_transform(ratingsTestLargeDataframe['UserID'].values)
n_users = ratingsTestLargeDataframe['UserID'].nunique()

item_encoderLargeTest = LabelEncoder()
ratingsTestLargeDataframe['ItemID'] = item_encoderLargeTest.fit_transform(ratingsTestLargeDataframe['ItemID'].values)
n_items = ratingsTestLargeDataframe['ItemID'].nunique()

n_users, n_items

In [None]:
from keras.models import load_model
saved_model = load_model('bestModel.h5')

X = ratingsTestLargeDataframe[['UserID', 'ItemID']].values
X_validation_array = [X[:, 0], X[:, 1]]

ratingsTestLargeDataframe

In [None]:
from keras.models import load_model
#saved_model = load_model('bestModel.h5')
#_, test_acc = saved_model.evaluate(X, y, verbose=0)

# Generate predictions for samples
predictions = saved_model.predict(X_validation_array)
print(predictions)

In [None]:
predictions.size

In [None]:
def round_number(number):
    return round(number *2) /2

#roundedPredictions = [roundedPredictions.append(round_number(y) for y in x in predictions]
#Then for arr in array: first?
roundedPredictions=[]

#for x in assignmentTraining_text['tweetText'].str.split():
#    for i in x:
#        corpus.append(i)
        
for x in predictions:
    for y in x:
        roundedPredictions.append(round_number(y))


ratingsTestLargeDataframe['Rating'] = roundedPredictions
ratingsTestLargeDataframe

In [None]:
ratingsTestLargeDataframe['UserID'] = user_encoderLargeTest.inverse_transform(ratingsTestLargeDataframe['UserID'].values)
ratingsTestLargeDataframe['ItemID'] = item_encoderLargeTest.inverse_transform(ratingsTestLargeDataframe['ItemID'].values)
ratingsTestLargeDataframe

In [None]:
# Write the predictions to CSV without the DataFrame index.
ratingsTestLargeDataframe.to_csv(
    'predictionsLarge.csv',
    index=False,
)