# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Reshape, Dot, Concatenate, Dense, Dropout, Lambda, Activation, Add
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

# Mounting Drive

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files stored there
# can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading Data

In [None]:
# Read the combined dataset from the mounted Drive, then discard the
# 'Unnamed: 0' column (presumably an index saved by an earlier to_csv - confirm).
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/one-m-capstone/data/combined.csv'
)
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zip,age_elab,occ_elab
0,1,Toy Story (1995),Animation Children's Comedy,1,5,978824268,F,1,10,48067,Under 18,K-12 student
1,48,Pocahontas (1995),Animation Children's Musical Romance,1,5,978824351,F,1,10,48067,Under 18,K-12 student
2,150,Apollo 13 (1995),Drama,1,5,978301777,F,1,10,48067,Under 18,K-12 student
3,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure Fantasy Sci-Fi,1,4,978300760,F,1,10,48067,Under 18,K-12 student
4,527,Schindler's List (1993),Drama War,1,5,978824195,F,1,10,48067,Under 18,K-12 student


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(910328, 12)

# Feature Engineering

In [None]:
# Integer-encode each categorical column so its values can index an embedding
# table. The number of distinct codes per column is that table's vocabulary size.
# Every column gets its OWN encoder so the code -> original value mapping can be
# recovered later through the matching encoder.
user_enc = LabelEncoder()
df['user'] = user_enc.fit_transform(df['user_id'].values)
n_users = df['user'].nunique()

item_enc = LabelEncoder()
df['movie'] = item_enc.fit_transform(df['movie_id'].values)
n_movies = df['movie'].nunique()

genre_enc = LabelEncoder()
df['genre'] = genre_enc.fit_transform(df['genres'].values)
n_genres = df['genre'].nunique()

occupation_enc = LabelEncoder()
# Fit occupation_enc here (not genre_enc): re-fitting genre_enc on occupations
# would overwrite its genre classes and leave occupation_enc unfitted.
df['occ'] = occupation_enc.fit_transform(df['occupation'].values)
n_occs = df['occ'].nunique()

gender_enc = LabelEncoder()
df['gender_enc'] = gender_enc.fit_transform(df['gender'].values)
n_genders = df['gender_enc'].nunique()

age_enc = LabelEncoder()
df['age_enc'] = age_enc.fit_transform(df['age'].values)
n_ages = df['age_enc'].nunique()

# Rating bounds, used to rescale the model's sigmoid output to the rating range.
min_rating = df['rating'].min()
max_rating = df['rating'].max()

n_users, n_movies, n_genres, n_occs, n_genders, n_ages, min_rating, max_rating

(4314, 3688, 300, 21, 2, 7, 1, 5)

# Splitting Data

In [None]:
# Feature matrix: one integer-encoded column per model input. Target: the rating.
X = df[['user', 'movie', 'genre', 'occ', 'gender_enc', 'age_enc']].to_numpy()
y = df['rating'].to_numpy()

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((728262, 6), (182066, 6), (728262,), (182066,))

In [None]:
# Hyperparameter: the value handed to the model builder as the embedding
# width (`n_factors`). Tune as needed.
n_factors = 50

In [None]:
# The model takes one array per input, so split each feature matrix into its
# six columns, in the same order the columns were selected.
X_train_array = [X_train[:, col] for col in range(6)]
X_test_array = [X_test[:, col] for col in range(6)]

# Neural Model

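This neural network generalizes matrix factorization: it learns an embedding vector for each user, movie, and side feature (genre, occupation, gender, age), then passes the concatenated embeddings through dense layers that learn how they interact to predict a rating.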

https://medium.com/@jdwittenauer/deep-learning-with-keras-recommender-systems-e7b99cb29929

In [None]:
class EmbeddingLayer:
    """Callable that embeds a tensor of integer ids and flattens the result.

    Attributes:
        n_items: First argument passed to ``Embedding`` (the vocabulary size).
        n_factors: Second argument passed to ``Embedding`` (the vector width);
            also the target shape of the reshape.
    """

    def __init__(self, n_items, n_factors):
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        """Apply a fresh ``Embedding`` to ``x`` and reshape it to ``(n_factors,)``."""
        embedding = Embedding(
            self.n_items,
            self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )
        flatten = Reshape((self.n_factors,))
        return flatten(embedding(x))

In [None]:
def DeepLearningRec(n_users, n_movies, n_genres, n_occs, n_genders, n_factors, n_ages, min_rating, max_rating):
    """Build and compile the embedding-based rating model.

    Each of the six categorical inputs (user, movie, genre, occupation,
    gender, age) gets its own ``EmbeddingLayer`` of width ``n_factors``. The
    embeddings are concatenated, passed through one hidden dense layer, and
    squashed by a sigmoid that is rescaled to ``[min_rating, max_rating]``.

    NOTE: ``n_factors`` comes *before* ``n_ages`` in the positional order.
    Call this function with keyword arguments to avoid swapping them.

    Args:
        n_users, n_movies, n_genres, n_occs, n_genders, n_ages: Vocabulary
            size of the corresponding encoded input.
        n_factors: Embedding width shared by all inputs.
        min_rating, max_rating: Bounds the sigmoid output is rescaled to.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer) whose inputs are
        ordered ``[user, movie, genre, occ, gender, age]``.
    """
    # One vocabulary size per input, in the order the model exposes its inputs.
    # The occupation embedding is sized by n_occs, not n_genres as before.
    vocab_sizes = [n_users, n_movies, n_genres, n_occs, n_genders, n_ages]

    inputs = []
    embeddings = []
    for vocab_size in vocab_sizes:
        feature_in = Input(shape=(1,))
        inputs.append(feature_in)
        embeddings.append(EmbeddingLayer(vocab_size, n_factors)(feature_in))

    x = Concatenate()(embeddings)
    x = Dropout(0.05)(x)

    x = Dense(16, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(1, kernel_initializer='he_normal')(x)
    x = Activation('sigmoid')(x)
    # Map the sigmoid's (0, 1) output onto the rating scale.
    x = Lambda(lambda s: s * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=inputs, outputs=x)
    opt = Adam(learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt)

    return model

In [None]:
# Pass every argument by keyword: DeepLearningRec declares n_factors *before*
# n_ages, so a positional call in "natural" order swaps the two. That builds
# embeddings of width n_ages with an age vocabulary of n_factors.
model = DeepLearningRec(
    n_users=n_users,
    n_movies=n_movies,
    n_genres=n_genres,
    n_occs=n_occs,
    n_genders=n_genders,
    n_factors=n_factors,
    n_ages=n_ages,
    min_rating=min_rating,
    max_rating=max_rating,
)
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_58 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_59 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_60 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_61 (InputLayer)           [(None, 1)]          0                                            
___________________________________________________________________________________________

In [None]:
# Stop once the monitored validation loss has not improved for 3 epochs, and
# roll back to the best epoch's weights. Without restore_best_weights the
# model keeps the weights from the final, worse epochs.
early_stop = EarlyStopping(patience=3, restore_best_weights=True)

# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on it and the reported val_loss is optimistic as a
# final test score. Consider carving a separate validation split.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=128,
    epochs=40,
    verbose=1,
    callbacks=[early_stop],
    validation_data=(X_test_array, y_test),
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40


In [None]:
# Re-display the vocabulary sizes and rating bounds currently in the kernel.
n_users, n_movies, n_genres, n_occs, n_genders, n_ages, min_rating, max_rating

(6040, 3706, 301, 21, 2, 7, 1, 5)

In [None]:
# Inspect the per-input test arrays (one array per model input).
X_test_array

[array([ 509, 2278,  677, ..., 4351, 3181, 1551]),
 array([ 574, 2983,  572, ...,  788, 2547, 1142]),
 array([146, 298, 160, ..., 176, 203,  73]),
 array([12,  7,  0, ...,  4, 12, 20]),
 array([1, 1, 1, ..., 1, 1, 1]),
 array([1, 5, 2, ..., 1, 2, 4])]

In [None]:
# Predict the rating for one hand-picked sample. The encoded ids are given in
# model-input order: user, movie, genre, occupation, gender, age.
# NOTE(review): each id must be smaller than the matching vocabulary size the
# model was built with - confirm 6038 / 3705 / 300 are valid for the current data.
model.predict([np.array([encoded_id]) for encoded_id in (6038, 3705, 300, 20, 1, 6)])

array([[3.7281327]], dtype=float32)