In [1]:
import pandas as pd
import numpy as np

<h2>Importing Dataset</h2>

In [27]:
links = pd.read_csv('ml-latest-small/links.csv') # we can use this for stuff like fine-tuning (but beyond RecSys course scope)
movies = pd.read_csv('ml-latest-small/movies.csv') # using this for now
ratings = pd.read_csv('ml-latest-small/ratings.csv') # using this for now
tags = pd.read_csv('ml-latest-small/tags.csv') # can be used to construct features

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head() 

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


<b>ML people hate null values:</b> <i> so we can remove them!</i>

In [28]:
ratings = ratings[ratings.rating.notna()]

<b>We will train the model for lone active user</b></br>  <b style="color:red">Challenge:</b> How can we train a model for all users at once?

In [29]:
active_user = ratings[['userId', 'movieId', 'rating']].where(ratings['userId'] == 1)
active_user.head()

Unnamed: 0,userId,movieId,rating
0,1.0,1.0,4.0
1,1.0,3.0,4.0
2,1.0,6.0,4.0
3,1.0,47.0,5.0
4,1.0,50.0,5.0


In [30]:
active_user = active_user.merge(movies, on='movieId', how='inner')
active_user.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1.0,1.0,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1.0,3.0,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1.0,6.0,4.0,Heat (1995),Action|Crime|Thriller
3,1.0,47.0,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1.0,50.0,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


<h2>Constructing Features</h2>By merging title and genres

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
titles = [title.lower() for title in active_user['title']]
genres = [genre.lower().split("|") for genre in active_user['genres']]

In [15]:
binarizer = MultiLabelBinarizer()
genre_vector = binarizer.fit_transform(genres)

In [16]:
def get_word_embeddings(titles):
    return np.random.rand(len(titles), embedding_dim)

In [17]:
embedding_dim = 100
title_embeddings = get_word_embeddings(titles)

In [18]:
features = np.hstack((genre_vector, title_embeddings))

In [19]:
scaler = StandardScaler()
features = scaler.fit_transform(features)

In [21]:
print(features)

[[-0.79611734  1.31507101  2.64575131 ... -0.05827654  1.7416223
  -1.76424691]
 [-0.79611734 -0.76041521 -0.37796447 ...  1.41126989  1.34606848
   0.85832428]
 [ 1.25609625 -0.76041521 -0.37796447 ... -0.73255915  1.57618274
  -0.61575326]
 ...
 [-0.79611734 -0.76041521 -0.37796447 ... -0.06142391  0.77515985
  -1.01722473]
 [-0.79611734  1.31507101  2.64575131 ... -0.70609946 -1.42961855
  -0.87024755]
 [-0.79611734 -0.76041521 -0.37796447 ... -1.59287798  1.00172837
  -0.69508887]]


In [24]:
features.shape

(232, 117)

<h2>Constructing Labels</h2>Labels are binary in most RecSys models

In [31]:
active_user['rating'] = active_user['rating'].apply(lambda x : 0 if x < 3 else 1)
active_user.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1.0,1.0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1.0,3.0,1,Grumpier Old Men (1995),Comedy|Romance
2,1.0,6.0,1,Heat (1995),Action|Crime|Thriller
3,1.0,47.0,1,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1.0,50.0,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [32]:
labels = np.array(active_user['rating'])
print(labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]


In [33]:
labels.shape

(232,)

<h2>Architecture of Neural Network</h2>Feel free to try out different ones.

In [34]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [40]:
from sklearn.model_selection import train_test_split

# Split the data into train-test set (90-10 ratio)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42)

In [41]:
# Model
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(117,)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

In [42]:
# Compiling
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [43]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                7552      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 9,665
Trainable params: 9,665
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Training
history = model.fit(features, labels, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [45]:
# Evaluating
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.9583333134651184


In [46]:
# Generating predictions for our test samples
predictions = model.predict(X_test)



In [50]:
# Printing the labels assigned to each test sample
predicted_ratings = (predictions >= 0.5).astype(int)
print(predicted_ratings)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]


<h2 style="color:red">Question: </h2>Why do you think the predicted labels have so many (actually all!) 1s?