<a href="https://colab.research.google.com/github/dipta007/Movie-Recommendation/blob/multi-label-classification/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
from os.path import join
import os

ROOT = '/content/drive'     # default for the drive
PROJ = 'My Drive/Colab_data/MR1'       # path to your project on Drive
drive.mount(ROOT)           # we mount the drive at /content/drive

PROJECT_PATH = join(ROOT, PROJ)
!mkdir "{PROJECT_PATH}"    # in case we haven't created it already   
%cd "{PROJECT_PATH}"
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/content/drive/My Drive/Colab_data/MR1’: File exists
/content/drive/My Drive/Colab_data/MR1
[0m[01;34mdata[0m/  multilayer_perceptron_graph.png


In [2]:
%matplotlib inline
import numpy as np
import pandas as pd

DATA_PATH = './data/ml-latest-small'

ratings = pd.read_csv(f"{DATA_PATH}/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv(f"{DATA_PATH}/movies.csv")
movies_ind_to_name = dict(zip(movies.movieId, movies.title))
movies_name_to_ind = dict(zip(movies.title, movies.movieId))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['userId'].values)
n_users = ratings['user'].nunique()

movie_enc = LabelEncoder()
movie_enc.fit(movies['movieId'])
ratings['movie'] = movie_enc.transform(ratings['movieId'].values)
n_movies = movies['movieId'].nunique()

ratings['rating'] = ratings['rating'].values.astype(np.float32)
min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])

# n_users, n_movies, min_rating, max_rating
# del ratings['timestamp']
ratings.head()
ratings.shape

(100836, 6)

In [19]:
n_users, n_movies

(610, 9742)

In [11]:
# Delete all the rows with less ratings
indexes = ratings[ ratings.rating < 3.5 ].index
nratings = ratings.drop(indexes)
nratings.shape

(61716, 6)

In [0]:
user_movie_count = [0 for _ in range(n_users)]
user_movie = {}
for index, row in nratings.iterrows():
  now = int(row.user)
  user_movie_count[now] += 1
  if now not in user_movie:
    user_movie[now] = []
  user_movie[now].append(row.movie)

MIN_NUMBER_OF_MOVIES = 4

In [13]:
MIN_NUMBER_OF_MOVIES, n_users, n_movies, min_rating, max_rating

(4, 610, 9742, 0.5, 5.0)

In [0]:
def convert_to_one_hot(movies):
  now = [0 for _ in range(n_movies)]
  for movie in movies:
    now[int(movie)] = 1.0
  return now

prev_movies = []
next_movies = []
for user in range(n_users):
  if user not in user_movie:
    continue
  for i in range(len(user_movie[user]) - MIN_NUMBER_OF_MOVIES):
    now_movies = user_movie[user]
    
    # Previous movies he has saw already
    prev_movies.append(now_movies[i:i+MIN_NUMBER_OF_MOVIES]);

    # Next movies he should see
    next_movies.append(convert_to_one_hot(now_movies[i+MIN_NUMBER_OF_MOVIES:]));


In [15]:
len(prev_movies), len(next_movies)

(59286, 59286)

In [16]:
X = np.array(prev_movies).reshape(len(prev_movies), 4, 1)
Y = np.array(next_movies)
X.shape, Y.shape

((59286, 4, 1), (59286, 9742))

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=47)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((47428, 4, 1), (11858, 4, 1), (47428, 9742), (11858, 9742))

In [20]:
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam, RMSprop
from keras.regularizers import l2
from keras.layers import Concatenate, Dense, Dropout, Activation, Lambda, LSTM

def Model1(n_users, n_movies):
  input1 = Input(shape=(MIN_NUMBER_OF_MOVIES, 1,))
  x = LSTM(128)(input1)
  x = Dense(1000)(x)
  x = Dense(n_movies)(x)
  x = Activation('sigmoid')(x)

  model = Model(inputs=input1, outputs=x)
  # rms = RMSprop(lr=0.001)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
  # model.compile(loss='mean_squared_error', optimizer=rms, metrics=['accuracy'])
  return model

Using TensorFlow backend.


In [21]:
model = Model1(n_users, n_movies)
model.summary()

# from keras.utils import plot_model
# plot_model(model, to_file='multilayer_perceptron_graph.png')

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4, 1)              0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               66560     
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              129000    
_________________________________________________________________
dense_2 (Dense)              (None, 9742)              9751742   
_________________________________________________________________
activation_1 (Activation)    (None, 9742)              0         
Total params: 9,947,302
Trainable params: 9,947,302
Non-trainable params: 0
_________________________________________________________________


In [22]:
history = model.fit(x=X, y=Y, batch_size=64, epochs=5, verbose=1, 
                    validation_data=(X_test, y_test))

Train on 59286 samples, validate on 11858 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
def get_result(movie_no, n = 10):
  nolan = movie_enc.transform([109487, 33794, 33794, 74458])
  pred = model.predict(nolan.reshape(1, 4, 1))[0]
  res = []
  for (ind, pred) in enumerate(pred):
      res.append((ind, pred))

  res.sort(key=lambda x: x[1], reverse=True)
  res = [(movie_enc.inverse_transform([id])[0], pred) for (id, pred) in res]
  for (movieId, pred) in res[0:n]:
    print(f"{movies_ind_to_name[movieId]} -> {pred}")
  return None

In [37]:
get_result([109487, 33794, 33794, 74458], 4)

Inception (2010) -> 0.48378628492355347
Django Unchained (2012) -> 0.45947539806365967
Guardians of the Galaxy (2014) -> 0.4578720033168793
The Martian (2015) -> 0.4250876009464264


In [0]:
for thresh in [0.4, 0.3]:
  res = get_result(pred[0], thresh)
  print(len(res))
  print(res)

In [0]:
movie_enc.inverse_transform([7355, 8045, 8457, 8528, 8618, 8663, 8668, 8882])