In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Load the ratings table (tab-separated, latin-1 encoded), keeping only the
# id columns, their 0-based embedding-index counterparts and the rating.
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])

# Largest user_id / movie_id that occurs in the ratings; both are handed to
# CFModel when the model is built. max() gives the same answer with or without
# repeated values, so no drop_duplicates() pass is needed first.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# Load the users table (same separator and encoding).
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Load the movies table (same separator and encoding).
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [24]:
# Highest user_id and highest movie_id found in the ratings file.
# These are maximum ids rather than row counts: they only equal the number of
# users / movies if the ids run from 1 upwards without gaps.
print(max_userid, max_movieid, sep='\n')

6040
3952


In [25]:
# Don't confuse 'user_id' with 'user_emb_id': they identify the same user, but
# user_id starts at 1 while user_emb_id starts at 0 (user_emb_id = user_id - 1).
# The same applies to movie_id and movie_emb_id.
ratings

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354
...,...,...,...,...,...
1000204,6040,1091,1,6039,1090
1000205,6040,1094,5,6039,1093
1000206,6040,562,5,6039,561
1000207,6040,1096,4,6039,1095


In [26]:
# Movie table: movie_id, title and genres (genres are '|'-separated strings).
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [27]:
# The demographic columns (gender, zipcode, age, occupation) are shown for
# reference only; the model further down is trained on user ids, movie ids and
# ratings alone.
users

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
0,1,F,48067,Under 18,K-12 student
1,2,M,70072,56+,self-employed
2,3,M,55117,25-34,scientist
3,4,M,02460,45-49,executive/managerial
4,5,M,55455,25-34,writer
...,...,...,...,...,...
6035,6036,F,32603,25-34,scientist
6036,6037,F,76006,45-49,academic/educator
6037,6038,F,14706,56+,academic/educator
6038,6039,F,01060,45-49,other or not specified


In [28]:
# User x movie rating table: one row per user_id, one column per movie title.
# A sparse matrix is usually described as "mostly zeros"; in this table the
# empty cells (movies a user has not rated) are NaN rather than 0.
merged_df = ratings.merge(movies, on='movie_id')
sparse_matrix = pd.pivot_table(merged_df, values='rating', index='user_id', columns='title')
sparse_matrix.head()

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",187 (1997),2 Days in the Valley (1996),20 Dates (1998),"20,000 Leagues Under the Sea (1954)",200 Cigarettes (1999),2001: A Space Odyssey (1968),2010 (1984),24 7: Twenty Four Seven (1997),24-hour Woman (1998),28 Days (2000),3 Ninjas: High Noon On Mega Mountain (1998),3 Strikes (2000),"301, 302 (1995)","39 Steps, The (1935)","400 Blows, The (Les Quatre cents coups) (1959)",42 Up (1998),52 Pick-Up (1986),54 (1998),"7th Voyage of Sinbad, The (1958)",8 1/2 (1963),8 1/2 Women (1999),8 Heads in a Duffel Bag (1997),8 Seconds (1994),8MM (1999),A Chef in Love (1996),Abbott and Costello Meet Frankenstein (1948),"Abominable Snowman, The (1957)",About Adam (2000),About Last Night... (1986),...,Wonderland (1997),Wonderland (1999),Woo (1998),"Wood, The (1999)","Wooden Man's Bride, The (Wu Kui) (1994)",Working Girl (1988),"World Is Not Enough, The (1999)","World of Apu, The (Apur Sansar) (1959)","Wrong Man, The (1956)","Wrong Trousers, The (1993)",Wrongfully Accused (1998),Wyatt Earp (1994),"X-Files: Fight the Future, The (1998)",X-Men (2000),X: The Unknown (1956),Xiu Xiu: The Sent-Down Girl (Tian yu) (1998),Yankee Zulu (1994),"Yards, The (1999)","Year My Voice Broke, The (1987)",Year of Living Dangerously (1982),Year of the Horse (1997),Yellow Submarine (1968),Yojimbo (1961),You Can't Take It With You (1938),You So Crazy (1994),You've Got Mail (1998),Young Doctors in Love (1982),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,4.0,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,...,,,,,,,,,,,,,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
# Shuffle the rating rows before training. Only the 0-based 'user_emb_id' and
# 'movie_emb_id' columns are used as model inputs (USERS and MOVIES); RATINGS
# is the target.
# The shuffle is seeded so that the row order -- and with it the
# train/validation split made by model.fit -- is the same on every run.
# NOTE(review): Keras documents validation_split as taking the last fraction
# of the arrays, which is why the row order matters here -- confirm for 1.2.2.
SHUFFLE_SEED = 42
shuffled_ratings = ratings.sample(frac=1., random_state=SHUFFLE_SEED)

USERS = shuffled_ratings['user_emb_id'].values
print('USERS:', USERS, ', shape =', USERS.shape)

MOVIES = shuffled_ratings['movie_emb_id'].values
print('MOVIES:', MOVIES, ', shape =', MOVIES.shape)

RATINGS = shuffled_ratings['rating'].values
print('RATINGS:', RATINGS, ', shape =', RATINGS.shape)

USERS: [4417 2287 2790 ...  970 1032 1903] , shape = (1000209,)
MOVIES: [3124 1041 3872 ... 1220 2426 3440] , shape = (1000209,)
RATINGS: [5 1 4 ... 4 2 3] , shape = (1000209,)


In [0]:
#Simon Funk's architecture is coded with Keras version 1, so we downgrade Keras to version 1.2.2
#Keras version 2 does not support the Merge layer
!pip install keras==1.2.2

Collecting keras==1.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/75/3e/9926ce5c678b7a7978724a2ecf24857d89a415d152b8d3443e6d45c228b2/Keras-1.2.2.tar.gz (175kB)
[K     |█▉                              | 10kB 19.2MB/s eta 0:00:01[K     |███▊                            | 20kB 3.3MB/s eta 0:00:01[K     |█████▋                          | 30kB 4.7MB/s eta 0:00:01[K     |███████▌                        | 40kB 3.1MB/s eta 0:00:01[K     |█████████▍                      | 51kB 3.8MB/s eta 0:00:01[K     |███████████▏                    | 61kB 4.5MB/s eta 0:00:01[K     |█████████████                   | 71kB 5.2MB/s eta 0:00:01[K     |███████████████                 | 81kB 5.8MB/s eta 0:00:01[K     |████████████████▉               | 92kB 6.5MB/s eta 0:00:01[K     |██████████████████▊             | 102kB 5.0MB/s eta 0:00:01[K     |████████████████████▌           | 112kB 5.0MB/s eta 0:00:01[K     |██████████████████████▍         | 122kB 5.0MB/s eta 0:00:01

In [0]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
#Importing Simon Funk's Architecture.
from SimonFunk import CFModel

In [0]:
# Build the collaborative-filtering model from the largest user id, the largest
# movie id and 100 as the third argument (presumably the number of latent
# factors -- confirm in SimonFunk.py).
# Training minimises mean squared error (MSE) with the Adamax optimizer.
model = CFModel(max_userid, max_movieid, 100)
model.compile(optimizer='adamax', loss='mse')

In [33]:
# Training callbacks:
#   - EarlyStopping watches val_loss with a patience of 2 epochs
#   - ModelCheckpoint writes to weights.h5, keeping only the best result
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(filepath='weights.h5', save_best_only=True),
]

# Up to 30 epochs; 90% of the samples are used for training and 10% are held
# out for validation
history = model.fit(
    [USERS, MOVIES],
    RATINGS,
    nb_epoch=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 900188 samples, validate on 100021 samples
Epoch 1/30
209s - loss: 8.2116 - val_loss: 2.2828
Epoch 2/30
210s - loss: 1.4895 - val_loss: 1.1352
Epoch 3/30
209s - loss: 1.0024 - val_loss: 0.9388
Epoch 4/30
219s - loss: 0.8889 - val_loss: 0.8738
Epoch 5/30
214s - loss: 0.8393 - val_loss: 0.8390
Epoch 6/30
213s - loss: 0.8073 - val_loss: 0.8173
Epoch 7/30
221s - loss: 0.7834 - val_loss: 0.8014
Epoch 8/30
212s - loss: 0.7623 - val_loss: 0.7883
Epoch 9/30
216s - loss: 0.7423 - val_loss: 0.7791
Epoch 10/30
213s - loss: 0.7229 - val_loss: 0.7690
Epoch 11/30
220s - loss: 0.7038 - val_loss: 0.7619
Epoch 12/30
212s - loss: 0.6848 - val_loss: 0.7558
Epoch 13/30
218s - loss: 0.6657 - val_loss: 0.7501
Epoch 14/30
213s - loss: 0.6463 - val_loss: 0.7494
Epoch 15/30
215s - loss: 0.6261 - val_loss: 0.7467
Epoch 16/30
209s - loss: 0.6062 - val_loss: 0.7452
Epoch 17/30
209s - loss: 0.5858 - val_loss: 0.7468
Epoch 18/30
209s - loss: 0.5653 - val_loss: 0.7487
Epoch 19/30
209s - loss: 0.5454 - val_l

In [0]:
# Rebuild a model with the same constructor arguments used for training, then
# load the checkpoint that ModelCheckpoint(save_best_only=True) wrote to
# weights.h5 during the fit above.
trained_model = CFModel(max_userid, max_movieid, 100)
trained_model.load_weights('weights.h5')

In [35]:
# Profile of user_id 2309, picked as an arbitrary example user
users.loc[users['user_id'] == 2309]

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
2308,2309,F,8901,45-49,other or not specified


In [0]:
# Helper that asks the trained model for one (user, movie) prediction.
# rate() comes from 'SimonFunk.py' and is called with 0-based indices.
def predict_rating(user_id, movie_id):
    """Return the model's predicted rating for a 1-based user_id / movie_id.

    Both ids are shifted down by one before being handed to
    trained_model.rate(), mirroring the relation between user_id / movie_id
    (starting at 1) and user_emb_id / movie_emb_id (starting at 0).
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

In [37]:
# Every rating given by user 2309: the movie that was rated and the score.
random_user = ratings.loc[ratings['user_id'] == 2309, ['user_id', 'movie_id', 'rating']]
random_user

Unnamed: 0,user_id,movie_id,rating
393025,2309,1259,4
393026,2309,587,5
393027,2309,3006,5
393028,2309,7,4
393029,2309,3946,1
...,...,...,...
393193,2309,1090,4
393194,2309,1096,3
393195,2309,1097,3
393196,2309,1242,3


In [38]:
# Compare user 2309's actual ratings with the model's predictions and show the
# 20 highest-rated movies.
# predict_rating(user_id, movie_id) is applied row by row to every movie the
# user has already rated; the result is then joined with 'movies' to attach
# the title and genres.
# .copy() makes random_user an independent frame, so adding the 'prediction'
# column does not write into a slice of 'ratings' (SettingWithCopyWarning).
random_user = ratings.loc[ratings['user_id'] == 2309, ['user_id', 'movie_id', 'rating']].copy()
random_user['prediction'] = random_user.apply(lambda row: predict_rating(2309, row['movie_id']), axis=1)
(
    random_user
    .sort_values(by='rating', ascending=False)
    .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
    .head(20)
)

Unnamed: 0,user_id,movie_id,rating,prediction,title,genres
0,2309,351,5,3.469095,"Corrina, Corrina (1994)",Comedy|Drama|Romance
1,2309,2028,5,3.471596,Saving Private Ryan (1998),Action|Drama|War
2,2309,1353,5,3.612819,"Mirror Has Two Faces, The (1996)",Comedy|Romance
3,2309,2268,5,4.084363,"Few Good Men, A (1992)",Crime|Drama
4,2309,587,5,4.245144,Ghost (1990),Comedy|Romance|Thriller
5,2309,1207,5,3.951824,To Kill a Mockingbird (1962),Drama
6,2309,161,5,3.80928,Crimson Tide (1995),Drama|Thriller|War
7,2309,3871,5,3.468126,Shane (1953),Drama|Western
8,2309,3095,5,3.656307,"Grapes of Wrath, The (1940)",Drama
9,2309,1307,5,4.106775,When Harry Met Sally... (1989),Comedy|Romance


In [39]:
# Recommend 20 movies that user 2309 has not rated yet, ordered by predicted
# rating (highest first).
# Candidates are the distinct movie_ids that appear in 'ratings' but not in
# random_user. De-duplicating first keeps the isin() mask small, and '~' is the
# idiomatic negation of a boolean mask.
# .copy() makes 'recommendations' an independent frame before a column is added.
recommendations = (
    ratings[['movie_id']]
    .drop_duplicates()
    .loc[lambda candidates: ~candidates['movie_id'].isin(random_user['movie_id'])]
    .copy()
)
recommendations['prediction'] = recommendations.apply(lambda row: predict_rating(2309, row['movie_id']), axis=1)
(
    recommendations
    .sort_values(by='prediction', ascending=False)
    .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
    .head(20)
)

Unnamed: 0,movie_id,prediction,title,genres
0,2609,4.473856,"King of Masks, The (Bian Lian) (1996)",Drama
1,1035,4.351915,"Sound of Music, The (1965)",Musical
2,597,4.280185,Pretty Woman (1990),Comedy|Romance
3,2905,4.250436,Sanjuro (1962),Action|Adventure
4,3092,4.248705,Chushingura (1962),Drama
5,508,4.179266,Philadelphia (1993),Drama
6,1111,4.16783,Microcosmos (Microcosmos: Le peuple de l'herbe...,Documentary
7,539,4.163404,Sleepless in Seattle (1993),Comedy|Romance
8,3844,4.15887,Steel Magnolias (1989),Drama
9,2810,4.147011,Perfect Blue (1997),Animation|Mystery
