## Initialize

In [1]:
import pandas as pd

## Load Data

In [2]:
# Movies metadata
dfMvs = pd.read_csv('movies_metadata_clean.csv')

# User ratings for each movie
dfMvsRtg = pd.read_csv('movies_ratings_small.csv')
dfMvs.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,10/30/1995,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,12/15/1995,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,12/22/1995,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,12/22/1995,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,2/10/1995,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


## Build Recommendations

### 1. Content Based Filtering

In [3]:
dfMvs.shape

(45463, 24)

In [4]:
# Remove Duplicates
dfMvs.drop_duplicates(subset='title', keep='first', inplace=True)
dfMvs.shape

(42277, 24)

In [5]:
# Prepare description (documents) column
dfMvs['tagline'] = dfMvs['tagline'].fillna('')
dfMvs['description'] = dfMvs['overview'] + dfMvs['tagline']
dfMvs['description'] = dfMvs['description'].fillna('')
dfMvs['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
45456    It's the year 3000 AD. The world's most danger...
45458    Rising and falling between a man and woman.Ris...
45459    An artist struggles to finish his work while a...
45461    In a small town live two brothers, one a minis...
45462    50 years after decriminalisation of homosexual...
Name: description, Length: 42277, dtype: object

### Build Model

In [6]:
# Generate a matrix of common terms that show up in each movie

from sklearn.feature_extraction.text import TfidfVectorizer

mdlTfidfMvs = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix = mdlTfidfMvs.fit_transform(dfMvs['description'])
tfidf_matrix.shape

(42277, 1045224)

In [7]:
# Calculate the cosine similarity between each pair of movies
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim.shape

(42277, 42277)

## Generate Recommendation

In [8]:
# Prepare recommendation function
titles = dfMvs['title']
indicies = pd.Series(dfMvs.index, index=dfMvs['title'])

def get_recommendations(title):
    idx = indicies[title]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, reverse=True, key=lambda x: x[1])
    sim_scores = sim_scores[1:11]
    movie_indicies = [i[0] for i in sim_scores]
    return titles.iloc[movie_indicies]

In [9]:
get_recommendations('The Godfather')

44027    The Godfather Trilogy: 1972-1990
1178               The Godfather: Part II
31971                    Honor Thy Father
23125                          Blood Ties
38027            A Mother Should Be Loved
18322                     The Outside Man
11297                    Household Saints
4324                                 Made
5433                   Johnny Dangerously
18224                           Miss Bala
Name: title, dtype: object

In [10]:
get_recommendations('Jumanji')

21632                  Table No. 21
9503                      Word Wars
43124                       The Bar
8801                        Quintet
17223                The Dark Angel
37444            The Ouija Exorcism
15512               Le Pont du Nord
34771    Doctor Who: Last Christmas
44373             Liar Game: Reborn
35507                      The Mend
Name: title, dtype: object

In [11]:
get_recommendations('The Lion King')

9115                  The Lion King 2: Simba's Pride
571                                            Andre
34680    How the Lion Cub and the Turtle Sang a Song
37467                                The Real Miyagi
9353                               The Lion King 1¬Ω
416                                     Black Beauty
17041                                   African Cats
11676                                  Becoming Jane
9776                                Courage Mountain
6094                                       Born Free
Name: title, dtype: object

### 2. Collaborative Filtering
This is more unsupervised learning in terms of the way we use features, but a lot of the libraries will fall under the supervised learning models (because we are predicting another user's rating).

In [12]:
dfMvsRtg.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [15]:
# Prepare the data into the surprise format

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split 

X = Dataset.load_from_df(dfMvsRtg.drop('timestamp', axis=1), reader=Reader())

In [16]:
X_train, X_test = train_test_split(X, test_size=0.25)

In [17]:
# Define, fit, predict, evaluate

mdlSvdMvsRtg = SVD()
mdlSvdMvsRtg.fit(X_train)
test_pred = mdlSvdMvsRtg.test(X_test)

In [18]:
# find RMSE accuracy

from surprise import accuracy

accuracy.rmse(test_pred)

RMSE: 0.9021


0.902057623020617

In [19]:
# Cross-validate

from surprise.model_selection import cross_validate

# Run 5-fold cv
cross_validate(mdlSvdMvsRtg, X, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8905  0.8929  0.8942  0.9015  0.8986  0.8955  0.0040  
MAE (testset)     0.6876  0.6889  0.6897  0.6914  0.6929  0.6901  0.0019  
Fit time          6.33    5.95    6.33    6.63    6.90    6.43    0.32    
Test time         0.30    0.20    0.17    0.37    0.18    0.24    0.08    


{'test_rmse': array([0.89045756, 0.89285586, 0.89420079, 0.90148555, 0.89863697]),
 'test_mae': array([0.68762394, 0.68888329, 0.68974833, 0.69143266, 0.69286163]),
 'fit_time': (6.325135231018066,
  5.9491636753082275,
  6.33420991897583,
  6.631695985794067,
  6.903264045715332),
 'test_time': (0.2991199493408203,
  0.201002836227417,
  0.17285585403442383,
  0.36893582344055176,
  0.18295621871948242)}

In [25]:
dfMvsRtg[dfMvsRtg.userId==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [21]:
mdlSvdMvsRtg.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7652781014354066, details={'was_impossible': False})

There are more types of matrix factorization out there, but SVD is the simplest... Also it is well optimized through surprise