https://github.com/topspinj/recommender-tutorial/blob/master/part-2-cold-start-problem.ipynb

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [69]:
movies = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [70]:
movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [71]:
import re
import math
movies['year'] = movies['title'].apply(lambda x: int(re.search(r"\(([0-9]+)\)", x).group(1)) if re.search(r"\(([0-9]+)\)", x) is not None else 0)
movies['title'] = movies['title'].apply(lambda x: x[:-6].rstrip())
movies['decade'] = movies['year'].apply(lambda x: 10*math.floor(x/10))
movies.head()

Unnamed: 0,movieId,title,genres,year,decade
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1990
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1990
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,1990
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,1990
4,5,Father of the Bride Part II,[Comedy],1995,1990


In [73]:
movies.loc[movies.year == 0]

Unnamed: 0,movieId,title,genres,year,decade
6059,40697,Bab,[Sci-Fi],0,0
9031,140956,Ready Play,"[Action, Sci-Fi, Thriller]",0,0
9091,143410,Hyen,[(no genres listed)],0,0
9138,147250,The Adventures of Sherlock Holmes and Doctor,[(no genres listed)],0,0
9179,149334,Nocturnal A,"[Drama, Thriller]",0,0
9259,156605,Pa,[(no genres listed)],0,0
9367,162414,Moo,[Drama],0,0
9448,167570,,[(no genres listed)],0,0
9514,171495,,[(no genres listed)],0,0
9515,171631,Maria Bamford: Ol,[(no genres listed)],0,0


In [83]:
movies = movies.loc[movies.year != 0]
movies = movies.loc[movies.decade >= 1900]

In [75]:
movies.loc[movies.title == 'Dark Knight, The']

Unnamed: 0,movieId,title,genres,year,decade
6710,58559,"Dark Knight, The","[Action, Crime, Drama, IMAX]",2008,2000


In [76]:
from re import search
## movies[movies['title'].str.contains(", The")]
movies['newname'] = movies['title'].apply(lambda x: 'The ' + x.replace(', The', '') if x.find(', The') != -1 else x)


International movies also have the original name and all movies starting with 'The' seems to have names modified.

In [77]:
movies[movies['title'].str.contains(", The")]

Unnamed: 0,movieId,title,genres,year,decade,newname
10,11,"American President, The","[Comedy, Drama, Romance]",1995,1990,The American President
28,29,"City of Lost Children, The (Cité des enfants p...","[Adventure, Drama, Fantasy, Mystery, Sci-Fi]",1995,1990,The City of Lost Children (Cité des enfants pe...
46,50,"Usual Suspects, The","[Crime, Mystery, Thriller]",1995,1990,The Usual Suspects
49,54,"Big Green, The","[Children, Comedy]",1995,1990,The Big Green
52,58,"Postman, The (Postino, Il)","[Comedy, Drama, Romance]",1994,1990,"The Postman (Postino, Il)"
...,...,...,...,...,...,...
8741,127180,"Story of Film: An Odyssey, The",[Documentary],2011,2010,The Story of Film: An Odyssey
8808,130578,"Gunman, The","[Action, Thriller]",2015,2010,The Gunman
9018,140523,"Visit, The","[Comedy, Horror]",2015,2010,The Visit
9120,145935,"Peanuts Movie, The","[Adventure, Animation, Children, Comedy]",2015,2010,The Peanuts Movie


In [78]:
from collections import Counter

genres_counts = Counter(g for genres in movies['genres'] for g in genres)
print(f"There are {len(genres_counts)} genre labels.")
print(genres_counts)

movies = movies[movies['genres']!='(no genres listed)']

del genres_counts['(no genres listed)']

There are 20 genre labels.
Counter({'Drama': 4359, 'Comedy': 3756, 'Thriller': 1892, 'Action': 1827, 'Romance': 1596, 'Adventure': 1263, 'Crime': 1199, 'Horror': 978, 'Sci-Fi': 978, 'Fantasy': 779, 'Children': 664, 'Animation': 611, 'Mystery': 573, 'Documentary': 440, 'War': 382, 'Musical': 334, 'Western': 167, 'IMAX': 158, 'Film-Noir': 87, '(no genres listed)': 25})


In [79]:
print("The 5 most common genres: \n", genres_counts.most_common(5))

The 5 most common genres: 
 [('Drama', 4359), ('Comedy', 3756), ('Thriller', 1892), ('Action', 1827), ('Romance', 1596)]


In [80]:
genres = list(genres_counts.keys())

for g in genres:
    movies[g] = movies['genres'].transform(lambda x: int(g in x))

In [112]:
pickle.dump(movies, open('movies.pickle', 'wb'))

In [84]:
movie_decades = pd.get_dummies(movies['decade'])
movie_decades.head()

Unnamed: 0,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0


In [85]:
movie_features = pd.concat([movies[genres], movie_decades], axis=1)
movie_features.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,...,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [86]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(movie_features, movie_features)
print(f"Dimensions of our movie features cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our movie features cosine similarity matrix: (9727, 9727)


In [111]:
import pickle

pickle.dump(cosine_sim, open('cosine_sim.pickle', 'wb'))

In [95]:
from fuzzywuzzy import process

def movie_finder(title):
    all_titles = movies['newname'].tolist()
    closest_match = process.extractOne(title,all_titles)
    return closest_match[0]

In [104]:
title = movie_finder('jumanji')
title

'Jumanji'

In [105]:
movie_idx = dict(zip(movies['newname'], list(movies.index)))
idx = movie_idx[title]
idx

1

In [106]:
n_recommendations=10
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:(n_recommendations+1)]
similar_movies = [i[0] for i in sim_scores]

In [107]:
print(f"Because you watched {title}:")
movies['newname'].iloc[similar_movies]

Because you watched Jumanji:


53                             The Indian in the Cupboard
109                             The NeverEnding Story III
1618           The NeverEnding Story II: The Next Chapter
8719                          The Cave of the Golden Rose
9565                                   Gulliver's Travels
1357                                        The Borrowers
1565                                            Tall Tale
2539                       We're Back! A Dinosaur's Story
5624    Kirikou and the Sorceress (Kirikou et la sorci...
5975    Asterix & Obelix vs. Caesar (Astérix et Obélix...
Name: newname, dtype: object

In [116]:
def get_content_based_recommendations(title_string, n_recommendations=10):
    title = movie_finder(title_string)
    idx = movie_idx[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:(n_recommendations+1)]
    similar_movies = [i[0] for i in sim_scores]
    print(f"Recommendations for {title}:")
    print(movies['title'].iloc[similar_movies])
    return movies['title'].iloc[similar_movies]

In [117]:
x = get_content_based_recommendations('aladin', 5)

Recommendations for Aladdin:
1177                                             Hercules
95                                 Muppet Treasure Island
673     Land Before Time III: The Time of the Great Gi...
1757                                        Bug's Life, A
3727                       Ferngully: The Last Rainforest
Name: title, dtype: object


In [122]:
x.tolist()

['Hercules',
 'Muppet Treasure Island',
 'Land Before Time III: The Time of the Great Giving',
 "Bug's Life, A",
 'Ferngully: The Last Rainforest']