https://github.com/topspinj/recommender-tutorial/blob/master/part-2-cold-start-problem.ipynb

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from re import search

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Movie catalogue: one row per movie with its id, title and pipe-separated genres.
movies = pd.read_csv(
    filepath_or_buffer="https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv",
)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Load the ratings so we can select the 1,000 movies with the most ratings.

In [3]:
# User ratings: one row per (userId, movieId) rating event.
ratings = pd.read_csv(
    filepath_or_buffer="https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv",
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Count how many ratings each movie received and keep the 1,000 most-rated ids.
top_movie_ids = ratings.groupby(['movieId']).size()
top1k_rated_movies = top_movie_ids.nlargest(1000).index.tolist()

# Later cells add columns to `movies`; taking an explicit copy of the filtered
# rows makes those assignments operate on an independent frame instead of a
# slice of the original (the pattern pandas flags with SettingWithCopyWarning).
movies = movies.loc[movies.movieId.isin(top1k_rated_movies)].copy()
movies.shape

(1000, 3)

In [5]:
# Turn each pipe-delimited genre string into a list of genre labels.
movies['genres'] = movies['genres'].map(lambda genre_string: genre_string.split("|"))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"


Calculate the release year and the decade of each movie.

In [6]:
import re
import math

# The release year is the "(YYYY)" group that closes the title. Anchoring the
# pattern at the end of the string means an earlier parenthesised number is
# never mistaken for the year, and surrounding whitespace is tolerated.
YEAR_SUFFIX = re.compile(r"\s*\((\d{4})\)\s*$")


def _extract_year(title):
    """Return the trailing "(YYYY)" year of `title` as an int, or 0 if absent."""
    match = YEAR_SUFFIX.search(title)
    return int(match.group(1)) if match else 0


movies['year'] = movies['title'].apply(_extract_year)
# Remove the year suffix only when it is actually present; blindly slicing off
# the last six characters would truncate titles that carry no year.
movies['title'] = movies['title'].apply(lambda t: YEAR_SUFFIX.sub("", t).rstrip())
# Floor the year to its decade (1995 -> 1990); a year of 0 stays 0.
movies['decade'] = movies['year'] // 10 * 10
movies.head()

Unnamed: 0,movieId,title,genres,year,decade
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1990
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1990
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,1990
4,5,Father of the Bride Part II,[Comedy],1995,1990
5,6,Heat,"[Action, Crime, Thriller]",1995,1990


In [7]:
# Sanity check: list the rows where no year could be parsed (year is 0).
movies[movies['year'] == 0]

Unnamed: 0,movieId,title,genres,year,decade


In [8]:
# Keep only movies with a parsed year (non-zero) from the 1900s onwards.
has_valid_year = (movies.year != 0) & (movies.decade >= 1900)
# Copy the filtered rows: the next cell assigns a new column to `movies`, and
# doing that on a slice is what pandas flags with SettingWithCopyWarning.
movies = movies.loc[has_valid_year].copy()

International movies also include their original title in parentheses, and movies whose titles start with 'The' seem to have the article moved to the end of the name.

In [9]:

# Titles in this dataset carry a leading "The" at the end of the primary title
# ("Usual Suspects, The", "Postman, The (Postino, Il)"); `newname` moves it
# back to the front.
# Only a ", The" that closes the primary title is rewritten, i.e. one followed
# by the end of the string or by a parenthesised alternate title. A plain
# substring replace would also delete ", The" from the middle of a title or
# from words that merely start with it (", There", ", Them").
movies['newname'] = movies['title'].str.replace(
    r'^(.*?), The(?= \(|$)', r'The \1', n=1, regex=True
)
movies[movies['title'].str.contains(", The")]

Unnamed: 0,movieId,title,genres,year,decade,newname
10,11,"American President, The","[Comedy, Drama, Romance]",1995,1990,The American President
28,29,"City of Lost Children, The (Cité des enfants p...","[Adventure, Drama, Fantasy, Mystery, Sci-Fi]",1995,1990,The City of Lost Children (Cité des enfants pe...
46,50,"Usual Suspects, The","[Crime, Mystery, Thriller]",1995,1990,The Usual Suspects
52,58,"Postman, The (Postino, Il)","[Comedy, Drama, Romance]",1994,1990,"The Postman (Postino, Il)"
53,60,"Indian in the Cupboard, The","[Adventure, Children, Fantasy]",1995,1990,The Indian in the Cupboard
...,...,...,...,...,...,...
7776,91658,"Girl with the Dragon Tattoo, The","[Drama, Thriller]",2011,2010,The Girl with the Dragon Tattoo
7927,95510,"Amazing Spider-Man, The","[Action, Adventure, Sci-Fi, IMAX]",2012,2010,The Amazing Spider-Man
8053,98809,"Hobbit: An Unexpected Journey, The","[Adventure, Fantasy, IMAX]",2012,2010,The Hobbit: An Unexpected Journey
8305,106782,"Wolf of Wall Street, The","[Comedy, Crime, Drama]",2013,2010,The Wolf of Wall Street


In [10]:
from collections import Counter

# Placeholder label used when a movie has no genre information.
NO_GENRE = '(no genres listed)'

# Tally how often each genre label occurs across all movies.
genres_counts = Counter(g for genres in movies['genres'] for g in genres)
print(f"There are {len(genres_counts)} genre labels.")
print(genres_counts)

# `genres` holds a list of labels per movie, so comparing the column with a
# string is never equal and would filter nothing; check list membership instead.
has_genre = movies['genres'].apply(lambda labels: NO_GENRE not in labels)
# Copy so the per-genre columns added later are written to an independent frame.
movies = movies[has_genre].copy()

# Counter's `del` is a no-op when the key is absent, so this is safe either way.
del genres_counts[NO_GENRE]

There are 19 genre labels.
Counter({'Drama': 407, 'Comedy': 405, 'Action': 335, 'Thriller': 277, 'Adventure': 271, 'Sci-Fi': 194, 'Romance': 182, 'Crime': 162, 'Fantasy': 127, 'Children': 93, 'Mystery': 81, 'Animation': 74, 'Horror': 70, 'War': 52, 'IMAX': 45, 'Musical': 44, 'Western': 22, 'Film-Noir': 9, 'Documentary': 5})


In [11]:
# Show the five genre labels with the highest counts.
top_five = genres_counts.most_common(5)
print("The 5 most common genres: \n", top_five)

The 5 most common genres: 
 [('Drama', 407), ('Comedy', 405), ('Action', 335), ('Thriller', 277), ('Adventure', 271)]


In [12]:
# Add one 0/1 indicator column per genre label: 1 when the movie's genre list
# contains that label, 0 otherwise.
genres = [label for label in genres_counts]

for g in genres:
    movies[g] = [int(g in labels) for labels in movies['genres']]

In [13]:
# Persist the enriched movies frame. The context manager closes the file handle
# even if pickling fails.
# NOTE(review): a later cell writes a different object to this same
# 'movies.pickle' path, so this snapshot does not survive a full run.
with open('movies.pickle', 'wb') as fh:
    pickle.dump(movies, fh)

In [14]:
# One indicator column per decade, aligned on the index of `movies`.
# NOTE(review): newer pandas releases return boolean dummies by default; pass
# dtype=int here if 0/1 integers are required downstream (confirm).
movie_decades = pd.get_dummies(movies['decade'])
movie_decades.head()

Unnamed: 0,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,1,0,0


In [15]:
# Number of movies per decade.
movies['decade'].groupby(movies['decade']).count()

decade
1930      3
1940     10
1950     17
1960     32
1970     46
1980    124
1990    386
2000    310
2010     71
Name: decade, dtype: int64

In [30]:
# Feature table: genre indicators + display title + decade indicators, joined
# column-wise on the shared index and then renumbered 0..n-1 so that row
# labels match row positions.
movie_features = pd.concat(
    [movies[genres + ['newname']], movie_decades], axis=1
).reset_index(drop=True)

# Context managers make sure each pickle file is flushed and closed.
with open('movie_features.pickle', 'wb') as fh:
    pickle.dump(movie_features, fh)
# 'movies.pickle' ends up holding only the `newname` column of the feature table.
with open('movies.pickle', 'wb') as fh:
    pickle.dump(movie_features[['newname']], fh)
movie_features.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Action,Crime,Thriller,Drama,...,newname,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,1,1,1,1,1,0,0,0,0,0,...,Toy Story,0,0,0,0,0,0,1,0,0
1,1,0,1,0,1,0,0,0,0,0,...,Jumanji,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,1,0,0,0,0,...,Grumpier Old Men,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,Father of the Bride Part II,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,1,1,0,...,Heat,0,0,0,0,0,0,1,0,0


In [31]:
# Read the pickle back to verify what was written; the context manager closes
# the file handle instead of leaving it open.
# NOTE: pickle.load executes arbitrary code — only load files this notebook wrote.
with open('movies.pickle', 'rb') as fh:
    saved_titles = pickle.load(fh)
saved_titles.head()

Unnamed: 0,newname
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Father of the Bride Part II
4,Heat


In [18]:
# Every column of the feature table except the display title is a model feature.
features = movie_features.columns.tolist()
features.remove('newname')

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the feature rows of all movies; entry
# [i, j] compares the movie in row i with the movie in row j.
feature_matrix = movie_features[features]
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)
print(f"Dimensions of our movie features cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our movie features cosine similarity matrix: (999, 999)


In [20]:
# Sanity check: (number of movies, number of feature columns plus `newname`).
movie_features.shape

(999, 29)

In [None]:
#np.savez_compressed('cosine_sim.npz', a=cosine_sim) #for heroku
#loaded = np.load('cosine_sim.npz')
#cosine_sim = loaded['a']

In [21]:
# Persist the similarity matrix; the context manager closes the file handle.
# NOTE(review): the original size note read "~700mb / 8mb" — presumably the
# full catalogue versus this 1k-movie subset; confirm.
with open('cosine_sim1k.pickle', 'wb') as fh:
    pickle.dump(cosine_sim, fh)

In [22]:
from fuzzywuzzy import process

def movie_finder(title):
    """Return the catalogue title that best fuzzy-matches `title`.

    The candidates are the `newname` values of `movie_features`. The match is
    picked by `process.extractOne`, whose first element is the matched string.
    """
    return process.extractOne(title, movie_features['newname'].tolist())[0]

In [23]:
# Resolve a partial, lower-cased query to the closest catalogue title.
title = movie_finder(title='star wars Episode II - Attack')
title

'Star Wars: Episode II - Attack of the Clones'

In [24]:
# Map each display title to its row label in `movie_features`.
# NOTE: titles are dict keys, so movies that share a `newname` collapse to a
# single entry and the last such row wins.
movie_idx = dict(zip(movie_features['newname'], movie_features.index))
# The context manager closes the file handle once the mapping is written.
with open('movie_idx.pickle', 'wb') as fh:
    pickle.dump(movie_idx, fh)

idx = movie_idx[title]

In [25]:
def get_content_based_recommendations(title_string, n_recommendations=10):
    """Print and return the titles most similar to the best match for `title_string`.

    The query is resolved to a catalogue title with `movie_finder`, looked up
    in `movie_idx`, and compared with every movie through its row of
    `cosine_sim`.

    Args:
        title_string: Free-text title; it is matched via `movie_finder`, so it
            does not have to be exact.
        n_recommendations: Maximum number of titles to return (default 10).
            Values below 1 yield an empty list.

    Returns:
        List of `newname` titles ordered from most to least similar. The
        matched movie itself is never included.

    Raises:
        KeyError: if the matched title is not a key of `movie_idx`.
    """
    title = movie_finder(title_string)
    idx = movie_idx[title]
    scores = cosine_sim[idx]
    # Exclude the query movie by position rather than by dropping whatever
    # sorts first: movies with identical feature vectors tie at the top score,
    # and in that case the first sorted entry is not necessarily the query.
    candidates = (i for i in range(len(scores)) if i != idx)
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)
    similar_movies = ranked[:max(n_recommendations, 0)]
    # Rows of cosine_sim are positional, so select the titles by position too.
    recommendations = movie_features['newname'].iloc[similar_movies].tolist()
    print(f"Recommendations for {title}:\n")
    print(recommendations)
    return recommendations

In [26]:
# Ask for the five closest movies to the title resolved above.
x = get_content_based_recommendations(title, n_recommendations=5)

Recommendations for Star Wars: Episode II - Attack of the Clones:

['Spider-Man 2', 'Star Trek', 'Avatar', 'The Matrix Reloaded', 'The Matrix Revolutions']
