In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from surprise import Reader, Dataset, SVD
import joblib

In [2]:
# load dataset
md = pd.read_csv('./sources/movies_metadata.csv')
credits = pd.read_csv('./sources/credits.csv')
keywords = pd.read_csv('./sources/keywords.csv')
ratings = pd.read_csv('./sources/ratings.csv')
links = pd.read_csv('./sources/links.csv')
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')
links.rename(columns = {'movieId': 'id'}, inplace=True)
md = md.merge(links, on="id")
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# data pre-processing

smd = md[md['id'].isin(links)]
smd.shape

(7639, 26)

In [4]:
# maybe unecessary?
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [5]:
# Initialize values for weighted_rating to work
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.95)

In [6]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [7]:
# setup description based recommendations
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [8]:
# get cosine_sim and tfidf matrix
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
# setup for get_recommendations function
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [10]:
def get_recommendations(title):
    index = indices[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [11]:
# metadata based recommendations
credits['id'] = credits['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')
md['id'] = md['id'].astype('int')

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
smd = md[md['id'].isin(links)]

In [12]:
# restrict crew to director and cast to top 3
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [13]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [14]:
# more data manip
smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# strip spaces, convert to lowercase for all features
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# mention director 3 times for more weight
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [15]:
# pre-processing keywords
# calculate frequency counts of every keyword
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
# remove keywords that only appear once
s = s[s > 1]
# convert each word to their stem (ie. so 'cars' and 'car' are the same)
stemmer = SnowballStemmer('english')

  s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)


In [16]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [17]:
# setup for improved_recommendations
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# create word soup
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

# smd.head()

# # create count vectorizer
# count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
# count_matrix = count.fit_transform(smd['soup'])
# cosine_sim = cosine_similarity(count_matrix, count_matrix)

smd = smd.reset_index()
# titles = smd['title']
# indices = pd.Series(smd.index, index=smd['title'])

In [18]:
smd.pop('index')
smd.pop('adult')
smd.pop('belongs_to_collection')
smd.pop('genres')
smd.pop('homepage')
smd.pop('imdb_id')
smd.pop('original_language')
# smd.pop('original_title')
smd.pop('cast')
smd.pop('crew')
smd.pop('keywords')
smd.pop('cast_size')
smd.pop('crew_size')
smd.pop('director')
smd.pop('budget')
# smd.pop('poster_path')
smd.pop('production_companies')
smd.pop('production_countries')
smd.pop('revenue')
# smd.pop('release_date')
smd.pop('runtime')
smd.pop('spoken_languages')
smd.pop('status')
smd.pop('tagline')
smd.pop('video')
smd.pop('popularity')
smd.pop('id')
smd.pop('imdbId')

smd.head()

Unnamed: 0,original_title,overview,poster_path,release_date,title,vote_average,vote_count,tmdbId,year,soup
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,Toy Story,7.7,5415.0,88224.0,1995,jealousi toy boy friendship friend rivalri boy...
1,Jumanji,When siblings Judy and Peter discover an encha...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,Jumanji,6.9,2413.0,42164.0,1995,disappear basedonchildren'sbook newhom reclus ...
2,Heat,"Obsessive master thief, Neil McCauley leads a ...",/zMyfPUelumio3tiDKPffaUpsQTD.jpg,1995-12-15,Heat,7.7,1886.0,220.0,1995,robberi detect bank obsess chase shoot thief h...
3,GoldenEye,James Bond must unmask the mysterious head of ...,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,1995-11-16,GoldenEye,6.6,1194.0,23449.0,1995,cuba falselyaccus secretident computervirus se...
4,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",/odM9973kIv9hcjfHPp6g6BlyTIJ.jpg,1995-12-22,Cutthroat Island,5.7,137.0,9361.0,1995,exoticisland treasur map ship pirat geenadavis...


In [19]:
new_first = smd.pop('tmdbId')
smd.insert(0, 'tmdbId', new_first)
new_second = smd.pop('title')
smd.insert(1, 'title', new_second)
description = smd.pop('overview')
smd.insert(2, 'description', description)
year = smd.pop('year')
smd.insert(3, 'year', year)

smd = smd[smd['tmdbId'].isin(links)]
smd['tmdbId'] = smd['tmdbId'].astype(int)
smd['vote_count'] = smd['vote_count'].astype(int)

In [21]:
smd.head()

Unnamed: 0,tmdbId,title,description,year,original_title,poster_path,release_date,vote_average,vote_count,soup
0,88224,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995,Toy Story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,7.7,5415,jealousi toy boy friendship friend rivalri boy...
1,42164,Jumanji,When siblings Judy and Peter discover an encha...,1995,Jumanji,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,6.9,2413,disappear basedonchildren'sbook newhom reclus ...
2,220,Heat,"Obsessive master thief, Neil McCauley leads a ...",1995,Heat,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,1995-12-15,7.7,1886,robberi detect bank obsess chase shoot thief h...
3,23449,GoldenEye,James Bond must unmask the mysterious head of ...,1995,GoldenEye,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,1995-11-16,6.6,1194,cuba falselyaccus secretident computervirus se...
4,9361,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",1995,Cutthroat Island,/odM9973kIv9hcjfHPp6g6BlyTIJ.jpg,1995-12-22,5.7,137,exoticisland treasur map ship pirat geenadavis...


In [22]:
smd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7735 entries, 0 to 7766
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tmdbId          7735 non-null   int64  
 1   title           7735 non-null   object 
 2   description     7601 non-null   object 
 3   year            7735 non-null   object 
 4   original_title  7735 non-null   object 
 5   poster_path     7714 non-null   object 
 6   release_date    7727 non-null   object 
 7   vote_average    7735 non-null   float64
 8   vote_count      7735 non-null   int64  
 9   soup            7735 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 664.7+ KB


In [23]:
smd.to_csv('best_metadata_big.csv', index=False)

In [146]:
def improved_recommendations(title):
    index = indices[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    m = vote_counts.quantile(0.60)
    C = vote_averages.mean()
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [147]:
improved_recommendations('The Dark Knight')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,year,wr
941,Pirates of the Caribbean: The Curse of the Bla...,7191,7,2003,6.745183
109,Batman,2145,7,1989,6.444463
775,The Others,1708,7,2001,6.381221
287,Grease,1633,7,1978,6.368891
1242,Crash,1172,7,2004,6.280801
1305,Ice Age: The Meltdown,3034,6,2006,5.952643
1345,The Fountain,852,6,2006,5.918
312,My Best Friend's Wedding,606,6,1997,5.910629
24,Batman Forever,1529,5,1995,5.515381
311,Batman & Robin,1447,4,1997,5.122749


In [148]:
# Collaborative filtering
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa127efde20>

In [149]:
# Hybrid recommender
# Input: userId and movie title
# Output: similar movies sorted based on expected ratings by that particular user
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [150]:
# prepare data for hybrid function
id_map = pd.read_csv('./sources/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
indices_map = id_map.set_index('id')

KeyError: "['id'] not in index"

In [None]:
indices['Toy Story']

In [None]:
def hybrid(userId, title):
    index = indices[title]
    # maybe not necessary?
    # tmdbId = id_map.loc[title]['id']
    # movie_id = id_map.loc[title]['movieId']

    sim_scores = list(enumerate(cosine_sim[int(index)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [None]:
hybrid(500, 'Avatar')