## https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook
## https://www.kaggle.com/code/leedohyun/recommendation-engine-contents-user-based

In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading cleaned metadata
df = pd.read_csv("DataPreprocessing/clean_movies_metadata.csv")

In [3]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'original_language', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'release_year', 'release_month', 'rating', 'rating_count'],
      dtype='object')

In [4]:
# Load the per-movie keywords file; this cell only reads it and previews the first rows.
# Each `keywords` value is still a stringified list of {'id', 'name'} dicts at this point.
keywords = pd.read_csv('data/keywords.csv')
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
# Cast the 'id' column to int in both frames so the join key has the same dtype on each side.
keywords['id'] = keywords['id'].astype('int')
df['id'] =df['id'].astype('int')

In [6]:
# Attach the keywords to every movie via an inner join on 'id'.
# Only one keywords row per id is kept: a repeated id on the right-hand side
# would otherwise multiply the matching movie rows (and later their titles).
df = df.merge(keywords.drop_duplicates(subset='id'), on='id')

In [7]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,video,release_year,release_month,rating,rating_count,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,1995,10,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,1995,12,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,1995,12,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,1995,12,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,1995,2,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [8]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['keywords', 'genres']
for feature in features:
    # Parse only values that are still strings. Missing values (NaN) and values
    # that were already parsed pass through untouched, so this cell can be re-run
    # safely; get_names() later maps every non-list value to [].
    df[feature] = df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [9]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,video,release_year,release_month,rating,rating_count,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,1995,10,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,1995,12,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,1995,12,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,1995,12,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,1995,2,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [10]:
# parse the genre and keyword, and extract 'name'

def get_names(x, max_names=5):
    """Return the 'name' values of the leading entries of a list of dicts.

    Parameters
    ----------
    x : object
        Normally a list of dicts that each have a 'name' key. Any value that
        is not a list (for example NaN or None) produces an empty list.
    max_names : int or None, default 5
        Upper bound on how many names are returned; ``None`` keeps them all.

    Returns
    -------
    list
        The 'name' value of each of the first ``max_names`` entries, in order.

    Raises
    ------
    KeyError
        If one of the retained entries has no 'name' key.
    """
    if not isinstance(x, list):
        return []
    # A slice copes with lists shorter than the limit, so no length check is needed.
    return [entry['name'] for entry in x[:max_names]]

In [11]:

# Collapse each parsed column to a short list of plain name strings.
for column in features:
    df[column] = df[column].map(get_names)

In [12]:
df[['title', 'keywords', 'genres', 'tagline']].head()

Unnamed: 0,title,keywords,genres,tagline
0,Toy Story,"[jealousy, toy, boy, friendship, friends]","[Animation, Comedy, Family]",
1,Jumanji,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]",Roll the dice and unleash the excitement!
2,Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",Still Yelling. Still Fighting. Still Ready for...
3,Waiting to Exhale,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",Friends are the people who let you be yourself...
4,Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...",[Comedy],Just When His World Is Back To Normal... He's ...


In [13]:
#Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')
df['tagline'] = df['tagline'].fillna('')

In [14]:
df[['title', 'keywords', 'genres', 'tagline', 'overview']].head()

Unnamed: 0,title,keywords,genres,tagline,overview
0,Toy Story,"[jealousy, toy, boy, friendship, friends]","[Animation, Comedy, Family]",,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]",Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",Still Yelling. Still Fighting. Still Ready for...,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",Friends are the people who let you be yourself...,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...",[Comedy],Just When His World Is Back To Normal... He's ...,Just when George Banks has recovered from his ...


In [15]:
def create_soup(x):
    """Join a row's keywords and genres into one space-separated string.

    ``x`` must support ``x['keywords']`` and ``x['genres']``, each an iterable
    of strings. The keywords come first, then a single space, then the genres.
    """
    keyword_text = ' '.join(x['keywords'])
    genre_text = ' '.join(x['genres'])
    return '{} {}'.format(keyword_text, genre_text)

In [16]:
df['soup'] = df.apply(create_soup, axis=1)

In [17]:
# Append the overview to the keyword/genre text. The explicit ' ' separator stops
# the last genre from fusing with the first overview word (e.g. "Family" + "Led by
# Woody" -> "FamilyLed"), which would hand the vectorizer a bogus token.
# Note: re-running this cell appends the overview a second time.
df['soup'] = df['soup'] + ' ' + df['overview']

In [18]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     df['soup'].head(2)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
#tfid = TfidfVectorizer(stop_words='english')

### Building the full cos_sim matrix takes too long, so temporarily use only about 10000 rows
#tfid_matrix = tfid.fit_transform(df.overview.iloc[1:10000])

In [21]:
# cos_sim = cosine_similarity(tfid_matrix,tfid_matrix)

In [22]:
### making recommend engine based on cosine similarities

def recommend_engine(title, cos_sim, top_n=10):
    """Return the movies most similar to ``title`` together with their scores.

    Relies on the module-level ``indices`` Series, which maps a movie title
    (index) to its row position in ``cos_sim`` (values).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cos_sim : array-like
        Square similarity matrix; ``cos_sim[i]`` holds the scores of movie ``i``
        against every movie.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Similarity scores (float) indexed by movie title, best match first.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies yields a Series of positions; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cos_sim[idx], dtype=float).ravel()
    # A stable sort keeps tied scores in their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly instead of assuming it sorts first:
    # another row can tie with its self-similarity.
    order = order[order != idx][:top_n]

    # Build a fresh float Series rather than writing scores into a slice of the
    # integer-valued `indices` Series.
    return pd.Series(scores[order], index=indices.index[order])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# count = CountVectorizer(stop_words='english')
# count_matrix = count.fit_transform(df['soup'])

# TF-IDF over unigrams and bigrams of the combined text.
# min_df=1 (the library default) keeps every term, which is what min_df=0 asked for;
# recent scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['soup'])

#cosine_sim = cosine_similarity(count_matrix, count_matrix)
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
# NOTE(review): the result is a dense n_movies x n_movies array - confirm it fits in memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row position in `cosine_sim`.
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [None]:
def rank_plot(movie_name, cos_sim):
    """Plot the recommendations for ``movie_name`` as a bar chart.

    Calls ``recommend_engine(movie_name, cos_sim)`` and draws the returned
    scores on the x axis against the returned index labels on the y axis,
    then shows the figure.
    """
    recommendations = recommend_engine(movie_name, cos_sim)

    plt.figure(figsize=(10, 5))
    sns.barplot(x=recommendations[0:10], y=recommendations.index)

    chart_title = "Recommended Movies from  " + str.upper(movie_name) + " using cosine_sim"
    plt.title(chart_title, fontdict={'fontsize': 20})
    plt.xlabel("Cosine Similarities")
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
rank_plot("Jumanji", cosine_sim)