# Generation of a cosine similarity matrix to compute movie similarity based on movie features

## Data from The Movies Database (tmdb)
 + Metadata
 + credits

## OBS: NLTK needs to be installed in the environment before you run the notebook. So unhash the code below for installation

In [1]:
# Installation of NLTK
#pip install nltk

In [2]:
import pandas as pd
import numpy as np


import scipy.sparse as sp
from typing import List

from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer, PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# Construction of the feature dataframe for the similarity measurement

## Load data from tmdb dataset

In [3]:
df_meta = pd.read_csv("../data/tmdb/movies_metadata.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
df_meta.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [6]:
df_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
# load credit and keyword table
df_credits = pd.read_csv("../data/tmdb/credits.csv")
df_keywords = pd.read_csv("../data/tmdb/keywords.csv")

In [8]:
df_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [9]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [10]:
df_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


## Cleaning data (tmdb credit and keyword df)

In [11]:
# check for duplicates id 
#df_keywords.id.nunique()

There are 987 duplicates in df_keywords

In [12]:
# check out the duplicated ids
#duplicated_keywords = df_keywords[df_keywords.duplicated(subset="id")]
# duplicated_keywords.head()

In [13]:
# check for duplicates id 
# df_credits.id.nunique()

In [14]:
# check out the duplicated ids
# duplicated_credits = df_credits[df_credits.duplicated(subset="id")]
# duplicated_credits.head()

In [15]:
# drop duplicates
# df_keywords.drop_duplicates(subset="id", inplace=True)
# df_credits.drop_duplicates(subset="id", inplace=True)

In [16]:
# Merging keywords and credits df
df_features = df_keywords.merge(df_credits, on="id")
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46496 entries, 0 to 46495
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46496 non-null  int64 
 1   keywords  46496 non-null  object
 2   cast      46496 non-null  object
 3   crew      46496 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


In [17]:
#apply literal_eval from Abstract Syntax Trees (ast) to convert str into a list of dictionaries
df_features['cast'] = df_features['cast'].apply(literal_eval)
df_features['crew'] = df_features['crew'].apply(literal_eval)
df_features['keywords'] = df_features['keywords'].apply(literal_eval)

In [18]:
#create function to get list of lists
def get_lists(key, column, df):
    x = []

    for i in range(df.shape[0]):
        values = [d[f"{key}"] for d in df[f"{column}"][i]]
        x.append(values)
    return x

In [19]:
#add new columns to features data frame
df_features["crew_jobs"] = pd.Series(get_lists("job", "crew", df_features))
df_features["actors"] = pd.Series(get_lists("name", "cast", df_features))
df_features["keywords"] = pd.Series(get_lists("name", "keywords", df_features))

In [20]:
# select only cleaned columns for the new dataset
df_features = df_features[["id", "actors", "keywords", "crew_jobs"]]
df_features.head()

Unnamed: 0,id,actors,keywords,crew_jobs
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,..."
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi..."
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]"
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro..."
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra..."


## Cleaning data from tmdb metadata df

In [21]:
df_meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
# check for duplicates in the id
# df_meta.id.nunique()

There are 30 duplicates for id in the meta table.

In [23]:
# remove duplicates
# df_meta.drop_duplicates(subset="id", inplace=True)

## Selecting features from the dataframe, that are suitable for computing content-based similarity

For the calculation of the cosine similarity are following features considered:
+ title
+ overview (description of the movie), 
+ genre
+ actors
+ release year
+ director
+ writer
+ keywords


In [24]:
df_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [25]:
# make a new datafame with only desired features
meta = df_meta[['genres','id', 'overview', 'release_date', 'tagline', 'title']]
meta.head()

Unnamed: 0,genres,id,overview,release_date,tagline,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,1995-12-15,Roll the dice and unleash the excitement!,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,1995-12-22,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Friends are the people who let you be yourself...,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,1995-02-10,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


In [26]:
#apply literal_eval from Abstract Syntax Trees (ast) to convert str into a list of dictionaries
meta['genres'] = meta['genres'].apply(literal_eval)

# add new columns to features data frame
meta['genres'] = pd.Series(get_lists("name", "genres", meta))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['genres'] = meta['genres'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['genres'] = pd.Series(get_lists("name", "genres", meta))


### Extract the release year

In [27]:
# convert release date to datetime object
meta["release_date"] = pd.to_datetime(meta["release_date"], errors="coerce")

# extract the year
meta["release_date"] = (pd.DatetimeIndex(meta["release_date"]).year)

# cast the year from float into integer
meta["release_date"] = meta["release_date"].astype("Int64")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = pd.to_datetime(meta["release_date"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = (pd.DatetimeIndex(meta["release_date"]).year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = meta["release_date"].astype("Int64")

In [28]:
# check the dataframe
meta.head()

Unnamed: 0,genres,id,overview,release_date,tagline,title
0,"[Animation, Comedy, Family]",862,"Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story
1,"[Adventure, Fantasy, Family]",8844,When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji
2,"[Romance, Comedy]",15602,A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,"[Comedy, Drama, Romance]",31357,"Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale
4,[Comedy],11862,Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


## Merge the metadata with the df_feature

In [29]:
# drop rows with incorrect tmdb id´s 
meta = meta.drop([19730, 29503, 35587])

# cast id in meta to int
meta["id"] = meta["id"].astype(int)

In [30]:
# Merging meta and df_feature table on tmdb id
df_features = df_features.merge(meta, on="id")
df_features.head()

Unnamed: 0,id,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


## Set movielens id as key for the feature dataframe
Since the recommendation lists from the collaborative filtering derives from the movielens dataset, the df_features has to have the movielens Id as index for the movies.

In [31]:
df_links = pd.read_csv("../data/ml-latest-small/links.csv")

In [32]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [33]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [34]:
#check for duplicates
df_links.movieId.nunique()

9742

In [35]:
#check for duplicates
df_links.tmdbId.nunique()

9733

There are 8 duplicates in movieId and 9 Duplicates in the tmdbId. The imdbId is not of interest here and will not be used further.

In [36]:
# remove the duplicates
df_links.drop_duplicates(subset="movieId", inplace=True)
df_links.drop_duplicates(subset="tmdbId", inplace=True)
df_links.shape

(9734, 3)

In [37]:
# check for null values
df_links["tmdbId"].isnull().sum()

1

In [38]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9734 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9734 non-null   int64  
 1   imdbId   9734 non-null   int64  
 2   tmdbId   9733 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 304.2 KB


In [39]:
# drop the null values
df_links.dropna(inplace=True)

In [40]:
# rename the id in the df_features to tmdbId for merging
df_features.rename(columns={"id":"tmdbId"}, inplace=True)

In [41]:
# cast the tmdbId into integer for merging
df_links["tmdbId"] = df_links["tmdbId"].astype("Int64")
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [42]:
# merge the feature dataframe with the link table from movielens 
df_features = df_features.merge(df_links,on="tmdbId")
df_features.head()

Unnamed: 0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,movieId,imdbId
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story,1,114709
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji,2,113497
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,3,113228
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,4,114885
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5,113041


In [43]:
# check for duplicates in movieId
df_features.movieId.duplicated().sum()

114

In [44]:
# remove duplicates
df_features.drop_duplicates(subset="movieId", inplace=True)

In [45]:
# set the movieId as index
df_features = df_features.set_index("movieId")
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story,114709
2,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji,113497
3,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228
4,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885
5,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041


In [46]:
df_features.tail()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
175705,7014,"[Michel Piccoli, Miou-Miou, Béatrice Romand, F...",[absurdism],"[Sound Designer, Director, Novel, Producer, Pr...",[Comedy],"Made without proper language, just gibberish a...",1973,,Themroc,69369
175707,411516,[Brunhilde Pomsel],[],"[Director, Director, Director, Director, Write...",[Documentary],"Brunhilde Pomsel describes herself as an ""apol...",2016,,A German Life,5135434
175743,433410,"[Julian Radlmaier, Deragh Campbell, Beniamin F...",[],"[Director, Writer]",[Comedy],A bourgeois dog confesses how he was transform...,2017,,Self-criticism of a Bourgeois Dog,6354108
175781,28469,[Helmut Qualtinger],[],[],[Comedy],No overview found.,1961,,Der Herr Karl,273646
176051,460135,"[Grey Griffin, Tara Strong, Anais Fairweather,...","[superhero, lego]","[Producer, Executive Producer, Executive Produ...",[Animation],"When Supergirl, Wonder Woman, Batgirl, Bumbleb...",2017,,LEGO DC Super Hero Girls: Brain Drain,7158814


In [47]:
df_features.shape

(9543, 10)

## Fill missing values with empty space, so no error occur in the NLP

In [48]:
# check out missing values
df_features.isnull().sum()

tmdbId             0
actors             0
keywords           0
crew_jobs          0
genres             0
overview          18
release_date       2
tagline         2303
title              0
imdbId             0
dtype: int64

In [49]:
# replace missing values in text with space
df_features["tagline"] = df_features["tagline"].fillna(" ")
df_features["overview"] = df_features["overview"].fillna(" ")

In [50]:
# look for index of the missing values in release_date
rows_with_nan = [index for index, row in df_features.iterrows() if row.isnull().any()]
rows_with_nan

[86237, 171495]

In [51]:
# check out the rows with missing values
df_features.loc[86237]

tmdbId                                                     367647
actors                                                         []
keywords                                             [miniseries]
crew_jobs                                                      []
genres                                                         []
overview        Documentary  Follow James Burke through the hi...
release_date                                                 <NA>
tagline                                                          
title                                                 Connections
imdbId                                                      78588
Name: 86237, dtype: object

In [52]:
# check out the rows with missing values
df_features.loc[171495]

tmdbId                                                     409926
actors                                                         []
keywords                                                       []
crew_jobs                                                      []
genres                                                         []
overview        Astronomer Dr. Carl Sagan is host and narrator...
release_date                                                 <NA>
tagline                                                          
title                                                      Cosmos
imdbId                                                      81846
Name: 171495, dtype: object

## Stemming of the words in the description

In [53]:
# split the overview text into single words
df_features["overview"] = df_features["overview"].str.split()

In [54]:
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","[Led, by, Woody,, Andy's, toys, live, happily,...",1995,,Toy Story,114709
2,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]","[When, siblings, Judy, and, Peter, discover, a...",1995,Roll the dice and unleash the excitement!,Jumanji,113497
3,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]","[A, family, wedding, reignites, the, ancient, ...",1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228
4,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","[Cheated, on,, mistreated, and, stepped, on,, ...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885
5,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],"[Just, when, George, Banks, has, recovered, fr...",1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041


In [55]:
# Initializing stemmer and countvectorizer 
ps_stemmer = SnowballStemmer('english')#PorterStemmer()
#apply stemmer to pd.Series
df_features['stemmed_overview'] = df_features['overview'].apply(lambda x: [ps_stemmer.stem(y) for y in x])


In [56]:
# join the list of actors, keywords and genre
df_features["actors"] = df_features["actors"].str.join(" ")
df_features["keywords"] = df_features["keywords"].str.join(" ")
df_features["genres"] = df_features["genres"].str.join(" ")
df_features["stemmed_overview"] = df_features["stemmed_overview"].str.join(" ")
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId,stemmed_overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,862,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,jealousy toy boy friendship friends rivalry bo...,"[Director, Screenplay, Screenplay, Screenplay,...",Animation Comedy Family,"[Led, by, Woody,, Andy's, toys, live, happily,...",1995,,Toy Story,114709,"led by woody, andi toy live happili in his roo..."
2,8844,Robin Williams Jonathan Hyde Kirsten Dunst Bra...,board game disappearance based on children's b...,"[Executive Producer, Screenplay, Original Musi...",Adventure Fantasy Family,"[When, siblings, Judy, and, Peter, discover, a...",1995,Roll the dice and unleash the excitement!,Jumanji,113497,when sibl judi and peter discov an enchant boa...
3,15602,Walter Matthau Jack Lemmon Ann-Margret Sophia ...,fishing best friend duringcreditsstinger old men,"[Director, Characters, Writer, Sound Recordist]",Romance Comedy,"[A, family, wedding, reignites, the, ancient, ...",1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228,a famili wed reignit the ancient feud between ...
4,31357,Whitney Houston Angela Bassett Loretta Devine ...,based on novel interracial relationship single...,"[Director, Screenplay, Producer, Producer, Pro...",Comedy Drama Romance,"[Cheated, on,, mistreated, and, stepped, on,, ...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885,"cheat on, mistreat and step on, the women are ..."
5,11862,Steve Martin Diane Keaton Martin Short Kimberl...,baby midlife crisis confidence aging daughter ...,"[Original Music Composer, Director of Photogra...",Comedy,"[Just, when, George, Banks, has, recovered, fr...",1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041,just when georg bank has recov from his daught...


In [57]:
df_features[df_features.title.duplicated()]

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId,stemmed_overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
915,6620,Humphrey Bogart Audrey Hepburn William Holden ...,brother brother relationship chauffeur champag...,"[Director, Screenplay, Screenplay, Producer, D...",Comedy Drama Romance,"[Linus, and, David, Larrabee, are, the, two, s...",1954,...the chauffeur's daughter who learned her st...,Sabrina,47437,linus and david larrabe are the two son of a v...
1344,11349,Gregory Peck Robert Mitchum Polly Bergen Lori ...,poison small town boat ex-detainee psychopath ...,"[Director, Novel, Screenplay, Producer, Origin...",Drama Thriller,"[Sam, Bowden, witnesses, a, rape, committed, b...",1962,Now he had only one weapon left - murder! ...T...,Cape Fear,55824,sam bowden wit a rape commit by max cadi and t...
1726,9922,Kevin Costner Will Patton Olivia Williams Lare...,usa post postman army apocalypse,"[Screenplay, Producer, Original Music Composer...",Drama Adventure,"[In, 2013, there, are, no, highways,, no, I-wa...",1997,The year is 2013. One man walked in off the ho...,The Postman,119925,"in 2013 there are no highways, no i-ways, no d..."
1941,23383,Laurence Olivier Jean Simmons John Laurie Esmo...,shakespeare father murder,"[Director, Screenplay, Novel, Executive Producer]",Drama,"[Winner, of, four, Academy, Awards,, including...",1948,,Hamlet,40416,"winner of four academi awards, includ best pic..."
2059,9820,Lindsay Lohan Dennis Quaid Natasha Richardson ...,matchmaking divorced twins separated at birth ...,"[Original Music Composer, Production Design, D...",Comedy Drama Family,"[Hallie, Parker, and, Annie, James, are, ident...",1998,"Twice the Fun, Double the Trouble.",The Parent Trap,120783,halli parker and anni jame are ident twin sepa...
...,...,...,...,...,...,...,...,...,...,...,...
169982,305470,Dacre Montgomery Naomi Scott RJ Cyler Becky G ...,spaceship based on tv series tokusatsu superhe...,"[Producer, Producer, Producer, Producer, Produ...",Action Adventure Science Fiction,"[Saban's, Power, Rangers, follows, five, ordin...",2017,Together we are more,Power Rangers,3717490,saban power ranger follow five ordinari teen w...
170827,282035,Tom Cruise Russell Crowe Annabelle Wallis Sofi...,monster mummy horror,"[Casting, Executive Producer, Executive Produc...",Thriller Action Adventure,"[Though, safely, entombed, in, a, crypt, deep,...",2017,Welcome To A New World of Gods And Monsters,The Mummy,2345759,though safe entomb in a crypt deep beneath the...
172253,38966,Keanu Reeves Lori Loughlin Theresa Saldana Tri...,handcuffs amnesia tied up,"[Director, Writer, Camera Operator]",Comedy,"[The, Night, Before, is, a, 1988, film, starri...",1988,You lost your father's car. Sold your prom dat...,The Night Before,95730,the night befor is a 1988 film star keanu reev...
173873,26787,Ned Beatty Mary Steenburgen Ted Danson James F...,horse fairy tale miniseries giant,"[Writer, Director]",Adventure Family Fantasy,"[Gulliver, washes, ashore, on, Lilliput, and, ...",1996,The Classic MiniSeries based on Jonathan Swift...,Gulliver's Travels,115195,gulliv wash ashor on lilliput and attempt to p...


In [58]:
df_features.index.nunique()

9543

## Export the feature dataframe

In [59]:
# export new data to csv. file
df_features.to_csv('../data/df_features.csv')

## Make a combined feature of texts


In [60]:
#def combined_features(row):
#    return " "+row["stemmed_overview"]+" "+row['keywords']+" "+row['actors'] + " "+row['genres']#+" "+row["tagline"]+" "+row['genres']+" "+row["title"]#++" "+row['actors']

#df_features["combined_features"] = df_features.apply(combined_features, axis =1)
df_features["combined_features"] = df_features.stemmed_overview + df_features.genres + df_features.keywords

In [61]:
# check out an example of a stemmed overview (Toy Story)
df_features.combined_features[1]

'led by woody, andi toy live happili in his room until andi birthday bring buzz lightyear onto the scene. afraid of lose his place in andi heart, woodi plot against buzz. but when circumst separ buzz and woodi from their owner, the duo eventu learn to put asid their differences.Animation Comedy Familyjealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

## Generate a matrix of cosine similarity of the features

In [63]:
#use CountVectorizer for NLP
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

count_matrix = count.fit_transform(df_features['combined_features'].values.astype('U'))

#calculate similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

### Make sample recommendations

In [64]:
def get_cb_recommendations(title, df_features, cosine_sim):
    ''' compares cosine similarity between movies and ranks the movies according to the score
    -----
    parameter
    -----
    returns a list of movie titles for recommendation

    '''
    df_features = df_features.reset_index()
    titles = df_features['title']
    indices = pd.Series(df_features.index, index=df_features['title'])
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [65]:
df_features.tail()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId,stemmed_overview,combined_features
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
175705,7014,Michel Piccoli Miou-Miou Béatrice Romand Franc...,absurdism,"[Sound Designer, Director, Novel, Producer, Pr...",Comedy,"[Made, without, proper, language,, just, gibbe...",1973,,Themroc,69369,"made without proper language, just gibberish a...","made without proper language, just gibberish a..."
175707,411516,Brunhilde Pomsel,,"[Director, Director, Director, Director, Write...",Documentary,"[Brunhilde, Pomsel, describes, herself, as, an...",2016,,A German Life,5135434,"brunhild pomsel describ herself as an ""apolit ...","brunhild pomsel describ herself as an ""apolit ..."
175743,433410,Julian Radlmaier Deragh Campbell Beniamin Fort...,,"[Director, Writer]",Comedy,"[A, bourgeois, dog, confesses, how, he, was, t...",2017,,Self-criticism of a Bourgeois Dog,6354108,a bourgeoi dog confess how he was transform fr...,a bourgeoi dog confess how he was transform fr...
175781,28469,Helmut Qualtinger,,[],Comedy,"[No, overview, found.]",1961,,Der Herr Karl,273646,no overview found.,no overview found.Comedy
176051,460135,Grey Griffin Tara Strong Anais Fairweather Tea...,superhero lego,"[Producer, Executive Producer, Executive Produ...",Animation,"[When, Supergirl,, Wonder, Woman,, Batgirl,, B...",2017,,LEGO DC Super Hero Girls: Brain Drain,7158814,"when supergirl, wonder woman, batgirl, bumbleb...","when supergirl, wonder woman, batgirl, bumbleb..."


In [66]:
# get a sample recommendation to check the result, seed item "Toy Story"
get_cb_recommendations("Toy Story", df_features, cosine_sim)

2351                   Toy Story 2
7332                   Toy Story 3
1398                Small Soldiers
1463                  Child's Play
53      The Indian in the Cupboard
5955        The 40 Year Old Virgin
1671                          Toys
1465                Child's Play 3
1464                Child's Play 2
7892                           Ted
Name: title, dtype: object

In [67]:
get_cb_recommendations("Batman Begins", df_features, cosine_sim)

6689                            The Dark Knight
509                                      Batman
1171                             Batman & Robin
7741                      The Dark Knight Rises
1058                             Batman Returns
7357                 Batman: Under the Red Hood
8004    Batman: The Dark Knight Returns, Part 1
4945                           Enter the Dragon
8792                           Batman vs. Robin
8926         Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [68]:
get_cb_recommendations("Interstellar", df_features, cosine_sim)

4095                         Solaris
1902              Planet of the Apes
705            2001: A Space Odyssey
9391                      Passengers
2503                 Mission to Mars
9387    Rogue One: A Star Wars Story
3840                  Silent Running
8216                      About Time
3407              Planet of the Apes
2708                       Moonraker
Name: title, dtype: object

## Save the cosine similarity array as an dataframe

In [69]:
# save the matrix in a dataframe
df_cosine_sim = pd.DataFrame(cosine_sim)
df_cosine_sim.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9533,9534,9535,9536,9537,9538,9539,9540,9541,9542
0,1.0,0.0339,0.010186,0.010879,0.0,0.0,0.012142,0.046066,0.0,0.0,...,0.015958,0.0,0.016464,0.0,0.018334,0.014679,0.008266,0.025425,0.052926,0.0
1,0.0339,1.0,0.020544,0.0,0.010148,0.030094,0.0,0.009292,0.100504,0.009387,...,0.0,0.042701,0.0,0.0,0.0,0.0,0.008336,0.0,0.0,0.0
2,0.010186,0.020544,1.0,0.013186,0.0,0.0,0.0,0.011167,0.0,0.011282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.010879,0.0,0.013186,1.0,0.026053,0.019316,0.031439,0.023855,0.0,0.0,...,0.020659,0.0,0.04263,0.01472,0.023736,0.019004,0.010701,0.032915,0.068519,0.013027
4,0.0,0.010148,0.0,0.026053,1.0,0.017865,0.014539,0.0,0.073432,0.0,...,0.0,0.0,0.019714,0.0,0.0,0.017576,0.0,0.0,0.0,0.024096
5,0.0,0.030094,0.0,0.019316,0.017865,1.0,0.0,0.049073,0.047637,0.008263,...,0.0,0.084571,0.0,0.111032,0.0,0.013031,0.0,0.0,0.0,0.008932
6,0.012142,0.0,0.0,0.031439,0.014539,0.0,1.0,0.0,0.011076,0.0,...,0.023057,0.0,0.047579,0.0,0.026491,0.0,0.0,0.036736,0.076472,0.0
7,0.046066,0.009292,0.011167,0.023855,0.0,0.049073,0.0,1.0,0.033618,0.020409,...,0.052486,0.069631,0.0,0.012466,0.0,0.0,0.0,0.0,0.0,0.022063
8,0.0,0.100504,0.0,0.0,0.073432,0.047637,0.011076,0.033618,1.0,0.025472,...,0.029114,0.057937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036716
9,0.0,0.009387,0.011282,0.0,0.0,0.008263,0.0,0.020409,0.025472,1.0,...,0.0,0.011724,0.0,0.012594,0.0,0.0,0.0,0.0,0.0,0.02229


In [70]:
df_cosine_sim[560:570]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9533,9534,9535,9536,9537,9538,9539,9540,9541,9542
560,0.0,0.024932,0.0,0.016003,0.0,0.021946,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029601
561,0.011934,0.012036,0.028931,0.030901,0.0,0.074162,0.0,0.039253,0.0,0.0,...,0.0,0.045099,0.0,0.064592,0.0,0.020847,0.023477,0.0,0.0,0.0
562,0.0,0.0,0.0,0.038007,0.017576,0.013031,0.021209,0.04828,0.013391,0.0,...,0.027875,0.0,0.02876,0.019861,0.0,0.0,0.014438,0.0,0.0,0.0
563,0.0,0.011648,0.0,0.014952,0.013829,0.041011,0.0,0.012662,0.0,0.0,...,0.021932,0.029096,0.0,0.015627,0.025198,0.0,0.0,0.0,0.0,0.0
564,0.035284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.030817,0.017353,0.0,0.0,0.0
565,0.009213,0.009292,0.022334,0.011928,0.0,0.008179,0.013312,0.060606,0.033618,0.061228,...,0.069982,0.011605,0.018051,0.0,0.020101,0.0,0.0,0.027875,0.058026,0.022063
566,0.0,0.0,0.008399,0.017942,0.016595,0.018455,0.010013,0.037987,0.006321,0.0,...,0.026318,0.0,0.0,0.009376,0.0,0.0,0.006816,0.0,0.0,0.016595
567,0.009032,0.0,0.010948,0.035081,0.054077,0.016037,0.052204,0.0,0.00824,0.010005,...,0.017152,0.0,0.017697,0.0,0.019707,0.015778,0.008884,0.027328,0.056888,0.0
568,0.0,0.019383,0.0,0.0,0.0,0.017062,0.0,0.042143,0.07013,0.031931,...,0.0,0.036314,0.0,0.0,0.020966,0.0,0.009452,0.0,0.0,0.011506
569,0.0,0.0,0.021483,0.0,0.010611,0.039336,0.0,0.009716,0.024253,0.009816,...,0.0,0.011163,0.0,0.035973,0.0,0.0,0.0,0.0,0.0,0.042445


In [72]:
# replace the index and column names, which were serielly numbered, with the movie Ids of movielens
df_cosine_sim = df_cosine_sim.set_axis(df_features.index, axis=1)
df_cosine_sim = df_cosine_sim.set_axis(df_features.index, axis=0)

In [73]:
df_cosine_sim

movieId,1,2,3,4,5,6,7,8,9,10,...,175475,175569,175577,175585,175693,175705,175707,175743,175781,176051
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.033900,0.010186,0.010879,0.000000,0.000000,0.012142,0.046066,0.000000,0.000000,...,0.015958,0.000000,0.016464,0.000000,0.018334,0.014679,0.008266,0.025425,0.052926,0.000000
2,0.033900,1.000000,0.020544,0.000000,0.010148,0.030094,0.000000,0.009292,0.100504,0.009387,...,0.000000,0.042701,0.000000,0.000000,0.000000,0.000000,0.008336,0.000000,0.000000,0.000000
3,0.010186,0.020544,1.000000,0.013186,0.000000,0.000000,0.000000,0.011167,0.000000,0.011282,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.010879,0.000000,0.013186,1.000000,0.026053,0.019316,0.031439,0.023855,0.000000,0.000000,...,0.020659,0.000000,0.042630,0.014720,0.023736,0.019004,0.010701,0.032915,0.068519,0.013027
5,0.000000,0.010148,0.000000,0.026053,1.000000,0.017865,0.014539,0.000000,0.073432,0.000000,...,0.000000,0.000000,0.019714,0.000000,0.000000,0.017576,0.000000,0.000000,0.000000,0.024096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175705,0.014679,0.000000,0.000000,0.019004,0.017576,0.013031,0.000000,0.000000,0.000000,0.000000,...,0.027875,0.000000,0.000000,0.000000,0.000000,1.000000,0.014438,0.044412,0.000000,0.000000
175707,0.008266,0.008336,0.000000,0.010701,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.011184,0.000000,0.014438,1.000000,0.000000,0.000000,0.000000
175743,0.025425,0.000000,0.000000,0.032915,0.000000,0.000000,0.036736,0.000000,0.000000,0.000000,...,0.048280,0.000000,0.049814,0.000000,0.055470,0.044412,0.000000,1.000000,0.160128,0.000000
175781,0.052926,0.000000,0.000000,0.068519,0.000000,0.000000,0.076472,0.000000,0.000000,0.000000,...,0.100504,0.000000,0.103695,0.000000,0.115470,0.000000,0.000000,0.160128,1.000000,0.000000


In [74]:
# query a cosine similarity between two movies
df_cosine_sim.loc[1,175781]

0.052925612402496325

## In order to generate a .csv of the cosine similarity dataframe, unhash the code below

In [79]:
# export new data to csv. file
#df_cosine_sim.to_csv('../data/cos_sim_matrix.csv',index=True)

### Save data as .NPZ (compressed)
The generated .csv file is 1,2 GB, so far too big. The better way is to store it as a numpy array, because it is only needed in a .py script. Unhash the code below, if needed

In [80]:
# save numpy array as npz file
#from numpy import asarray
#from numpy import savez_compressed

# save to npy file
#savez_compressed('../data/cosine_sim.npz', cosine_sim)