# Generation of a movie feature dataframe to compute movie similarity

## Data from The Movies Database (tmdb)
 + Metadata
 + credits

In [None]:
# Installation of NLTK, needs to be installed in the environment before you run the notebook
#pip install nltk

In [1]:
import pandas as pd
import numpy as np


import scipy.sparse as sp
from typing import List

from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer, PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



## Function for recommendentions

In [2]:
def get_recommendations(title):
    ''' compares cosine similarity between movies and ranks the movies according to the score
    -----
    parameter
    -----
    returns a list of movie titles for recommendation

    '''
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [3]:
def _single_list_similarity(predicted: list, feature_df: pd.DataFrame, u: int) -> float:
    """
    Computes the intra-list similarity for a single list of recommendations.
    Parameters
    ----------
    predicted : a list
        Ordered predictions
        Example: ['X', 'Y', 'Z']
    feature_df: dataframe
        A dataframe with one hot encoded or latent features.
        The dataframe should be indexed by the id used in the recommendations.
    u:  User
    Returns:
    -------
    ils_single_user: float
        The intra-list similarity for a single list of recommendations.
    """
    # exception predicted list empty
    #if not(predicted):
    #    raise Exception('Predicted list is empty, index: {0}'.format(u))

    #get features for all recommended items
    recs_content = feature_df.loc[predicted]
    recs_content = recs_content.dropna()
    recs_content = sp.csr_matrix(recs_content.values)

    #calculate similarity scores for all items in list
    similarity = cosine_similarity(X=recs_content, dense_output=False)

    #get indicies for upper right triangle w/o diagonal
    upper_right = np.triu_indices(similarity.shape[0], k=1)

    #calculate average similarity score of all recommended items in list
    ils_single_user = np.mean(similarity[upper_right])
    return ils_single_user

# Construction of the feature dataframe for the similarity measurement

## Load data from tmdb dataset

In [4]:
df_meta = pd.read_csv("../data/tmdb/movies_metadata.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df_meta.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [7]:
df_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
# load credit and keyword table
df_credits = pd.read_csv("../data/tmdb/credits.csv")
df_keywords = pd.read_csv("../data/tmdb/keywords.csv")

In [9]:
df_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [11]:
df_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


## Cleaning data (tmdb credit and keyword df)

In [12]:
# Merging keywords and credits df
df_features = df_keywords.merge(df_credits, on="id")
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46496 entries, 0 to 46495
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46496 non-null  int64 
 1   keywords  46496 non-null  object
 2   cast      46496 non-null  object
 3   crew      46496 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


In [13]:
#apply literal_eval from Abstract Syntax Trees (ast) to convert str into a list of dictionaries
df_features['cast'] = df_features['cast'].apply(literal_eval)
df_features['crew'] = df_features['crew'].apply(literal_eval)
df_features['keywords'] = df_features['keywords'].apply(literal_eval)

In [14]:
#create function to get list of lists
def get_lists(key, column, df):
    x = []

    for i in range(df.shape[0]):
        values = [d[f"{key}"] for d in df[f"{column}"][i]]
        x.append(values)
    return x

In [15]:
# add new columns to features data frame
df_features["crew_jobs"] = pd.Series(get_lists("job", "crew", df_features))
df_features["actors"] = pd.Series(get_lists("name", "cast", df_features))
df_features["keywords"] = pd.Series(get_lists("name", "keywords", df_features))

In [16]:
# select only cleaned columns for the new dataset
df_features = df_features[["id", "actors", "keywords", "crew_jobs"]]
df_features.head()

Unnamed: 0,id,actors,keywords,crew_jobs
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,..."
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi..."
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]"
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro..."
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra..."


## Cleaning data from tmdb metadata df

In [17]:
df_meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Selecting features from the dataframe, that are suitable for computing content-based similarity

For the calculation of the cosine similarity are following features considered:
+ title
+ overview (description of the movie), 
+ genre
+ actors
+ release year
+ director
+ writer
+ keywords


In [18]:
df_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [19]:
# make a new datafame with only desired features
meta = df_meta[['genres','id', 'overview', 'release_date', 'tagline', 'title']]
meta.head()

Unnamed: 0,genres,id,overview,release_date,tagline,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,1995-12-15,Roll the dice and unleash the excitement!,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,1995-12-22,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Friends are the people who let you be yourself...,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,1995-02-10,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


In [20]:
#apply literal_eval from Abstract Syntax Trees (ast) to convert str into a list of dictionaries
meta['genres'] = meta['genres'].apply(literal_eval)

# add new columns to features data frame
meta['genres'] = pd.Series(get_lists("name", "genres", meta))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['genres'] = meta['genres'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['genres'] = pd.Series(get_lists("name", "genres", meta))


### Extract the release year

In [21]:
# convert release date to datetime object
meta["release_date"] = pd.to_datetime(meta["release_date"], errors="coerce")

# extract the year
meta["release_date"] = (pd.DatetimeIndex(meta["release_date"]).year)

# cast the year from float into integer
meta["release_date"] = meta["release_date"].astype("Int64")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = pd.to_datetime(meta["release_date"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = (pd.DatetimeIndex(meta["release_date"]).year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta["release_date"] = meta["release_date"].astype("Int64")

In [22]:
# check the dataframe
meta.head()

Unnamed: 0,genres,id,overview,release_date,tagline,title
0,"[Animation, Comedy, Family]",862,"Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story
1,"[Adventure, Fantasy, Family]",8844,When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji
2,"[Romance, Comedy]",15602,A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,"[Comedy, Drama, Romance]",31357,"Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale
4,[Comedy],11862,Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


## Merge the metadata with the df_feature

In [23]:
# drop rows with incorrect tmdb id´s 
meta = meta.drop([19730, 29503, 35587])

# cast id in meta to int
meta["id"] = meta["id"].astype(int)

In [24]:
# Merging meta and df_feature table on tmdb id
df_features = df_features.merge(meta, on="id")
df_features.head()

Unnamed: 0,id,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


## Set movielens id as key for the feature dataframe
Since the recommendation lists from the collaborative filtering derives from the movielens dataset, the df_features has to have the movielens Id as index for the movies.

In [25]:
df_links = pd.read_csv("../data/ml-latest-small/links.csv")

In [26]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [27]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [28]:
# check for null values
df_links["tmdbId"].isnull().sum()

8

In [29]:
# drop the null values
df_links.dropna(inplace=True)

In [30]:
# rename the id in the df_features to tmdbId for merging
df_features.rename(columns={"id":"tmdbId"}, inplace=True)

In [31]:
# cast the tmdbId into integer for merging
df_links["tmdbId"] = df_links["tmdbId"].astype("Int64")
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [32]:
# merge the feature dataframe with the link table from movielens 
df_features = df_features.merge(df_links,on="tmdbId")
df_features.head()

Unnamed: 0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,movieId,imdbId
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story,1,114709
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji,2,113497
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,3,113228
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,4,114885
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5,113041


In [33]:
# set the movieId as index
df_features = df_features.set_index("movieId")
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",1995,,Toy Story,114709
2,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,1995,Roll the dice and unleash the excitement!,Jumanji,113497
3,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]",A family wedding reignites the ancient feud be...,1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228
4,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885
5,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],Just when George Banks has recovered from his ...,1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041


## Fill missing values with empty space, so no error occur in the NLP

In [34]:
# check out missing values
df_features.isnull().sum()

tmdbId             0
actors             0
keywords           0
crew_jobs          0
genres             0
overview          19
release_date       2
tagline         2347
title              0
imdbId             0
dtype: int64

In [35]:
# replace missing values in text with space
df_features["tagline"] = df_features["tagline"].fillna(" ")
df_features["overview"] = df_features["overview"].fillna(" ")

In [36]:
# look for index of the missing values in release_date
rows_with_nan = [index for index, row in df_features.iterrows() if row.isnull().any()]
rows_with_nan

[86237, 171495]

In [37]:
# check out the rows with missing values
df_features.loc[86237]

tmdbId                                                     367647
actors                                                         []
keywords                                             [miniseries]
crew_jobs                                                      []
genres                                                         []
overview        Documentary  Follow James Burke through the hi...
release_date                                                 <NA>
tagline                                                          
title                                                 Connections
imdbId                                                      78588
Name: 86237, dtype: object

In [38]:
# check out the rows with missing values
df_features.loc[171495]

tmdbId                                                     409926
actors                                                         []
keywords                                                       []
crew_jobs                                                      []
genres                                                         []
overview        Astronomer Dr. Carl Sagan is host and narrator...
release_date                                                 <NA>
tagline                                                          
title                                                      Cosmos
imdbId                                                      81846
Name: 171495, dtype: object

## Stemming of the words in the description

In [39]:
# split the overview text into single words
df_features["overview"] = df_features["overview"].str.split()

In [40]:
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Director, Screenplay, Screenplay, Screenplay,...","[Animation, Comedy, Family]","[Led, by, Woody,, Andy's, toys, live, happily,...",1995,,Toy Story,114709
2,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, based on children'...","[Executive Producer, Screenplay, Original Musi...","[Adventure, Fantasy, Family]","[When, siblings, Judy, and, Peter, discover, a...",1995,Roll the dice and unleash the excitement!,Jumanji,113497
3,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[fishing, best friend, duringcreditsstinger, o...","[Director, Characters, Writer, Sound Recordist]","[Romance, Comedy]","[A, family, wedding, reignites, the, ancient, ...",1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228
4,31357,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Director, Screenplay, Producer, Producer, Pro...","[Comedy, Drama, Romance]","[Cheated, on,, mistreated, and, stepped, on,, ...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885
5,11862,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...","[Original Music Composer, Director of Photogra...",[Comedy],"[Just, when, George, Banks, has, recovered, fr...",1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041


In [41]:
# Initializing stemmer and countvectorizer 
ps_stemmer = SnowballStemmer('english')#PorterStemmer()
#apply stemmer to pd.Series
df_features['stemmed_overview'] = df_features['overview'].apply(lambda x: [ps_stemmer.stem(y) for y in x])
#df_features["stemmed_overview"]

In [42]:
# join the list of actors, keywords and genre
df_features["actors"] = df_features["actors"].str.join(" ")
df_features["keywords"] = df_features["keywords"].str.join(" ")
df_features["genres"] = df_features["genres"].str.join(" ")
df_features["stemmed_overview"] = df_features["stemmed_overview"].str.join(" ")
df_features.head()

Unnamed: 0_level_0,tmdbId,actors,keywords,crew_jobs,genres,overview,release_date,tagline,title,imdbId,stemmed_overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,862,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,jealousy toy boy friendship friends rivalry bo...,"[Director, Screenplay, Screenplay, Screenplay,...",Animation Comedy Family,"[Led, by, Woody,, Andy's, toys, live, happily,...",1995,,Toy Story,114709,"led by woody, andi toy live happili in his roo..."
2,8844,Robin Williams Jonathan Hyde Kirsten Dunst Bra...,board game disappearance based on children's b...,"[Executive Producer, Screenplay, Original Musi...",Adventure Fantasy Family,"[When, siblings, Judy, and, Peter, discover, a...",1995,Roll the dice and unleash the excitement!,Jumanji,113497,when sibl judi and peter discov an enchant boa...
3,15602,Walter Matthau Jack Lemmon Ann-Margret Sophia ...,fishing best friend duringcreditsstinger old men,"[Director, Characters, Writer, Sound Recordist]",Romance Comedy,"[A, family, wedding, reignites, the, ancient, ...",1995,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228,a famili wed reignit the ancient feud between ...
4,31357,Whitney Houston Angela Bassett Loretta Devine ...,based on novel interracial relationship single...,"[Director, Screenplay, Producer, Producer, Pro...",Comedy Drama Romance,"[Cheated, on,, mistreated, and, stepped, on,, ...",1995,Friends are the people who let you be yourself...,Waiting to Exhale,114885,"cheat on, mistreat and step on, the women are ..."
5,11862,Steve Martin Diane Keaton Martin Short Kimberl...,baby midlife crisis confidence aging daughter ...,"[Original Music Composer, Director of Photogra...",Comedy,"[Just, when, George, Banks, has, recovered, fr...",1995,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,113041,just when georg bank has recov from his daught...


In [43]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9665 entries, 1 to 176051
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tmdbId            9665 non-null   int64 
 1   actors            9665 non-null   object
 2   keywords          9665 non-null   object
 3   crew_jobs         9665 non-null   object
 4   genres            9665 non-null   object
 5   overview          9665 non-null   object
 6   release_date      9663 non-null   Int64 
 7   tagline           9665 non-null   object
 8   title             9665 non-null   object
 9   imdbId            9665 non-null   int64 
 10  stemmed_overview  9665 non-null   object
dtypes: Int64(1), int64(2), object(8)
memory usage: 1.1+ MB


## Make a combined feature of texts


In [44]:
#def combined_features(row):
#    return " "+row["stemmed_overview"]+" "+row['keywords']+" "+row['actors'] + " "+row['genres']#+" "+row["tagline"]+" "+row['genres']+" "+row["title"]#++" "+row['actors']

#df_features["combined_features"] = df_features.apply(combined_features, axis =1)
df_features["combined_features"] = df_features.stemmed_overview + df_features.genres + df_features.keywords

In [45]:
df_features.combined_features[1]

'led by woody, andi toy live happili in his room until andi birthday bring buzz lightyear onto the scene. afraid of lose his place in andi heart, woodi plot against buzz. but when circumst separ buzz and woodi from their owner, the duo eventu learn to put asid their differences.Animation Comedy Familyjealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

## Generate a matrix of cosine similarity of features

In [46]:
#use CountVectorizer for NLP
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

count_matrix = count.fit_transform(df_features['combined_features'].values.astype('U'))

#calculate similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [47]:
df_cosine_sim = pd.DataFrame(cosine_sim)
df_cosine_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9655,9656,9657,9658,9659,9660,9661,9662,9663,9664
0,1.000000,0.033900,0.010186,0.010879,0.000000,0.000000,0.012142,0.046066,0.000000,0.000000,...,0.015958,0.000000,0.016464,0.000000,0.018334,0.014679,0.008266,0.025425,0.052926,0.000000
1,0.033900,1.000000,0.020544,0.000000,0.010148,0.030094,0.000000,0.009292,0.100504,0.009387,...,0.000000,0.042701,0.000000,0.000000,0.000000,0.000000,0.008336,0.000000,0.000000,0.000000
2,0.010186,0.020544,1.000000,0.013186,0.000000,0.000000,0.000000,0.011167,0.000000,0.011282,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.010879,0.000000,0.013186,1.000000,0.026053,0.019316,0.031439,0.023855,0.000000,0.000000,...,0.020659,0.000000,0.042630,0.014720,0.023736,0.019004,0.010701,0.032915,0.068519,0.013027
4,0.000000,0.010148,0.000000,0.026053,1.000000,0.017865,0.014539,0.000000,0.073432,0.000000,...,0.000000,0.000000,0.019714,0.000000,0.000000,0.017576,0.000000,0.000000,0.000000,0.024096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9660,0.014679,0.000000,0.000000,0.019004,0.017576,0.013031,0.000000,0.000000,0.000000,0.000000,...,0.027875,0.000000,0.000000,0.000000,0.000000,1.000000,0.014438,0.044412,0.000000,0.000000
9661,0.008266,0.008336,0.000000,0.010701,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.011184,0.000000,0.014438,1.000000,0.000000,0.000000,0.000000
9662,0.025425,0.000000,0.000000,0.032915,0.000000,0.000000,0.036736,0.000000,0.000000,0.000000,...,0.048280,0.000000,0.049814,0.000000,0.055470,0.044412,0.000000,1.000000,0.160128,0.000000
9663,0.052926,0.000000,0.000000,0.068519,0.000000,0.000000,0.076472,0.000000,0.000000,0.000000,...,0.100504,0.000000,0.103695,0.000000,0.115470,0.000000,0.000000,0.160128,1.000000,0.000000


In [None]:
#define titles series
titles = df_features['title']

#define indices series
indices = pd.Series(df_features.index, index=df_features['title'])

In [None]:
indices.shape

(9665,)

In [None]:
get_recommendations("Toy Story")[:10]

movieId
42725               Grandma's Boy
98122       Indie Game: The Movie
135137                     Pixels
44731                  Stay Alive
26985                     Nirvana
2600                     eXistenZ
97913              Wreck-It Ralph
3997           Dungeons & Dragons
8633         The Last Starfighter
6566      Spy Kids 3-D: Game Over
Name: title, dtype: object

## Although I used the same methods (snowball stemmer, countvectorizer) as Alex, I get different recommendations (so different cosine similarities). My results are not that close with the content as the ones from Alex

+ Problem 1: I didn´t include the director
+ Problem 2: I didn´t include the writer
+ Problem 3: I get different stemms with the same method... NO IDEA WHY

Suggestion: We use the cos_sim table in alex notebook. 

Next problem:

In Alex notebook (4_content_based_recommender_2) the movieIDs don´t match the movieIDs in the ml-latest-small/ratings.csv. We have to find a solution for that.



## NEXT: Rename the rows and columns with the right movieIDs

In [None]:
movie_user_likes = "Toy Story"
def get_index_from_title(title):
    return df_features[df_features.title == title].index.values[0]

movie_index = get_index_from_title(movie_user_likes)
movie_index

1

In [50]:
#similar_movies = list(enumerate(cosine_sim[indices]))#movie_index]))
#does not work
#IndexError: index 144606 is out of bounds for axis 0 with size 9665


In [53]:
#sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
#sorted_similar_movies[:10]

def get_title_from_index(index):
    return df_features[df_features.index == index]["title"].values[0]
    
i=0
for movie in sorted_similar_movies:
   print(get_title_from_index(df_features[0]))
   i=i+1
   if i>15:
    break
#print(get_title_from_index(similar_movies))