# Content Based Recommender System

### Objective

To create a movie recommender system that uses the content/features of the film/entity, then calculates the similarities between one and the other so that when we point to one film, we will get several other films that have similarities with that film.

For instance, by comparing the similarities between the existing plots and the existing genres, when the audience prefers Narnia films, this content based recommender system will also recommend films such as Harry Potter or The Lords of The Rings which have similar genres.

### Data Preparation

In [1]:
#import library
import pandas as pd
import numpy as np

# dataset
movie_rating_df = pd.read_csv('https://storage.googleapis.com/dqlab-dataset/movie_rating_df.csv')

In [2]:
pd.set_option('display.max_columns', None)

print(movie_rating_df.head())
print(movie_rating_df.info())

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

   isAdult  startYear  endYear  runtimeMinutes                    genres  \
0        0     1894.0      NaN             1.0         Documentary,Short   
1        0     1892.0      NaN             5.0           Animation,Short   
2        0     1892.0      NaN             4.0  Animation,Comedy,Romance   
3        0     1892.0      NaN            12.0           Animation,Short   
4        0     1893.0      NaN             1.0              Comedy,Short   

   averageRating  numVotes  
0            5.6      1608  
1            6.0       197  
2          

In [3]:
# directors and writers data
director_writers = pd.read_csv('https://storage.googleapis.com/dqlab-dataset/directors_writers.csv')

# convert to list
director_writers['director_name'] = director_writers['director_name'].apply(lambda row: row.split(','))
director_writers['writer_name'] = director_writers['writer_name'].apply(lambda row: row.split(','))

print(director_writers.head())

      tconst                      director_name  \
0  tt0011414                   [David Kirkland]   
1  tt0011890                [Roy William Neill]   
2  tt0014341  [Buster Keaton, John G. Blystone]   
3  tt0018054                 [Cecil B. DeMille]   
4  tt0024151                      [James Cruze]   

                                         writer_name  
0                         [John Emerson, Anita Loos]  
1   [Arthur F. Goodrich, Burns Mantle, Mary Murillo]  
2  [Jean C. Havez, Clyde Bruckman, Joseph A. Mitc...  
3                                [Jeanie Macpherson]  
4               [Max Miller, Wells Root, Jack Jevne]  


In [4]:
director_writers

Unnamed: 0,tconst,director_name,writer_name
0,tt0011414,[David Kirkland],"[John Emerson, Anita Loos]"
1,tt0011890,[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,tt0014341,"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,tt0018054,[Cecil B. DeMille],[Jeanie Macpherson]
4,tt0024151,[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"
...,...,...,...
981,tt9236688,[Kai Wessel],[Christian Jeltsch]
982,tt9278408,[Bahadir Ince],"[Levent Cantek, Ali Demirel, Baris Erdogan]"
983,tt9285882,[Rapman],[Rapman]
984,tt9310372,[Sujoy Ghosh],"[Sujoy Ghosh, Raj Vasant, Pratim D. Gupta, Sur..."


In [5]:
# actors data
name_df = pd.read_csv('https://storage.googleapis.com/dqlab-dataset/actor_name.csv')

# filter unnecessary columns
name_df = name_df[['nconst','primaryName','knownForTitles']]

print(name_df.head())

       nconst          primaryName                           knownForTitles
0   nm1774132    Nathan McLaughlin  tt0417686,tt1713976,tt1891860,tt0454839
1  nm10683464        Bridge Andrew                                tt7718088
2   nm1021485    Brandon Fransvaag                                tt0168790
3   nm6940929   Erwin van der Lely                                tt4232168
4   nm5764974  Svetlana Shypitsyna                                tt3014168


In [6]:
# checking number of film variation starred by actor
print(name_df['knownForTitles'].apply(lambda x: len(x.split(','))).unique())

# convert knownForTitles to list
name_df['knownForTitles'] = name_df['knownForTitles'].apply(lambda x: x.split(','))

print(name_df.head())

[4 1 2 3]
       nconst          primaryName  \
0   nm1774132    Nathan McLaughlin   
1  nm10683464        Bridge Andrew   
2   nm1021485    Brandon Fransvaag   
3   nm6940929   Erwin van der Lely   
4   nm5764974  Svetlana Shypitsyna   

                                 knownForTitles  
0  [tt0417686, tt1713976, tt1891860, tt0454839]  
1                                   [tt7718088]  
2                                   [tt0168790]  
3                                   [tt4232168]  
4                                   [tt3014168]  


In [7]:
# One to One Correspondence

# create empty bucket list for dataframe
df_uni = []

for x in ['knownForTitles']:
    # repeat every row index for every element of knownForTitles column
    idx = name_df.index.repeat(name_df['knownForTitles'].str.len())
   
    # split values in every row and concat them in a dataframe
    df1 = pd.DataFrame({
        x: np.concatenate(name_df[x].values)
    })
    
    # replace index dataframe to idx
    df1.index = idx
    # append every dataframe into bucket
    df_uni.append(df1)
    
# concat all dataframes
df_concat = pd.concat(df_uni, axis=1)

# left join with value in name_df dataframe
unnested_df = df_concat.join(name_df.drop(['knownForTitles'], 1), how='left')

# select column based on the original dataframe (name_df)
unnested_df = unnested_df[name_df.columns.tolist()]
print(unnested_df)

         nconst        primaryName knownForTitles
0     nm1774132  Nathan McLaughlin      tt0417686
0     nm1774132  Nathan McLaughlin      tt1713976
0     nm1774132  Nathan McLaughlin      tt1891860
0     nm1774132  Nathan McLaughlin      tt0454839
1    nm10683464      Bridge Andrew      tt7718088
..          ...                ...            ...
998   nm5245804      Eliza Jenkins      tt1464058
999   nm0948460         Greg Yolen      tt0436869
999   nm0948460         Greg Yolen      tt0476663
999   nm0948460         Greg Yolen      tt0109723
999   nm0948460         Greg Yolen      tt0364484

[1918 rows x 3 columns]


  unnested_df = df_concat.join(name_df.drop(['knownForTitles'], 1), how='left')


In [8]:
# Grouping primaryName into a list group by knownForTitles

unnested_drop = unnested_df.drop(['nconst'], axis=1)

# create empty bucket for dataframe
df_uni = []

for col in ['primaryName']:
    # agregation
    dfi = unnested_drop.groupby(['knownForTitles'])[col].apply(list)
    # append
    df_uni.append(dfi)
df_grouped = pd.concat(df_uni, axis=1).reset_index()
df_grouped.columns = ['knownForTitles','cast_name']
print(df_grouped)

     knownForTitles           cast_name
0         tt0008125    [Charles Harley]
1         tt0009706    [Charles Harley]
2         tt0010304  [Natalie Talmadge]
3         tt0011414  [Natalie Talmadge]
4         tt0011890  [Natalie Talmadge]
...             ...                 ...
1893      tt9610496  [Stefano Baffetti]
1894      tt9714030        [Kevin Kain]
1895      tt9741820   [Caroline Plyler]
1896      tt9759814     [Ethan Francis]
1897      tt9856236     [Nuala Maguire]

[1898 rows x 2 columns]


In [9]:
#join movie table and cast table 
base_df = pd.merge(df_grouped, movie_rating_df, left_on='knownForTitles', right_on='tconst', how='inner')

#join base_df and director_writer table
base_df = pd.merge(base_df, director_writers, left_on='tconst', right_on='tconst', how='left')
print(base_df.head())

  knownForTitles           cast_name     tconst titleType  \
0      tt0011414  [Natalie Talmadge]  tt0011414     movie   
1      tt0011890  [Natalie Talmadge]  tt0011890     movie   
2      tt0014341  [Natalie Talmadge]  tt0014341     movie   
3      tt0018054     [Reeka Roberts]  tt0018054     movie   
4      tt0024151     [James Hackett]  tt0024151     movie   

             primaryTitle           originalTitle  isAdult  startYear  \
0         The Love Expert         The Love Expert        0     1920.0   
1               Yes or No               Yes or No        0     1920.0   
2         Our Hospitality         Our Hospitality        0     1923.0   
3       The King of Kings       The King of Kings        0     1927.0   
4  I Cover the Waterfront  I Cover the Waterfront        0     1933.0   

   endYear  runtimeMinutes                   genres  averageRating  numVotes  \
0      NaN            60.0           Comedy,Romance            4.9       136   
1      NaN            72.0        

### Data Cleaning

In [10]:
# drop knownForTitles column
base_drop = base_df.drop(['knownForTitles'], axis=1)
print(base_drop.info())

# replace NULL values in genres column to 'Unknown'
base_drop['genres'] = base_drop['genres'].fillna('Unknown')

# calculate number of null values in every column
print(base_drop.isnull().sum())

# replace NULL values in dorector_name & writer_name column to 'Unknown'
base_drop[['director_name','writer_name']] = base_drop[['director_name','writer_name']].fillna('unknown')

# create list of list genre column multiple values
base_drop['genres'] = base_drop['genres'].apply(lambda x: x.split(','))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1059
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cast_name       1060 non-null   object 
 1   tconst          1060 non-null   object 
 2   titleType       1060 non-null   object 
 3   primaryTitle    1060 non-null   object 
 4   originalTitle   1060 non-null   object 
 5   isAdult         1060 non-null   int64  
 6   startYear       1060 non-null   float64
 7   endYear         110 non-null    float64
 8   runtimeMinutes  1060 non-null   float64
 9   genres          745 non-null    object 
 10  averageRating   1060 non-null   float64
 11  numVotes        1060 non-null   int64  
 12  director_name   986 non-null    object 
 13  writer_name     986 non-null    object 
dtypes: float64(4), int64(2), object(8)
memory usage: 124.2+ KB
None
cast_name           0
tconst              0
titleType           0
primaryTitle        0
originalTitle   

In [11]:
#Drop tconst, isAdult, endYear, originalTitle column
base_drop2 = base_drop.drop(['tconst','isAdult','endYear','originalTitle'], axis=1)

base_drop2 = base_drop2[['primaryTitle','titleType','startYear','runtimeMinutes','genres','averageRating','numVotes','cast_name','director_name','writer_name']]

# Rename columns
base_drop2.columns = ['title','type','start','duration','genres','rating','votes','cast_name','director_name','writer_name']
print(base_drop2.head())

                    title   type   start  duration  \
0         The Love Expert  movie  1920.0      60.0   
1               Yes or No  movie  1920.0      72.0   
2         Our Hospitality  movie  1923.0      65.0   
3       The King of Kings  movie  1927.0     155.0   
4  I Cover the Waterfront  movie  1933.0      80.0   

                        genres  rating  votes           cast_name  \
0            [Comedy, Romance]     4.9    136  [Natalie Talmadge]   
1                    [Unknown]     6.3      7  [Natalie Talmadge]   
2  [Comedy, Romance, Thriller]     7.8   9621  [Natalie Talmadge]   
3  [Biography, Drama, History]     7.3   1826     [Reeka Roberts]   
4             [Drama, Romance]     6.3    455     [James Hackett]   

                       director_name  \
0                   [David Kirkland]   
1                [Roy William Neill]   
2  [Buster Keaton, John G. Blystone]   
3                 [Cecil B. DeMille]   
4                      [James Cruze]   

                   

In [12]:
# define classification features (title, cast_name, genres, director_name, writer_name)
feature_df = base_drop2[['title','cast_name','genres','director_name','writer_name']]

print(feature_df.head())

                    title           cast_name                       genres  \
0         The Love Expert  [Natalie Talmadge]            [Comedy, Romance]   
1               Yes or No  [Natalie Talmadge]                    [Unknown]   
2         Our Hospitality  [Natalie Talmadge]  [Comedy, Romance, Thriller]   
3       The King of Kings     [Reeka Roberts]  [Biography, Drama, History]   
4  I Cover the Waterfront     [James Hackett]             [Drama, Romance]   

                       director_name  \
0                   [David Kirkland]   
1                [Roy William Neill]   
2  [Buster Keaton, John G. Blystone]   
3                 [Cecil B. DeMille]   
4                      [James Cruze]   

                                         writer_name  
0                         [John Emerson, Anita Loos]  
1   [Arthur F. Goodrich, Burns Mantle, Mary Murillo]  
2  [Jean C. Havez, Clyde Bruckman, Joseph A. Mitc...  
3                                [Jeanie Macpherson]  
4              

In [13]:
# Create function to strip spaces

def strip_spaces(x):
    try:
        # if cell contains a list
        if isinstance(x, list):
            return [i.replace(' ','').lower() for i in x]
        # if cell contains a string
        else:
            return [x.replace(' ','').lower()]
    except:
        print(x)
        
# feature columns    
feature_cols = ['cast_name','genres','writer_name','director_name']

# Apply function strip_spaces
for col in feature_cols:
    feature_df[col] = feature_df[col].apply(strip_spaces)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df[col] = feature_df[col].apply(strip_spaces)


In [14]:
# create function to join all features into sentence

def soup_feature(x):
    return ' '.join(x['cast_name']) + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['director_name']) + ' ' + ' '.join(x['writer_name'])

# applying function
feature_df['soup'] = feature_df.apply(soup_feature, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df['soup'] = feature_df.apply(soup_feature, axis=1)


In [16]:
#import CountVectorizer 
from sklearn.feature_extraction.text import CountVectorizer

#define CountVectorizer and convert soup into vector form
count = CountVectorizer(stop_words= 'english')
count_matrix = count.fit_transform(feature_df['soup'])

print(count)
print(count_matrix.shape)

CountVectorizer(stop_words='english')
(1060, 10026)


In [17]:
# Create Model Similarity using Cosine similarity

#Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# apply cosine_similarity between count_matrix 
cosine_sim = cosine_similarity(count_matrix, count_matrix)

#print output
print(cosine_sim)

[[1.         0.15430335 0.35355339 ... 0.         0.         0.13608276]
 [0.15430335 1.         0.10910895 ... 0.         0.         0.        ]
 [0.35355339 0.10910895 1.         ... 0.         0.08703883 0.09622504]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.08703883 ... 0.         1.         0.10050378]
 [0.13608276 0.         0.09622504 ... 0.         0.10050378 1.        ]]


In [22]:
# Create content based recommender system function

indices = pd.Series(feature_df.index, index=feature_df['title']).drop_duplicates()

def content_recommender(title):
    # get the index of film title (title) mentioned
    idx = indices[title]

    # create list from array similarity cosine_sim 
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort film by highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the list of title (item 2-11)
    sim_scores = sim_scores[1:11]

    # get the index of the title film in sim_scores
    movie_indices = [i[0] for i in sim_scores]

    # calling base_df data based on movie_indices index
    return base_df.iloc[movie_indices]

In [24]:
# Example

#applying function 
content_recommender('The Lion King')

Unnamed: 0,knownForTitles,cast_name,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director_name,writer_name
848,tt3040964,[Cristina Carrión Márquez],tt3040964,movie,The Jungle Book,The Jungle Book,0,2016.0,,106.0,"Adventure,Drama,Family",7.4,250994,[Jon Favreau],"[Justin Marks, Rudyard Kipling]"
383,tt0286336,[Francisco Bretas],tt0286336,tvSeries,The Animals of Farthing Wood,The Animals of Farthing Wood,0,1993.0,1995.0,25.0,"Adventure,Animation,Drama",8.3,3057,"[Elphin Lloyd-Jones, Philippe Leclerc]","[Valerie Georgeson, Colin Dann, Jenny McDade, ..."
1002,tt7222086,[Hiroki Matsukawa],tt7222086,tvSeries,Made in Abyss,Made in Abyss,0,2017.0,,325.0,"Adventure,Animation,Drama",8.4,4577,"[Masayuki Kojima, Hitoshi Haga, Shinya Iino, T...","[Akihito Tsukushi, Keigo Koyanagi, Hideyuki Ku..."
73,tt0075147,[Joaquín Parra],tt0075147,movie,Robin and Marian,Robin and Marian,0,1976.0,,106.0,"Adventure,Drama,Romance",6.5,10830,[Richard Lester],[James Goldman]
232,tt0119051,[Chris Kosloski],tt0119051,movie,The Edge,The Edge,0,1997.0,,117.0,"Action,Adventure,Drama",6.9,65673,[Lee Tamahori],[David Mamet]
556,tt10068158,[Hiroki Matsukawa],tt10068158,movie,Made in Abyss: Journey's Dawn,Made in Abyss: Tabidachi no Yoake,0,2019.0,,139.0,"Adventure,Animation,Fantasy",7.4,81,[Masayuki Kojima],[Akihito Tsukushi]
9,tt0028657,[Bernard Loftus],tt0028657,movie,Boss of Lonely Valley,Boss of Lonely Valley,0,1937.0,,60.0,"Action,Adventure,Drama",6.2,41,[Ray Taylor],"[Frances Guihan, Forrest Brown]"
191,tt0107875,[Simon Mayal],tt0107875,movie,The Princess and the Goblin,The Princess and the Goblin,0,1991.0,,82.0,"Adventure,Animation,Comedy",6.8,2350,[József Gémes],"[Robin Lyons, George MacDonald]"
803,tt2356464,[Sina Müller],tt2356464,movie,Ostwind,Ostwind,0,2013.0,,101.0,"Adventure,Drama,Family",6.8,1350,[Katja von Garnier],"[Kristina Magdalena Henn, Lea Schmidbauer]"
983,tt6270328,[Jo Boag],tt6270328,tvSeries,The Skinner Boys: Guardians of the Lost Secrets,The Skinner Boys: Guardians of the Lost Secrets,0,2014.0,,23.0,"Adventure,Animation,Drama",7.8,12,"[Pablo De La Torre, Eugene Linkov, Jo Boag]","[David Witt, John Derevlany, David Evans, Pete..."
