In [255]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from rake_nltk import Rake # used to extract the key words from the description
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string as str

In [256]:
# Load 'netflix_titles.csv' (expected in the working directory) and preview the first rows
data=pd.read_csv('netflix_titles.csv')
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [257]:
# data inspection: column dtypes and non-null counts (shows which columns have missing values)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


In [258]:
# Data cleaning: add a 'key_words' column holding the key phrases Rake extracts from each description
r=Rake()

def rake(x):
    """Feed text ``x`` to the shared Rake instance ``r`` and return its ranked phrases."""
    r.extract_keywords_from_text(x)
    ranked_phrases = r.get_ranked_phrases()
    return ranked_phrases

data['key_words'] = data['description'].apply(rake)
data


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,key_words
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,"[polar bear king must take back, evil archaeol..."
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,"[jandino asporaat riffs, rousing rendition, ra..."
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...","[three human allies, protect earth, onslaught,..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...,"[prison ship crash unleashes hundreds, new aut..."
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,"[nerdy high schooler dani finally attracts, so..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,80000063,TV Show,Red vs. Blue,,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil...","[person shooter games, fiction films centers, ..."
6230,70286564,TV Show,Maron,,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews...","[interviews fellow comedians, marc maron stars..."
6231,80116008,Movie,Little Baby Bum: Nursery Rhyme Friends,,,,,2016,,60 min,Movies,Nursery rhymes and original music for children...,"[playful animation engage, original music, nur..."
6232,70281022,TV Show,A Young Doctor's Notebook and Other Stories,,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ...","[early career working, way village, russian re..."


In [259]:
def cleaning_text(words):
    """Join the items of ``words`` into one lower-cased, space-separated string.

    Items that occur as a substring of ``str.punctuation`` (``str`` being the
    ``string`` module as imported by this notebook) are skipped; this covers
    single punctuation marks and the empty string. Punctuation embedded inside
    a longer item is left untouched.
    """
    kept = []
    for word in words:
        if word in str.punctuation:
            continue
        kept.append(word.lower())
    return ' '.join(kept)

In [260]:
# Work on a copy so that `data` keeps key_words as a list of phrases (it is read again for model 2)
df=data.copy()

In [261]:
# Flatten each row's list of key phrases into a single lower-cased string via cleaning_text
df['key_words']=df['key_words'].apply(cleaning_text)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,key_words
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,polar bear king must take back evil archaeolog...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,jandino asporaat riffs rousing rendition raisi...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...",three human allies protect earth onslaught meg...
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...,prison ship crash unleashes hundreds new autob...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,nerdy high schooler dani finally attracts soci...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,80000063,TV Show,Red vs. Blue,,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil...",person shooter games fiction films centers civ...
6230,70286564,TV Show,Maron,,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews...",interviews fellow comedians marc maron stars m...
6231,80116008,Movie,Little Baby Bum: Nursery Rhyme Friends,,,,,2016,,60 min,Movies,Nursery rhymes and original music for children...,playful animation engage original music nurser...
6232,70281022,TV Show,A Young Doctor's Notebook and Other Stories,,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ...",early career working way village russian revol...


## Model 1: content-based recommender based only on the description

In [262]:
# Keep only the title and its key words, dropping rows where both values repeat an earlier row
df=df[['title', 'key_words']].drop_duplicates()

In [263]:
df

Unnamed: 0,title,key_words
0,Norm of the North: King Sized Adventure,polar bear king must take back evil archaeolog...
1,Jandino: Whatever it Takes,jandino asporaat riffs rousing rendition raisi...
2,Transformers Prime,three human allies protect earth onslaught meg...
3,Transformers: Robots in Disguise,prison ship crash unleashes hundreds new autob...
4,#realityhigh,nerdy high schooler dani finally attracts soci...
...,...,...
6229,Red vs. Blue,person shooter games fiction films centers civ...
6230,Maron,interviews fellow comedians marc maron stars m...
6231,Little Baby Bum: Nursery Rhyme Friends,playful animation engage original music nurser...
6232,A Young Doctor's Notebook and Other Stories,early career working way village russian revol...


In [264]:
# Use the title as the row index.
# NOTE: not re-runnable as-is -- after this cell 'title' is no longer a column.
df.set_index('title', inplace=True)


In [265]:
df.head()

Unnamed: 0_level_0,key_words
title,Unnamed: 1_level_1
Norm of the North: King Sized Adventure,polar bear king must take back evil archaeolog...
Jandino: Whatever it Takes,jandino asporaat riffs rousing rendition raisi...
Transformers Prime,three human allies protect earth onslaught meg...
Transformers: Robots in Disguise,prison ship crash unleashes hundreds new autob...
#realityhigh,nerdy high schooler dani finally attracts soci...


In [266]:
df.index

Index(['Norm of the North: King Sized Adventure', 'Jandino: Whatever it Takes',
       'Transformers Prime', 'Transformers: Robots in Disguise',
       '#realityhigh', 'Apaches', 'Automata',
       'Fabrizio Copano: Solo pienso en mi', 'Fire Chasers', 'Good People',
       ...
       'Anthony Bourdain: Parts Unknown', 'Frasier', 'La Familia P. Luche',
       'The Adventures of Figaro Pho', 'Kikoriki', 'Red vs. Blue', 'Maron',
       'Little Baby Bum: Nursery Rhyme Friends',
       'A Young Doctor's Notebook and Other Stories', 'Friends'],
      dtype='object', name='title', length=6233)

In [267]:
# Apply CountVectorizer for transformation (bag of words): one row per title, one column per vocabulary term
count = CountVectorizer()
count_matrix = count.fit_transform(df['key_words'])

# Lookup table from row position to title, in the same row order as count_matrix
indices = pd.Series(df.index)
count_matrix

<6233x16281 sparse matrix of type '<class 'numpy.int64'>'
	with 90804 stored elements in Compressed Sparse Row format>

In [268]:
# Create the matrix based on cosine_similarity: entry [i, j] compares rows i and j of count_matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [269]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim, top_n=10, titles=None):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        Title to find recommendations for.
    cosine_sim : 2-D array-like
        Square similarity matrix whose row order matches ``titles``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    titles : sequence of str, optional
        Titles in the same order as the rows of ``cosine_sim``. Defaults to
        the index of the global ``df``.

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``titles``.
    ValueError
        If ``titles`` and ``cosine_sim`` do not have the same number of rows.
    """
    # Resolve positions against the current titles instead of the module-level
    # `indices` Series, which goes stale whenever `df` is rebuilt.
    titles = pd.Index(df.index if titles is None else titles)
    if len(titles) != len(cosine_sim):
        raise ValueError(
            "cosine_sim has {} rows but there are {} titles; rebuild the matrix "
            "from the current frame".format(len(cosine_sim), len(titles))
        )

    # position of the first row whose title matches
    matches = np.flatnonzero(titles == title)
    if matches.size == 0:
        raise IndexError("title {!r} not found".format(title))
    idx = int(matches[0])

    scores = pd.Series(np.asarray(cosine_sim[idx], dtype=float).ravel())

    # Remove the queried row explicitly: when other rows tie with it at 1.0 it
    # is not guaranteed to sort first, so slicing [1:] could return the query
    # itself. A stable sort keeps tied rows in their original order.
    ranked = scores.drop(idx).sort_values(ascending=False, kind='mergesort')

    return [titles[i] for i in ranked.index[:top_n]]

In [271]:
# recommend top 10 movies only based on show description
recommendations('Good People', cosine_sim)

['Lechmi',
 'Paranoid',
 'Band Aid',
 'Private Life',
 'Bhoot',
 'I Am Not Madame Bovary',
 'Qila',
 'Kill Me If You Dare',
 'Luck by Chance',
 'Blood Money']

## Model 2: content-based recommender based on combined features: director, cast, country, listed_in and description

In [329]:
# Start again from `data` (key_words is still a list of phrases there) and count missing values per column
df=data[['title', 'director', 'cast', 'country', 'listed_in', 'key_words']]
df.isnull().sum()

title           0
director     1969
cast          570
country       476
listed_in       0
key_words       0
dtype: int64

In [330]:
# fill null values with empty strings so the string operations in the following cells work on every row
df=df.fillna('')
df.head(5)

Unnamed: 0,title,director,cast,country,listed_in,key_words
0,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","Children & Family Movies, Comedies","[polar bear king must take back, evil archaeol..."
1,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,Stand-Up Comedy,"[jandino asporaat riffs, rousing rendition, ra..."
2,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,Kids' TV,"[three human allies, protect earth, onslaught,..."
3,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,Kids' TV,"[prison ship crash unleashes hundreds, new aut..."
4,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,Comedies,"[nerdy high schooler dani finally attracts, so..."


In [331]:
# Lower-case each value and split it on commas into a list
for col in ['director', 'cast', 'country']:
    df[col]=df[col].map(lambda x: x.lower().split(',')[0:2])  # pick the first 2 items if there are more

In [335]:
# Split the genre string on whitespace into tokens; tokens keep any trailing comma at this point
df['listed_in']=df['listed_in'].str.split()

In [336]:
df

Unnamed: 0_level_0,director,cast,country,listed_in,key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Norm of the North: King Sized Adventure,"[richardfinn, timmaltby]","[alanmarriott, andrewtoth]","[unitedstates, india]","[Children, &, Family, Movies,, Comedies]","[polar bear king must take back, evil archaeol..."
Jandino: Whatever it Takes,[],[jandinoasporaat],[unitedkingdom],"[Stand-Up, Comedy]","[jandino asporaat riffs, rousing rendition, ra..."
Transformers Prime,[],"[petercullen, sumaleemontano]",[unitedstates],"[Kids', TV]","[three human allies, protect earth, onslaught,..."
Transformers: Robots in Disguise,[],"[willfriedle, darrencriss]",[unitedstates],"[Kids', TV]","[prison ship crash unleashes hundreds, new aut..."
#realityhigh,[fernandolebrija],"[nestacooper, katewalsh]",[unitedstates],[Comedies],"[nerdy high schooler dani finally attracts, so..."
...,...,...,...,...,...
Red vs. Blue,[],"[burnieburns, jasonsaldaña]",[unitedstates],"[TV, Action, &, Adventure,, TV, Comedies,, TV,...","[person shooter games, fiction films centers, ..."
Maron,[],"[marcmaron, juddhirsch]",[unitedstates],"[TV, Comedies]","[interviews fellow comedians, marc maron stars..."
Little Baby Bum: Nursery Rhyme Friends,[],[],[],[Movies],"[playful animation engage, original music, nur..."
A Young Doctor's Notebook and Other Stories,[],"[danielradcliffe, jonhamm]",[unitedkingdom],"[British, TV, Shows,, TV, Comedies,, TV, Dramas]","[early career working, way village, russian re..."


In [332]:
# merging together first and last name into one word for each actor, director and country
# Whole columns are assigned instead of writing to the rows yielded by iterrows():
# those rows may be copies, in which case the writes would never reach df.
for col in ['cast', 'director', 'country']:
    df[col] = df[col].map(lambda names: [name.lower().replace(' ', '') for name in names])

In [334]:
# set 'title' as index (it is then no longer one of df.columns, which the next step combines)
df.set_index('title', inplace=True)


In [337]:
# combine all the feature columns together into one column - 'combined_features'
# Each feature column holds a list of strings per row at this point. The target column
# is excluded from the inputs so that re-running the cell gives the same result.
feature_cols = [col for col in df.columns if col != 'combined_features']
df['combined_features'] = [
    ' '.join(token.lower() for cell in cells for token in cell)
    for cells in zip(*(df[col] for col in feature_cols))
]
# remove punctuation, i.e. every character that is neither a word character nor whitespace
# (digits are word characters and are kept); regex=True is required for the pattern to be
# treated as a regular expression rather than a literal string
df['combined_features'] = df['combined_features'].str.replace(r'[^\w\s]', '', regex=True)

In [340]:
df['combined_features'].tolist()[0:3]

['richardfinn timmaltby alanmarriott andrewtoth unitedstates india children  family movies comedies polar bear king must take back evil archaeologist first stolen artifact awesome wedding planning grandfather',
 ' jandinoasporaat unitedkingdom standup comedy jandino asporaat riffs rousing rendition raising kids comedy show sex serenades fire challenges audience',
 ' petercullen sumaleemontano unitedstates kids tv three human allies protect earth onslaught megatron leader help decepticons autobots']

In [341]:
# Keep only the combined text column (titles remain as the index) and check it for nulls
df=df[['combined_features']]
df.isnull().sum()

combined_features    0
dtype: int64

In [342]:
# Drop rows whose combined feature text repeats an earlier row. Rebinding the name instead of
# using inplace=True avoids mutating a frame that was obtained by column selection.
df = df.drop_duplicates()

In [343]:
df

Unnamed: 0_level_0,combined_features
title,Unnamed: 1_level_1
Norm of the North: King Sized Adventure,richardfinn timmaltby alanmarriott andrewtoth ...
Jandino: Whatever it Takes,jandinoasporaat unitedkingdom standup comedy ...
Transformers Prime,petercullen sumaleemontano unitedstates kids ...
Transformers: Robots in Disguise,willfriedle darrencriss unitedstates kids tv ...
#realityhigh,fernandolebrija nestacooper katewalsh unitedst...
...,...
Red vs. Blue,burnieburns jasonsaldaña unitedstates tv acti...
Maron,marcmaron juddhirsch unitedstates tv comedies...
Little Baby Bum: Nursery Rhyme Friends,movies playful animation engage original mu...
A Young Doctor's Notebook and Other Stories,danielradcliffe jonhamm unitedkingdom british...


In [344]:
# Applying CountVectorizer transformation and cosine similarity
CV=CountVectorizer()
CV_matrix=CV.fit_transform(df['combined_features'])
cos_sim2=cosine_similarity(CV_matrix,CV_matrix)
# Rebuild the position -> title lookup read by recommendations(): the one created for
# model 1 came from a different df and may not line up with the rows of cos_sim2.
indices = pd.Series(df.index)
cos_sim2

array([[1.        , 0.        , 0.05      , ..., 0.10327956, 0.03713907,
        0.07844645],
       [0.        , 1.        , 0.0559017 , ..., 0.        , 0.04152274,
        0.        ],
       [0.05      , 0.0559017 , 1.        , ..., 0.        , 0.1392715 ,
        0.1470871 ],
       ...,
       [0.10327956, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.03713907, 0.04152274, 0.1392715 , ..., 0.        , 1.        ,
        0.25492496],
       [0.07844645, 0.        , 0.1470871 , ..., 0.        , 0.25492496,
        1.        ]])

In [350]:
# recommend top 10 movies based on the combined features (director, cast, country, listed_in, key words)
recommendations('Good People', cos_sim2)

['Blood Money',
 'Inception',
 'Inconceivable',
 'Big Kill',
 'Ninja Assassin',
 'The Saint',
 'The Darkest Dawn',
 'The World Is Not Enough',
 'The Taking of Pelham 123',
 'Black Hawk Down']