In [3]:

import numpy as np 
import pandas as pd 

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Capture similarity 
from sklearn.metrics.pairwise import linear_kernel



In [4]:
# Read the raw Netflix catalogue from disk and preview its first rows.
csv_path = "./data/netflix_titles.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [5]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
data.shape

(6234, 12)

In [6]:
# Discard rows missing any of the text fields that are later merged into the
# combined description, then renumber the remaining rows from 0.
required_text_columns = ['cast', 'title', 'description', 'listed_in']
data = data.dropna(subset=required_text_columns).reset_index(drop=True)

In [7]:
# Strip every character that is neither a word character nor whitespace from
# the genre, description and title fields.
punctuation = re.compile(r'[^\w\s]')
for column in ('listed_in', 'description', 'title'):
    data[column] = data[column].map(lambda text: punctuation.sub('', text))

# Cast: remove the spaces inside each entry so a full name becomes one token,
# then turn the comma separators into spaces.
data['cast'] = data['cast'].map(lambda names: names.replace(' ', '').replace(',', ' '))

In [8]:
# Concatenate genres, cast, title and description into a single text field,
# then drop the source columns that are no longer needed (title is kept).
combined_text = (
    data['listed_in'] + '  ' + data['cast'] + ' ' + data['title'] + ' ' + data['description']
)
data = data.assign(combined=combined_text).drop(columns=['listed_in', 'cast', 'description'])
data.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,combined
0,81145628,Movie,Norm of the North King Sized Adventure,"Richard Finn, Tim Maltby","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,Children Family Movies Comedies AlanMarriott...
1,80117401,Movie,Jandino Whatever it Takes,,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,StandUp Comedy JandinoAsporaat Jandino Whatev...
2,70234439,TV Show,Transformers Prime,,United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids TV PeterCullen SumaleeMontano FrankWelke...
3,80058654,TV Show,Transformers Robots in Disguise,,United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids TV WillFriedle DarrenCriss ConstanceZimm...
4,80125979,Movie,realityhigh,Fernando Lebrija,United States,"September 8, 2017",2017,TV-14,99 min,Comedies NestaCooper KateWalsh JohnMichaelHig...


In [15]:
# Keep just the title and the combined text, and write them to disk.
export_columns = ['title', 'combined']
new_data = data[export_columns]
new_data.to_csv('new_data.csv')

In [16]:
# Preview the first three rows of the exported title/combined frame.
new_data.head(3)

Unnamed: 0,title,combined
0,Norm of the North King Sized Adventure,Children Family Movies Comedies AlanMarriott...
1,Jandino Whatever it Takes,StandUp Comedy JandinoAsporaat Jandino Whatev...
2,Transformers Prime,Kids TV PeterCullen SumaleeMontano FrankWelke...


In [10]:
# Count how often each title occurs; any count above 1 means the title is not
# a unique key for looking up a row.
data['title'].value_counts()

Tunnel                   3
Love                     3
The Silence              3
Oh My Ghost              3
Prince                   2
                        ..
Clash of the Titans      1
Planet 51                1
Strange Weather          1
AD Kingdom and Empire    1
The Mist                 1
Name: title, Length: 5608, dtype: int64

In [11]:
# Content similarity
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between two titles.
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(data["combined"])
cosine_similarities = linear_kernel(matrix, matrix)
movie_title = data['title']

# Title -> row lookup. Some titles occur more than once in the catalogue; keep
# only the first row for each so that indices[title] is always a single value
# rather than a Series of several rows.
indices = pd.Series(data.index, index=data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [17]:
def content_recommender(title, top_n=30):
    """Return the titles most similar to ``title``.

    Similarity is read from the precomputed ``cosine_similarities`` matrix,
    using ``indices`` to map the title to its row.

    Parameters
    ----------
    title : str
        Title exactly as stored in ``data['title']`` (punctuation already
        stripped, e.g. ``'Transformers Prime'``).
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Recommended titles ordered from most to least similar. The Series
        index identifies the corresponding rows of ``movie_title``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Several shows can share one title (e.g. "Tunnel"); the lookup then yields
    # every matching row. Recommend for the first of them instead of failing.
    if not np.isscalar(idx):
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    scores = np.asarray(cosine_similarities[idx])
    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row by identity rather than assuming it sorts first;
    # another row with an equal score could otherwise be dropped in its place.
    ranked = ranked[ranked != idx][:max(int(top_n), 0)]
    return movie_title.iloc[ranked]

In [18]:
# Example query: list the titles most similar to "Transformers Prime".
content_recommender('Transformers Prime')


<class 'pandas.core.series.Series'>


3                   Transformers Robots in Disguise
155                        Transformers Rescue Bots
288                         Transformers Cyberverse
1084                          Kulipari Dream Walker
1622                    All Hail King Julien Exiled
150                       Kulipari An Army of Frogs
5540                            Whats New ScoobyDoo
3102               Kipo and the Age of Wonderbeasts
2311                         Expelled from Paradise
4904                                 Beyond Skyline
2229                                            YOM
3500       Marvel Super Hero Adventures Frost Fight
336     Naruto Shippûden the Movie The Will of Fire
5157                 The Boss Baby Back in Business
5417                        3Below Tales of Arcadia
5554                     Voltron Legendary Defender
5571                                  The Originals
4830           Planet Earth The Complete Collection
3423                       The Last of the Schmucks
1670        