In [1]:
import numpy as numpy
import pandas as pd

In [2]:
raw_data = pd.read_csv('netflix_titles.csv')

In [7]:
import spacy

In [8]:
nlp = spacy.load('en_core_web_sm')

In [3]:
raw_data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [6]:
raw_data.type.value_counts()

Movie      4265
TV Show    1969
Name: type, dtype: int64

In [9]:
raw_data.isnull().sum()

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64

In [83]:
df = raw_data.copy()

In [84]:
df['movie_info'] = df['director'].fillna(" ") + ' ' + df['listed_in'] + ' ' + df['description'] + ' ' + df['listed_in'] + ' ' + df['cast'].fillna(" ")

In [85]:
df.movie_info.isnull().sum()

0

In [86]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'movie_info'],
      dtype='object')

In [87]:
df = df.drop(columns=['show_id', 'type','director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'], axis=1)

In [88]:
df.head()

Unnamed: 0,title,movie_info
0,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby Children & Family Mov..."
1,Jandino: Whatever it Takes,Stand-Up Comedy Jandino Asporaat riffs on th...
2,Transformers Prime,Kids' TV With the help of three human allies...
3,Transformers: Robots in Disguise,Kids' TV When a prison ship crash unleashes ...
4,#realityhigh,Fernando Lebrija Comedies When nerdy high scho...


In [89]:
import string
import re

In [90]:
punctuations = string.punctuation

In [91]:
stop_words = nlp.Defaults.stop_words

In [92]:
def remove_punctuations(text):
    return re.sub('['+punctuations+']','',text)

In [93]:
def remove_stopwords(text):
    words = []
    for tokens in nlp(text):
        if tokens not in stop_words:
            if tokens.lemma_ != '-PRON-':
                words.append(tokens.lemma_.rstrip())
    return " ".join(words)

In [94]:
def denoise_data(text):
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [95]:
df.movie_info = df.movie_info.apply(denoise_data)

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
vec = CountVectorizer()

In [98]:
X = vec.fit_transform(df.movie_info)

In [99]:
from sklearn.metrics.pairwise import cosine_similarity

In [100]:
cosine_sim = cosine_similarity(X)

In [130]:
def recommended_titles(liked_title):
    idx_liked = df[df['title'] == liked_title].index[0]
    similar_titles = sorted(list(enumerate(cosine_sim[df[df['title']== liked_title].index.values[0]])), key=lambda x: x[1],reverse=True)
    similar_titles.pop(0)
    similar_titles = similar_titles[:10]
    rec_titles = [df.title[idx] for idx,cos_val in similar_titles]
    rec_title_df = pd.DataFrame(data=rec_titles,columns=['Recommended Titles'],index=np.arange(1,11))
    #print(f'Recommended Titles: {dict(enumerate(rec_titles))}')
    print(rec_title_df.head(10))

In [131]:
recommended_titles('Transformers Prime')

                                  Recommended Titles
1                                                YOM
2                        Pororo - The Little Penguin
3                             What's New Scooby-Doo?
4                DreamWorks Spooky Stories: Volume 2
5                        Bobby Kennedy for President
6                             Kulipari: Dream Walker
7                                           The Deep
8                                       Danger Mouse
9   Prohibition: A Film by Ken Burns and Lynn Novick
10                         Power Rangers Dino Charge
