# Building Netflix Recommendation System: content-based filtering algorithms  

# Data wrangling 

In [1]:
# Import the pandas and numpy packages from Python
import numpy as np
import pandas as pd

In [2]:
# Import the data 
#url = 'https://www.kaggle.com/shivamb/netflix-shows?select=netflix_titles.csv'
mydata = pd.read_csv('netflix_titles.csv', header = 0)
mydata.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,23-Dec-16,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,20-Dec-18,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,16-Nov-17,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,1-Jan-20,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Find all the columns name
mydata.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [4]:
mydata.shape

(7787, 12)

In [5]:
# Fund out the total number of null values in each column 
mydata.isnull().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [6]:
# Find out the data type of each column
mydata.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [7]:
mydata.tail()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7782,s7783,Movie,Zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...","Sweden, Czech Republic, United Kingdom, Denmar...",19-Oct-20,2005,TV-MA,99 min,"Dramas, International Movies",When Lebanon's Civil War deprives Zozo of his ...
7783,s7784,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2-Mar-19,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...
7784,s7785,Movie,Zulu Man in Japan,,Nasty C,,25-Sep-20,2019,TV-MA,44 min,"Documentaries, International Movies, Music & M...","In this documentary, South African rapper Nast..."
7785,s7786,TV Show,Zumbo's Just Desserts,,"Adriano Zumbo, Rachel Khoo",Australia,31-Oct-20,2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...
7786,s7787,Movie,ZZ TOP: THAT LITTLE OL' BAND FROM TEXAS,Sam Dunn,,"United Kingdom, Canada, United States",1-Mar-20,2019,TV-MA,90 min,"Documentaries, Music & Musicals",This documentary delves into the mystique behi...


In [8]:
# List the different counties in the data set
mydata['country'].unique()[:10]

array(['Brazil', 'Mexico', 'Singapore', 'United States', 'Turkey',
       'Egypt', 'India', 'Poland, United States', 'Thailand', 'Nigeria'],
      dtype=object)

In [9]:
# List countries and the number of times they appear in the data set, display top 10 countries  
mydata['country'].value_counts()[:10]

United States     2555
India              923
United Kingdom     397
Japan              226
South Korea        183
Canada             177
Spain              134
France             115
Egypt              101
Mexico             100
Name: country, dtype: int64

In [10]:
# List the types of show listed in the data set
mydata['type'].value_counts()

Movie      5377
TV Show    2410
Name: type, dtype: int64

In [11]:
# List the number of years appeared in the data set and diplay top 10 years  
mydata['release_year'].value_counts()[:10]

2018    1121
2017    1012
2019     996
2016     882
2020     868
2015     541
2014     334
2013     267
2012     219
2010     173
Name: release_year, dtype: int64

In [12]:
# List the year wise number of movie/tv show release
year_show_count = pd.DataFrame(mydata['release_year'].value_counts())
year_show_count.head()

Unnamed: 0,release_year
2018,1121
2017,1012
2019,996
2016,882
2020,868


In [13]:
# In the 'director' column many values are missing, so this column was dropped from the dataframe 
mydata2 = mydata.drop(columns = ['director'], axis = 1)
mydata2.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,23-Dec-16,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,20-Dec-18,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,16-Nov-17,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,1-Jan-20,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [14]:
mydata2['date_added'] = pd.to_datetime(mydata2['date_added'])
mydata2.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020-08-14,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016-12-23,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018-12-20,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017-11-16,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020-01-01,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [15]:
mydata2['date_added'] = mydata2['date_added'].dt.year
mydata2.head(3)

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020.0,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016.0,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018.0,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."


In [16]:
mydata2.isnull().sum()

show_id           0
type              0
title             0
cast            718
country         507
date_added       10
release_year      0
rating            7
duration          0
listed_in         0
description       0
dtype: int64

In [17]:
mydata2.dropna(inplace = True)
mydata2.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020.0,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016.0,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018.0,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017.0,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020.0,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [18]:
mydata2.isnull().sum()

show_id         0
type            0
title           0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [19]:
mydata2['rating'].value_counts()

TV-MA       2450
TV-14       1636
R            655
TV-PG        627
PG-13        375
PG           238
TV-Y7        208
TV-Y         192
TV-G         146
NR            66
G             38
UR             5
TV-Y7-FV       4
NC-17          3
Name: rating, dtype: int64

In [20]:
mydata2.shape

(6643, 11)

In [21]:
mydata2.columns

Index(['show_id', 'type', 'title', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [22]:
mydata2['duration'].value_counts()

1 Season     1184
2 Seasons     314
3 Seasons     160
93 min        121
94 min        120
             ... 
191 min         1
190 min         1
196 min         1
205 min         1
181 min         1
Name: duration, Length: 203, dtype: int64

In [23]:
# Make a dataframe containing only TV shows
mydata2_seasons_only = mydata2.loc[mydata2['duration'].isin(['1 Season', '2 Seasons', '3 Seasons'])]
mydata2_seasons_only.head()                               

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
5,s6,TV Show,46,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,2017.0,2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
11,s12,TV Show,1983,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States",2018.0,2018,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
24,s25,TV Show,​SAINT SEIYA: Knights of the Zodiac,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",Japan,2020.0,2020,TV-14,2 Seasons,"Anime Series, International TV Shows",Seiya and the Knights of the Zodiac rise again...
29,s30,TV Show,#blackAF,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,2020,TV-MA,1 Season,TV Comedies,Kenya Barris and his family navigate relations...
45,s46,TV Show,Şubat,"Alican Yücesoy, Melisa Sözen, Musa Uzunlar, Se...",Turkey,2017.0,2013,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas",An orphan subjected to tests that gave him sup...


In [24]:
mydata2_seasons_only.shape

(1658, 11)

In [25]:
# Make a dataframe of Movies only
mydata2_movies_only = mydata2[mydata2['type'] == 'Movie']
mydata2_movies_only.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016.0,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018.0,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017.0,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020.0,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
6,s7,Movie,122,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",Egypt,2020.0,2019,TV-MA,95 min,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [26]:
mydata2_movies_only.shape

(4761, 11)

In [27]:
# Convert the column name 'duration' to 'duration_min' 
mydata2_movies_only.rename(columns = {'duration':'duration_min'}, inplace = True)
mydata2_movies_only.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration_min,listed_in,description
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016.0,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018.0,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."


In [28]:
# Remove 'min' unit from contents of "duration" column, keep only the minutes in number and rename the column name
#"duration_min"
# For this, first split that column and keep only the number
mydata2_movies_only2 = mydata2_movies_only['duration_min'].str.split(" ", expand = True)
mydata2_movies_only2.head(2)

Unnamed: 0,0,1
1,93,min
2,78,min


In [29]:
mydata2_movies_only['duration_min'] = mydata2_movies_only2[0] 
mydata2_movies_only.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration_min,listed_in,description
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016.0,2016,TV-MA,93,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018.0,2011,R,78,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017.0,2009,PG-13,80,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020.0,2008,PG-13,123,Dramas,A brilliant group of students become card-coun...
6,s7,Movie,122,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",Egypt,2020.0,2019,TV-MA,95,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [30]:
# As in the movie only dataframe,  drop the 'Season/Seasons' from the 'duration' column of the mydata2_seasons_only dataframe. 
# keep only the numbers.
mydata2_seasons_only.rename(columns = {'duration': 'Number_of_seasons'}, inplace = True)
mydata2_seasons_only.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,Number_of_seasons,listed_in,description
5,s6,TV Show,46,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,2017.0,2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
11,s12,TV Show,1983,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States",2018.0,2018,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
24,s25,TV Show,​SAINT SEIYA: Knights of the Zodiac,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",Japan,2020.0,2020,TV-14,2 Seasons,"Anime Series, International TV Shows",Seiya and the Knights of the Zodiac rise again...
29,s30,TV Show,#blackAF,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,2020,TV-MA,1 Season,TV Comedies,Kenya Barris and his family navigate relations...
45,s46,TV Show,Şubat,"Alican Yücesoy, Melisa Sözen, Musa Uzunlar, Se...",Turkey,2017.0,2013,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas",An orphan subjected to tests that gave him sup...


In [31]:
mydata2_seasons_only2 = mydata2_seasons_only.replace({'1 Season': 1, '2 Seasons': 2, '3 Seasons': 3})
mydata2_seasons_only2.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,Number_of_seasons,listed_in,description
5,s6,TV Show,46,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,2017.0,2016,TV-MA,1,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
11,s12,TV Show,1983,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States",2018.0,2018,TV-MA,1,"Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
24,s25,TV Show,​SAINT SEIYA: Knights of the Zodiac,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",Japan,2020.0,2020,TV-14,2,"Anime Series, International TV Shows",Seiya and the Knights of the Zodiac rise again...
29,s30,TV Show,#blackAF,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,2020,TV-MA,1,TV Comedies,Kenya Barris and his family navigate relations...
45,s46,TV Show,Şubat,"Alican Yücesoy, Melisa Sözen, Musa Uzunlar, Se...",Turkey,2017.0,2013,TV-MA,1,"Crime TV Shows, International TV Shows, TV Dramas",An orphan subjected to tests that gave him sup...
