In [1]:
import pandas as pd
import numpy as np

In [2]:
# let's load netflix_title.csv data
data = pd.read_csv("datasets/netflix_titles.csv")

In [3]:
# information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [4]:
# the first five rows of the data
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
# Check for Missing Values
# let's count missing values in each column
data.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [6]:
# Read CSV with Specific Columns
data_subset = pd.read_csv("datasets/netflix_titles.csv", usecols = ["title", "type", "release_year", "rating"])

In [7]:
data_subset

Unnamed: 0,type,title,release_year,rating
0,Movie,Dick Johnson Is Dead,2020,PG-13
1,TV Show,Blood & Water,2021,TV-MA
2,TV Show,Ganglands,2021,TV-MA
3,TV Show,Jailbirds New Orleans,2021,TV-MA
4,TV Show,Kota Factory,2021,TV-MA
...,...,...,...,...
8802,Movie,Zodiac,2007,R
8803,TV Show,Zombie Dumb,2018,TV-Y7
8804,Movie,Zombieland,2009,R
8805,Movie,Zoom,2006,PG


In [8]:
data_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   release_year  8807 non-null   int64 
 3   rating        8803 non-null   object
dtypes: int64(1), object(3)
memory usage: 275.3+ KB


In [9]:
data = data.set_index('show_id')

In [10]:
data.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [11]:
# Fill missing values in country with "Unknown":
data['country'].fillna('Unknown')

show_id
s1       United States
s2        South Africa
s3             Unknown
s4             Unknown
s5               India
             ...      
s8803    United States
s8804          Unknown
s8805    United States
s8806    United States
s8807            India
Name: country, Length: 8807, dtype: object

In [12]:
# Filter Movies Only
movies = data[data['type'] == 'Movie']

In [13]:
movies.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...


In [14]:
# datas on movies
print(len(movies))
print(movies.shape[0])

6131
6131


In [15]:
# movie rating
movies['rating'].value_counts()

rating
TV-MA       2062
TV-14       1427
R            797
TV-PG        540
PG-13        490
PG           287
TV-Y7        139
TV-Y         131
TV-G         126
NR            75
G             41
TV-Y7-FV       5
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

### Let's extract more insights from the Netflix dataset

In [16]:
# Check Dataset Shape
data.shape

(8807, 11)

In [17]:
# Column Names
data.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [18]:
# Summary Statistics for Columns
data.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [19]:
# Data Cleaning & Missing Values
data.isnull().sum()

type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [20]:
data_cleaned = data.dropna(subset = ['director', 'cast', 'country'])

In [21]:
data_cleaned.shape

(5336, 11)

In [22]:
data_cleaned.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...
s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"September 21, 2021",1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...


In [23]:
data_cleaned.isnull().sum()

type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          1
duration        3
listed_in       0
description     0
dtype: int64

In [24]:
data_cleaned['rating'].fillna('Unkown', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['rating'].fillna('Unkown', inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['rating'].fillna('Unkown', inplace = True)


In [25]:
data_cleaned['rating'].value_counts()

rating
TV-MA       1822
TV-14       1214
R            778
PG-13        470
TV-PG        431
PG           275
TV-G          84
TV-Y7         76
TV-Y          76
NR            58
G             40
TV-Y7-FV       3
UR             3
NC-17          2
66 min         1
84 min         1
74 min         1
Unkown         1
Name: count, dtype: int64

In [26]:
data_cleaned.isnull().sum()

type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        3
listed_in       0
description     0
dtype: int64

In [27]:
# Count Number of Movies vs. TV Shows
data['type'].value_counts()

type
Movie      6131
TV Show    2676
Name: count, dtype: int64

In [28]:
# Find the Most Common Ratings
data['rating'].value_counts().head(10)

rating
TV-MA    3207
TV-14    2160
TV-PG     863
R         799
PG-13     490
TV-Y7     334
TV-Y      307
PG        287
TV-G      220
NR         80
Name: count, dtype: int64

In [29]:
# Find the Most Common Countries Producing Netflix Content
data['country'].value_counts().head(10)

country
United States     2818
India              972
United Kingdom     419
Japan              245
South Korea        199
Canada             181
Spain              145
France             124
Mexico             110
Egypt              106
Name: count, dtype: int64

In [30]:
# Find the Top 10 Directors with Most Content on Netflix
data['director'].value_counts().head(10)

director
Rajiv Chilaka             19
Raúl Campos, Jan Suter    18
Suhas Kadav               16
Marcus Raboy              16
Jay Karas                 14
Cathy Garcia-Molina       13
Martin Scorsese           12
Youssef Chahine           12
Jay Chapman               12
Steven Spielberg          11
Name: count, dtype: int64

In [31]:
# Count the Number of Titles Released Per Year 
# extract recent years
data['release_year'].value_counts().sort_index().tail(10)

release_year
2012     237
2013     288
2014     352
2015     560
2016     902
2017    1032
2018    1147
2019    1030
2020     953
2021     592
Name: count, dtype: int64

In [32]:
# Find the Most Recent Movies/Shows Added to Netflix
data[data['release_year'] == data['release_year'].max()][['title', 'release_year']].head(10)

Unnamed: 0_level_0,title,release_year
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s2,Blood & Water,2021
s3,Ganglands,2021
s4,Jailbirds New Orleans,2021
s5,Kota Factory,2021
s6,Midnight Mass,2021
s7,My Little Pony: A New Generation,2021
s9,The Great British Baking Show,2021
s10,The Starling,2021
s11,"Vendetta: Truth, Lies and The Mafia",2021
s12,Bangkok Breaking,2021


In [33]:
# Find All Netflix Originals from the USA
data[(data['country'] == 'United States') & (data['type'] == 'Movie')][['title', 'release_year']].head(10)

Unnamed: 0_level_0,title,release_year
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s1,Dick Johnson Is Dead,2020
s10,The Starling,2021
s28,Grown Ups,2010
s29,Dark Skies,2013
s42,Jaws,1975
s43,Jaws 2,1978
s44,Jaws 3,1983
s45,Jaws: The Revenge,1987
s49,Training Day,2001
s82,Kate,2021


In [34]:
# Find All TV Shows Rated "TV-MA"
data[(data['type'] == 'TV Show') & (data['rating'] == 'TV-MA')][['title', 'release_year']].head(10)

Unnamed: 0_level_0,title,release_year
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s2,Blood & Water,2021
s3,Ganglands,2021
s4,Jailbirds New Orleans,2021
s5,Kota Factory,2021
s6,Midnight Mass,2021
s11,"Vendetta: Truth, Lies and The Mafia",2021
s12,Bangkok Breaking,2021
s15,Crime Stories: India Detectives,2021
s16,Dear White People,2021
s18,Falsa identidad,2020


In [35]:
data_cleaned.to_csv("datasets/netflix_cleaned.csv")