## Importing Libraries & Getting Data

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [3]:
# Load the Netflix Originals dataset; the file is decoded as latin-1.
data = pd.read_csv(r"dataset/NetflixOriginals.csv", encoding='latin-1')

In [4]:
# Peek at the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [5]:
# Column dtypes, non-null counts and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


In [6]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,Runtime,IMDB Score
count,584.0,584.0
mean,93.577055,6.271747
std,27.761683,0.979256
min,4.0,2.5
25%,86.0,5.7
50%,97.0,6.35
75%,108.0,7.0
max,209.0,9.0


In [7]:
# List the column labels.
data.columns

Index(['Title', 'Genre', 'Premiere', 'Runtime', 'IMDB Score', 'Language'], dtype='object')

## Handling null values

In [8]:
# Count missing values per column.
data.isnull().sum()

Title         0
Genre         0
Premiere      0
Runtime       0
IMDB Score    0
Language      0
dtype: int64

In [9]:
# Parse the textual premiere dates into timestamps and keep them in a new column.
data["date"] = data["Premiere"].pipe(pd.to_datetime)
data["date"]

0     2019-08-05
1     2020-08-21
2     2019-12-26
3     2018-01-19
4     2020-10-30
         ...    
579   2018-12-31
580   2015-10-09
581   2018-12-16
582   2020-12-08
583   2020-10-04
Name: date, Length: 584, dtype: datetime64[ns]

## Converting dates to their respective days, months & years

In [10]:
# Derive calendar features from the parsed premiere date.
premiere_dt = data['date'].dt
data['year_month'] = premiere_dt.strftime('%Y-%m')
data['year'] = premiere_dt.year
data['month'] = premiere_dt.month
# dayofweek is numeric: Monday is 0 and Sunday is 6.
data['day'] = premiere_dt.dayofweek

In [11]:
# Display the frame, which now includes the derived date columns.
data

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,date,year_month,year,month,day
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese,2019-08-05,2019-08,2019,8,0
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish,2020-08-21,2020-08,2020,8,4
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian,2019-12-26,2019-12,2019,12,3
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English,2018-01-19,2018-01,2018,1,4
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi,2020-10-30,2020-10,2020,10,4
...,...,...,...,...,...,...,...,...,...,...,...
579,Taylor Swift: Reputation Stadium Tour,Concert Film,"December 31, 2018",125,8.4,English,2018-12-31,2018-12,2018,12,0
580,Winter on Fire: Ukraine's Fight for Freedom,Documentary,"October 9, 2015",91,8.4,English/Ukranian/Russian,2015-10-09,2015-10,2015,10,4
581,Springsteen on Broadway,One-man show,"December 16, 2018",153,8.5,English,2018-12-16,2018-12,2018,12,6
582,Emicida: AmarElo - It's All For Yesterday,Documentary,"December 8, 2020",89,8.6,Portuguese,2020-12-08,2020-12,2020,12,1


## EDA of Genre

In [12]:
# Number of distinct genre labels in the dataset.
gen = data["Genre"].nunique()
print(f"So we have a total of {gen} different genres.")


So we have a total of 115 different genres.


In [13]:
# Share of titles per genre (normalize=True returns proportions instead of counts).
data["Genre"].value_counts(normalize=True)

Documentary                             0.272260
Drama                                   0.131849
Comedy                                  0.083904
Romantic comedy                         0.066781
Thriller                                0.056507
                                          ...   
Romantic comedy-drama                   0.001712
Heist film/Thriller                     0.001712
Musical/Western/Fantasy                 0.001712
Horror anthology                        0.001712
Animation/Christmas/Comedy/Adventure    0.001712
Name: Genre, Length: 115, dtype: float64

In [14]:
# Keep the 20 most frequent genres (value_counts sorts by count, descending).
genre_top20 = data['Genre'].value_counts().head(20)
genre_top20

Documentary                 159
Drama                        77
Comedy                       49
Romantic comedy              39
Thriller                     33
Comedy-drama                 14
Crime drama                  11
Biopic                        9
Horror                        9
Action                        7
Romance                       6
Concert Film                  6
Aftershow / Interview         6
Animation                     5
Action comedy                 5
Romantic drama                5
Psychological thriller        4
Science fiction/Thriller      4
Variety show                  4
Science fiction               4
Name: Genre, dtype: int64

In [15]:
# Bar chart of title counts for the 20 most frequent genres.
fig = px.bar(
    data_frame=genre_top20,
    x=genre_top20.index,
    y=genre_top20.values,
    labels={'y':'Number of Movies from the Genre' , 'index':'Genres'},
)
# Order the bars from the tallest to the shortest.
fig.update_layout(xaxis={'categoryorder' : 'total descending'})
fig.show()

## EDA of Languages

In [16]:
# Number of distinct language labels in the dataset.
lang = data['Language'].nunique()
print(f"So we have a total of {lang} different languages.")

So we have a total of 38 different languages.


In [17]:
# Keep the 10 most frequent languages (value_counts sorts by count, descending).
lang_top10 = data['Language'].value_counts().head(10)
lang_top10

English       401
Hindi          33
Spanish        31
French         20
Italian        14
Portuguese     12
Indonesian      9
Japanese        6
Korean          6
German          5
Name: Language, dtype: int64

In [18]:
# Bar chart of title counts for the 10 most frequent languages, drawn in red.
fig = px.bar(
    data_frame=lang_top10,
    x=lang_top10.index,
    y=lang_top10.values,
    labels={'y': 'Number of languages used', 'x': 'Top 10 Languages'},
    color_discrete_sequence=['red'],
)
# Order the bars from the tallest to the shortest.
fig.update_layout(xaxis={'categoryorder' : 'total descending'})
fig.show()


## EDA of Runtime

In [19]:
# Summary statistics for the Runtime column.
data['Runtime'].describe()

# The mean runtime (~93.6) is lower than the median (97), which suggests a left-skewed distribution: most titles sit around the median or above, while a long tail of short runtimes pulls the mean down.

count    584.000000
mean      93.577055
std       27.761683
min        4.000000
25%       86.000000
50%       97.000000
75%      108.000000
max      209.000000
Name: Runtime, dtype: float64

In [20]:
# Histogram of runtimes, with a small gap between bars for readability.
fig = px.histogram(
    data_frame=data,
    x='Runtime',
    title="Runtime of the Programs on Netflix",
)
fig.update_layout(bargap=0.1)
fig.show()


In [21]:
# Box plot of runtimes; hovering a point also shows the title and genre.
# hover_data takes column names, so pass the names rather than a DataFrame slice.
fig = px.box(data, x='Runtime', hover_data=['Title', 'Genre'])
# Compute quartiles with the "inclusive" method (the median is included in both halves).
fig.update_traces(quartilemethod="inclusive")
fig.show()

## EDA of IMDB Score

In [22]:
# Summary statistics for the IMDB Score column.
data["IMDB Score"].describe()

count    584.000000
mean       6.271747
std        0.979256
min        2.500000
25%        5.700000
50%        6.350000
75%        7.000000
max        9.000000
Name: IMDB Score, dtype: float64

In [23]:
# Histogram of IMDB scores, with a small gap between bars for readability.
fig = px.histogram(
    data_frame=data,
    x='IMDB Score',
    title="IMDB Scores of Netflix Originals",
    color_discrete_sequence=['green'],
)
fig.update_layout(bargap=0.1)
fig.show()


In [24]:
# Box plot of IMDB scores; hovering a point also shows the title and genre.
# hover_data takes column names, so pass the names rather than a DataFrame slice.
fig = px.box(data, x='IMDB Score', hover_data=['Title', 'Genre'])
# Compute quartiles with the "inclusive" method (the median is included in both halves).
fig.update_traces(quartilemethod="inclusive")
fig.show()

## Correlation between Runtime & IMDB scores

In [25]:
# Pairwise correlation between runtime and IMDB score (DataFrame.corr default method).
runtime_scores_corr = data.loc[:, ['Runtime', 'IMDB Score']].corr()
runtime_scores_corr

Unnamed: 0,Runtime,IMDB Score
Runtime,1.0,-0.040896
IMDB Score,-0.040896,1.0


In [26]:
# Scatter of runtime against IMDB score; hovering a point also shows the title and genre.
# hover_data takes column names, so pass the names rather than a DataFrame slice.
fig = px.scatter(data, x='IMDB Score', y='Runtime', hover_data=['Title', 'Genre'])
fig.show()

## EDA of Dates

### EDA of Year

In [27]:
# Number of titles per premiere year, most frequent year first.
year = data['year'].value_counts()
year

2020    183
2019    125
2018     99
2021     71
2017     66
2016     30
2015      9
2014      1
Name: year, dtype: int64

In [28]:
# Bar chart of the number of titles premiered in each year.
fig = px.bar(
    data_frame=year,
    x=year.index,
    y=year.values,
    labels={'y':'Count of Movies each year' , 'index':'Year'},
    color_discrete_sequence=['pink'],
)
fig.update_layout(xaxis={'categoryorder' : 'total descending'})
fig.show()

### EDA of Month

In [29]:
# Number of titles per premiere month (1-12).
# sort=False skips sorting by frequency, so the resulting index is NOT in calendar order.
month = data['month'].value_counts(sort=False)
month

8     37
12    51
1     37
10    77
11    57
6     35
3     48
5     53
4     63
9     53
2     39
7     34
Name: month, dtype: int64

In [30]:
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun','Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# `month` is indexed by month number but is not in calendar order, so pairing
# its raw values with Jan..Dec would attach counts to the wrong month names.
# Reindex to 1..12 so each count lines up with its label (a missing month becomes 0).
month_by_calendar = month.reindex(range(1, 13), fill_value=0)

fig = px.bar(month_by_calendar, x=months, y=month_by_calendar.values, labels={'y': 'Count of movies of released each month', 'x': 'Month'}, color_discrete_sequence=['darkblue'])

fig.show()


### EDA of Day

In [31]:
# Number of titles per premiere weekday; `day` holds dt.dayofweek values (0 = Monday ... 6 = Sunday).
# sort=False skips sorting by frequency, so the resulting index is NOT in weekday order.
day = data['day'].value_counts(sort=False)
day

0     17
4    383
3     59
1     29
2     82
6      9
5      5
Name: day, dtype: int64

In [32]:
days = ['Mon', 'Tues' , 'Wed' , 'Thurs' , 'Fri' , 'Sat' , 'Sun']

# `day` is indexed by weekday number (0 = Monday ... 6 = Sunday) but is not in
# weekday order, so pairing its raw values with Mon..Sun would mislabel the bars.
# Reindex to 0..6 so each count lines up with its label (a missing weekday becomes 0).
day_by_weekday = day.reindex(range(7), fill_value=0)

fig = px.bar(day_by_weekday, x=days, y=day_by_weekday.values, labels={'y': 'Count of Movies released each Day', 'x': 'Day'}, color_discrete_sequence=['brown'])

fig.show()


## Top Entries

In [33]:
# Mean IMDB score per genre, keeping the ten highest averages.
top_10_rating_bygenre = (
    data.groupby('Genre')['IMDB Score']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

top_10_rating_bygenre

Genre
Animation/Christmas/Comedy/Adventure    8.200000
Musical / Short                         7.700000
Concert Film                            7.633333
Anthology/Dark comedy                   7.600000
Animation / Science Fiction             7.500000
Making-of                               7.450000
Action-adventure                        7.300000
Historical drama                        7.200000
Coming-of-age comedy-drama              7.200000
Drama-Comedy                            7.200000
Name: IMDB Score, dtype: float64

In [34]:
# Bar chart of the ten genres with the highest average IMDB score.
fig = px.bar(
    data_frame=top_10_rating_bygenre,
    x=top_10_rating_bygenre.index,
    y=top_10_rating_bygenre.values,
    labels={'y': 'Average Rating Score', 'x': 'Genre'},
    color_discrete_sequence=['pink'],
)

fig.show()


In [35]:
# Mean IMDB score per genre, keeping the ten lowest averages.
bottom_10_ratings_by_genre = (
    data.groupby('Genre')['IMDB Score']
    .mean()
    .sort_values()
    .head(10)
)

bottom_10_ratings_by_genre


Genre
Heist film/Thriller        3.700000
Musical/Western/Fantasy    3.900000
Horror anthology           4.300000
Political thriller         4.300000
Superhero-Comedy           4.400000
Science fiction/Drama      4.533333
Romance drama              4.600000
Mystery                    4.650000
Horror thriller            4.700000
Anime / Short              4.700000
Name: IMDB Score, dtype: float64

In [36]:
# Bar chart of the ten genres with the lowest average IMDB score.
fig = px.bar(
    data_frame=bottom_10_ratings_by_genre,
    x=bottom_10_ratings_by_genre.index,
    y=bottom_10_ratings_by_genre.values,
    labels={'y': 'Average Rating Score', 'x': 'Genre'},
    color_discrete_sequence=['purple'],
)

fig.show()


## Top 20 highest-rated movies

In [37]:
# The 20 titles with the highest IMDB score, with a few descriptive columns.
top_20_movies = (
    data[['IMDB Score', 'Title', 'Genre', 'year', 'Language']]
    .sort_values(['IMDB Score'], ascending=False)
    .head(20)
)

top_20_movies


Unnamed: 0,IMDB Score,Title,Genre,year,Language
583,9.0,David Attenborough: A Life on Our Planet,Documentary,2020,English
582,8.6,Emicida: AmarElo - It's All For Yesterday,Documentary,2020,Portuguese
581,8.5,Springsteen on Broadway,One-man show,2018,English
580,8.4,Winter on Fire: Ukraine's Fight for Freedom,Documentary,2015,English/Ukranian/Russian
579,8.4,Taylor Swift: Reputation Stadium Tour,Concert Film,2018,English
578,8.4,Ben Platt: Live from Radio City Music Hall,Concert Film,2020,English
577,8.3,Dancing with the Birds,Documentary,2019,English
576,8.3,Cuba and the Cameraman,Documentary,2017,English
573,8.2,Klaus,Animation/Christmas/Comedy/Adventure,2019,English
571,8.2,13th,Documentary,2016,English


In [38]:
# Dot plot of the 20 highest-scoring titles, coloured by genre.
# hover_data takes column names, so pass the names rather than a DataFrame slice.
fig = px.scatter(top_20_movies, y='Title', x='IMDB Score', hover_data=['Genre', 'year', 'Language'], color='Genre', title="Top 20 High Rated Programs")

fig.show()


In [39]:
# The 20 titles with the lowest IMDB score, with a few descriptive columns.
bottom_20_movies = (
    data[['IMDB Score' , 'Title' , 'Genre' , 'year', 'Language']]
    .sort_values(['IMDB Score'])
    .head(20)
)

bottom_20_movies

Unnamed: 0,IMDB Score,Title,Genre,year,Language
0,2.5,Enter the Anime,Documentary,2019,English/Japanese
1,2.6,Dark Forces,Thriller,2020,Spanish
2,2.6,The App,Science fiction/Drama,2019,Italian
3,3.2,The Open House,Horror thriller,2018,English
4,3.4,Kaali Khuhi,Mystery,2020,Hindi
5,3.5,Drive,Action,2019,Hindi
6,3.7,Leyla Everlasting,Comedy,2020,Turkish
7,3.7,The Last Days of American Crime,Heist film/Thriller,2020,English
8,3.9,Paradox,Musical/Western/Fantasy,2018,English
9,4.1,Sardar Ka Grandson,Comedy,2021,Hindi


In [40]:
# Dot plot of the 20 lowest-scoring titles, coloured by genre.
# hover_data takes column names, so pass the names rather than a DataFrame slice.
fig = px.scatter(bottom_20_movies, x='IMDB Score', y='Title', hover_data=['Genre', 'year', 'Language'], title='20 Lowest Rated Programs', color='Genre')

fig.show()