# Movies on Netflix, Prime Video, Hulu and Disney+

In [279]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [280]:
dataset = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv')

In [281]:
genres = dataset['Genres'].str.get_dummies(',')
data = pd.concat([dataset,genres],axis=1,sort=False)
data.drop(['Unnamed: 0'],axis=1,inplace=True)
data.fillna(np.nan,inplace=True)

In [282]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [283]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16744 entries, 0 to 16743
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       16744 non-null  int64  
 1   ID               16744 non-null  int64  
 2   Title            16744 non-null  object 
 3   Year             16744 non-null  int64  
 4   Age              7354 non-null   object 
 5   IMDb             16173 non-null  float64
 6   Rotten Tomatoes  5158 non-null   object 
 7   Netflix          16744 non-null  int64  
 8   Hulu             16744 non-null  int64  
 9   Prime Video      16744 non-null  int64  
 10  Disney+          16744 non-null  int64  
 11  Type             16744 non-null  int64  
 12  Directors        16018 non-null  object 
 13  Genres           16469 non-null  object 
 14  Country          16309 non-null  object 
 15  Language         16145 non-null  object 
 16  Runtime          16152 non-null  float64
dtypes: float64(2

# 

# -------------------------------------------------------------------------------------

# 

## Data Cleaning / Preprocessing

In [284]:
#Dropping columns:
#'Type' values are all = "0" , so it is an unnecessary column

columns_to_drop = ['Type', 'Unnamed: 0']
dataset.drop(columns_to_drop, inplace=True, axis=1)

In [285]:
dataset.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


# 

### Remove rows that are null if any

In [286]:
#Before dropping null values

dataset.shape[0]

16744

In [287]:
#Remove rows with at least 1 element missing

dataset.dropna()
dataset.shape[0]

16744

In [288]:
#Remove rows with no IMDB ratings, Rotten Tomatoes ratings, Directors, Genres, or Country

dataset.dropna(subset=['IMDb', 'Directors', 'Genres', 'Country'],inplace=True)
dataset.shape[0]

15562

# 

### Remove duplicate rows if any

In [289]:
dataset.drop_duplicates(inplace=True)
dataset.shape[0]

15562

# 

### Overview check before analysis 

In [290]:
dataset.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [291]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15562 entries, 0 to 16742
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               15562 non-null  int64  
 1   Title            15562 non-null  object 
 2   Year             15562 non-null  int64  
 3   Age              7119 non-null   object 
 4   IMDb             15562 non-null  float64
 5   Rotten Tomatoes  5039 non-null   object 
 6   Netflix          15562 non-null  int64  
 7   Hulu             15562 non-null  int64  
 8   Prime Video      15562 non-null  int64  
 9   Disney+          15562 non-null  int64  
 10  Directors        15562 non-null  object 
 11  Genres           15562 non-null  object 
 12  Country          15562 non-null  object 
 13  Language         15386 non-null  object 
 14  Runtime          15388 non-null  float64
dtypes: float64(2), int64(6), object(7)
memory usage: 1.9+ MB


# -------------------------------------------------------------------------------------

# 

## Data Analysis

### Movie count percentage from streaming services

In [292]:
#Streaming services 

netflix = len(data[data['Netflix'] == 1])
hulu = len(data[data['Hulu'] == 1])
prime = len(data[data['Prime Video'] ==1])
disney = len(data[data['Disney+'] == 1])

Platform = ['Netflix','Hulu','Prime Video','Disney+']

Count = [netflix,hulu,prime,disney]

fig = px.pie(names = Platform,values=Count,
             title = 'Movie Count Of Different Streaming Services',
             color_discrete_sequence = px.colors.sequential.Darkmint)

fig.update_traces(textposition ='inside', textinfo = 'percent+label')

fig.show()

# 

### Average runtime on different streaming services

In [293]:
#First, gather the 1's for the streaming services columns
netflix_movies = data.loc[data['Netflix'] == 1].drop(['Hulu', 'Prime Video', 'Disney+', 'Type'],axis=1)
hulu_movies = data.loc[data['Hulu'] == 1].drop(['Netflix', 'Prime Video', 'Disney+', 'Type'],axis=1)
prime_video_movies = data.loc[data['Prime Video'] == 1].drop(['Netflix','Hulu', 'Disney+', 'Type'],axis=1)
disney_movies = data.loc[data['Disney+'] == 1].drop(['Netflix','Hulu', 'Prime Video', 'Type'],axis=1)


#Average runtime on different streaming services
avg_runtime = [netflix_movies['Runtime'].mean(),hulu_movies['Runtime'].mean(),prime_video_movies['Runtime'].mean(),
               disney_movies['Runtime'].mean()]

platform = ['Netflix','Hulu','Prime Video','Disney+']

runtime_ott = pd.DataFrame({'Platforms':platform,'Avg Runtime': avg_runtime})

fig = px.bar(runtime_ott,
             x = 'Platforms',
             y = 'Avg Runtime',
             color = 'Avg Runtime',
             color_continuous_scale = 'Darkmint',
             title = 'Average Runtime on different Streaming Services')

fig.show()

# 

### Number of movies with IMDb ratings 8.0+ across the different Streaming Services

In [359]:
count_imdb = [len(netflix_movies[netflix_movies['IMDb']>8]),
              len(hulu_movies[hulu_movies['IMDb']>8]),
              len(prime_video_movies[prime_video_movies['IMDb']>8]),
              len(disney_movies[disney_movies['IMDb']>8])
             ]

platform = ['Netflix','Hulu','Prime Video','Disney+']

top_rated = pd.DataFrame({'Platforms':platform,'Count':count_imdb})

fig = px.bar(top_rated,
             x = 'Platforms',
             y = 'Count',
             color = 'Count',
             color_continuous_scale = 'Darkmint',
             title = 'Movies with IMDb Ratings 8.0+ vs Streaming Services'
            )
fig.show()

# 

### Movie count by year

In [298]:
year_count = data.groupby('Year')['Title'].count()
year_movie = data.groupby('Year')[['Netflix','Hulu','Prime Video','Disney+']].sum()
year_data = pd.concat([year_count,year_movie],axis=1).reset_index().rename(columns={'Title':'Movie Count'})

fig = px.bar(year_data,
              x = 'Year',
              y = 'Movie Count',
              color = 'Movie Count',
              color_continuous_scale = 'Darkmint',
              title = 'Movie Count By Year'
             )
fig.show()

# 

### Best movie by year (IMDb rating)

In [299]:
best_movie_year = data.sort_values('IMDb',ascending=False).groupby('Year').first().reset_index()

fig = px.scatter(best_movie_year,
                 x = 'Year',
                 y = 'IMDb',
                 color_continuous_scale = 'Darkmint',
                 color = 'IMDb',
                 size = 'IMDb',
                 title = 'Best Movie Each Year According to IMDB Rating'
                )

fig.show()

### Worst movie by year (IMDb rating)

In [300]:
worst_movie_year = data.sort_values('IMDb',ascending=True).groupby('Year').first().reset_index()

fig = px.scatter(worst_movie_year,
                 x = 'Year',
                 y = 'IMDb',
                 color_continuous_scale = 'Reds',
                 color = 'IMDb',
                 size = 'IMDb',
                 title = 'Worst Movie Each Year According to IMDB Rating'
                )

fig.show()

# 

### Top genres by count

In [301]:
gen_count = data.groupby('Genres')['Title'].count()
gen_movie = data.groupby('Genres')[['Netflix','Hulu','Prime Video','Disney+']].sum()
gen_data = pd.concat([gen_count,gen_movie],axis=1).reset_index().rename(columns={'Title':'Movie Count'})
gen_data = gen_data.sort_values('Movie Count',ascending=False)[:10]

fig = px.bar(gen_data,
             x = 'Genres',
             y = 'Movie Count',
             color = 'Movie Count',
             color_continuous_scale = 'Darkmint',
             title = 'Top 10 Genres by Number of Movies'
            )

fig.show()

### Least popular genres by movie count

In [302]:
gen_count = data.groupby('Genres')['Title'].count()
gen_movie = data.groupby('Genres')[['Netflix','Hulu','Prime Video','Disney+']].sum()
gen_data = pd.concat([gen_count,gen_movie],axis=1).reset_index().rename(columns={'Title':'Movie Count'})
gen_data = gen_data.sort_values('Movie Count',ascending=True)[:10]

fig = px.bar(gen_data,
             x = 'Genres',
             y = 'Movie Count',
             color = 'Movie Count',
             color_continuous_scale = 'Reds',
             title = 'Bottom 10 Genres by Number of Movies'
            )

fig.show()

# 

### Top 10 movies by IMDb rating

In [303]:
imdb_rating = data.sort_values('IMDb',ascending=False).reset_index(drop=True).head(10)
imdb_rating.fillna("NA",inplace=True)

for x in ['Netflix','Hulu','Prime Video','Disney+']:
    imdb_rating[x].replace(1,x,inplace=True)
    imdb_rating[x].replace(0,"",inplace=True)

imdb_rating['Platform'] = imdb_rating[['Netflix','Hulu','Prime Video','Disney+']].agg("  ".join,axis=1)

fig = px.bar(imdb_rating,
             x = 'Title',
             y = 'IMDb',
             title = 'Top 10 Movies (IMDB Rating)',
             color = 'IMDb', 
             color_continuous_scale = 'Darkmint'
            )

fig.show()

# 

### Top 10 rated movies on each platform and their genre (IMDb ratings)

In [317]:
def val_sum(df,c):
    return df[c].sum(axis=0)

In [318]:
val_counts = []
dfs = [dataset]
cols = ['Netflix','Hulu','Prime Video','Disney+']

for x in dfs:
    for y in cols:
        val_counts.append(val_sum(x,y))

In [353]:
def burstchart(dataframe,platform,c):
    dataframe=dataframe.loc[dataframe[platform] == 1]
    dataframe=dataframe.sort_values(by='IMDb', ascending=False)
    rating = dataframe[0:10]
    fig = px.sunburst(
    rating,
    path = ['Title','Genres'],
    values = 'IMDb',
    color = 'IMDb',
    color_continuous_scale = c)
    fig.show()

In [355]:
#Netflix
burstchart(dataset,'Netflix','reds')

In [356]:
#Hulu
burstchart(dataset,'Hulu','greens')

In [357]:
#Prime Video
burstchart(dataset,'Prime Video','oryel')

In [358]:
#Disney+
burstchart(dataset,'Disney+','dense')

# 