In [101]:
import numpy as np
import pandas as pd
import plotly.express as px
from textblob import TextBlob

# Read the Netflix titles catalogue from the mounted Google Drive folder.
netflix_csv = '/content/drive/MyDrive/Data_Science_Projects/netflix_titles.csv'
df = pd.read_csv(netflix_csv)

# Cell output: (number of rows, number of columns).
df.shape


(8807, 12)

In [4]:
# Preview the first three rows to get a feel for the columns and their values.
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [5]:
# List the column names available for the analysis below.
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

# Distribution of Content:
Looking at the distribution of content ratings on Netflix:

In [17]:
# Tally how many titles carry each rating, then draw the shares as a pie chart.
rating_counts = df.groupby('rating').size().reset_index(name='counts')

pieChart = px.pie(
    rating_counts,
    names='rating',
    values='counts',
    title='Distribution of Content Ratings',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
pieChart.show()

Most of the content on Netflix is intended for mature and adult audiences.

# Top 5 Actors and Directors

Let's see the top 5 directors on Netflix, ranked by the number of titles they have directed.

In [39]:
# Count how many titles each director is credited on and chart the five with the most.
# Titles without a director get a placeholder so they can be dropped after counting.
df['director'] = df['director'].fillna('No Director Specified')

# A title may list several comma-separated directors ("A, B"). Give every name its
# own row and strip the space left behind by the comma; without the strip, "B" and
# " B" would be tallied as two different people.
filtered_directors = (
    df['director']
    .str.split(',')
    .explode()
    .str.strip()
    .to_frame(name='Director')
)

directors = filtered_directors.groupby(['Director']).size().reset_index(name='Total Content')
directors = directors[directors.Director != 'No Director Specified']
directors = directors.sort_values(by=['Total Content'], ascending=False)

# Keep the five largest counts, then re-sort ascending so the bars are plotted
# from the smallest to the largest value.
directorsTop5 = directors.head(5)
directorsTop5 = directorsTop5.sort_values(by=['Total Content'])

fig = px.bar(directorsTop5, x='Total Content', y='Director', title='Top 5 Directors on Netflix')
fig.show()

# Top 5 actors on Netflix

Let's also have a look at the actors as we looked at the directors on Netflix.

In [98]:
# Count how many titles each actor appears in and chart the five with the most.
# Titles without a cast list get a placeholder so they can be dropped after counting.
df['cast'] = df['cast'].fillna('No Cast Specified')

# The cast column holds comma-separated names ("A, B, C"). Give every name its own
# row and strip the space left behind by the comma; without the strip, "B" and " B"
# would be tallied as two different people.
filtered_cast = (
    df['cast']
    .str.split(',')
    .explode()
    .str.strip()
    .to_frame(name='Actor')
)

actors = filtered_cast.groupby(['Actor']).size().reset_index(name="Total Content")
actors = actors[actors['Actor'] != 'No Cast Specified']
actors = actors.sort_values(by=['Total Content'], ascending=False)

# Keep the five largest counts, then re-sort ascending so the bars are plotted
# from the smallest to the largest value.
actorsTop5 = actors.head(5)
actorsTop5 = actorsTop5.sort_values(by=['Total Content'])

fig2 = px.bar(actorsTop5, x='Total Content', y='Actor', title='Top 5 Actors on Netflix')
# Show the actors figure built in this cell.
fig2.show()

# Analysing Content on Netflix
Let's analyse the trend of production over the years on Netflix.

In [99]:
# Number of titles per release year, split by content type (Movie / TV Show).
yearly_counts = (
    df[['type', 'release_year']]
    .rename(columns={"type": "Content Type", "release_year": "Release Year"})
    .groupby(['Release Year', "Content Type"])
    .size()
    .reset_index(name="Total Content")
)

# Restrict the chart to release years from 2010 onwards.
recent_counts = yearly_counts[yearly_counts['Release Year'] >= 2010]

fig3 = px.line(
    recent_counts,
    x="Release Year",
    y="Total Content",
    color='Content Type',
    title="Trend of content produced over the years after 2010 on Netflix",
)
fig3.show()

The line graph above illustrates that there has been a decline in the production of content for both movies and TV shows since around 2018.

# Analysis of Sentiment of Content on Netflix

In [104]:
# Peek at the first few descriptions; this is the text the sentiment analysis is run on.
df['description'].head()

0    As her father nears the end of his life, filmm...
1    After crossing paths at a party, a Cape Town t...
2    To protect his family from a powerful drug lor...
3    Feuds, flirtations and toilet talk go down amo...
4    In a city of coaching centers known to train I...
Name: description, dtype: object

In [105]:
def classify_sentiment(text):
    """Label a description by the sign of its TextBlob polarity score.

    Returns 'Positive' for a polarity above zero, 'Negative' for a polarity
    below zero, and 'Neutral' when the polarity is exactly zero.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Build a fresh frame holding the release year, the description and one sentiment
# label per row. Each label is computed from that row's own description only, and
# .assign() returns a new frame, so nothing is written into a slice of df.
dfx = (
    df[["release_year", "description"]]
    .rename(columns={'release_year': 'Release Year'})
    .assign(Sentiment=lambda frame: frame['description'].map(classify_sentiment))
)

In [106]:
# Display dfx to inspect the sentiment label assigned to each description.
dfx

Unnamed: 0,Release Year,description,Sentiment
0,2020,"As her father nears the end of his life, filmm...",Positive
1,2021,"After crossing paths at a party, a Cape Town t...",Neutral
2,2021,To protect his family from a powerful drug lor...,Negative
3,2021,"Feuds, flirtations and toilet talk go down amo...",Negative
4,2021,In a city of coaching centers known to train I...,Neutral
...,...,...,...
8802,2007,"A political cartoonist, a crime reporter and a...",Negative
8803,2018,"While living alone in a spooky town, a young g...",Positive
8804,2009,Looking to survive in a world taken over by zo...,Neutral
8805,2006,"Dragged from civilian life, a former superhero...",Positive


In [107]:
# Count titles per release year and sentiment label. The result is stored under its
# own name rather than overwriting dfx, so re-running this cell always aggregates
# the per-title rows instead of an already-aggregated frame.
sentiment_by_year = (
    dfx.groupby(['Release Year', 'Sentiment'])
    .size()
    .reset_index(name='Total Content')
)

# Restrict the chart to release years from 2010 onwards.
sentiment_by_year = sentiment_by_year[sentiment_by_year['Release Year'] >= 2010]

fig4 = px.bar(
    sentiment_by_year,
    x="Release Year",
    y="Total Content",
    color='Sentiment',
    title="Sentiment of content on Netflix",
)
fig4.show()

The graph above shows that, overall, there is more positive content than either neutral or negative content across the years.