# Import necessary libraries


In [None]:
import pandas as pd 
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import calendar
from collections import Counter
from plotly.subplots import make_subplots

# Read data from a CSV file into a pandas DataFrame and handle defects


In [None]:
# Load the raw Netflix catalogue; the CSV is expected next to the notebook.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

# Summarise dtypes and non-null counts so incomplete columns are visible up front.
df.info()

Check and display null rates for each column in the DataFrame

In [None]:
# Share of missing values per column (in percent), computed for all columns at once.
null_rates = df.isna().sum() / len(df) * 100

# Report only the columns that actually contain missing values.
for i, null_rate in null_rates.items():
    if null_rate > 0:
        print(f"{i} null rate: {round(null_rate, 2)}%")


Dealing with missing data
* Replace NaN values in specific columns with 'No Data'
* Drop rows with NaN values in any column
* Drop duplicate rows from the DataFrame

In [None]:
# Columns where a gap is informative on its own: keep the row and label the gap.
# The result is assigned back to `df` rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary column
# selection, triggers the chained-assignment warning in pandas 2.x and leaves
# `df` untouched once Copy-on-Write is enabled.
placeholder_cols = ['country', 'cast', 'director']
df[placeholder_cols] = df[placeholder_cols].fillna('No Data')

# Rows that still contain a NaN in any other column are discarded.
df.dropna(inplace=True)

# Remove rows that are exact duplicates of an earlier row.
df.drop_duplicates(inplace=True)

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Dealing with dates, seasons and durations

In [None]:
# Parse `date_added` ("Month day, Year"). Surrounding whitespace is stripped first
# so that an otherwise valid date is not coerced to NaT by the strict format.
# If the column is already datetime (cell re-run) it is passed through unchanged.
date_raw = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_raw):
    date_raw = date_raw.str.strip()
df["date_added"] = pd.to_datetime(date_raw, format='%B %d, %Y', errors='coerce')

# Unparseable dates become 0 so the columns can stay integer; 0 means "unknown".
df['year_added'] = df['date_added'].dt.year.fillna(0).astype(int)
df['month_added'] = df['date_added'].dt.month.fillna(0).astype(int)

# Split the textual `duration` ("90 min" / "2 Seasons") into two string columns:
#   season_count - leading number for entries mentioning "Season", else ""
#   duration     - leading number for every other entry, else ""
# The guard keeps a re-run of this cell from wiping `season_count`: after the
# first run `duration` no longer contains the word "Season".
if 'season_count' not in df.columns:
    is_seasonal = df['duration'].str.contains("Season", regex=False)
    leading_number = df['duration'].str.split(" ").str[0]
    df['season_count'] = leading_number.where(is_seasonal, "")
    df['duration'] = leading_number.where(~is_seasonal, "")
df.head()

In [None]:
# Number of titles per content type, as a two-column frame (type, count).
grouped_data = (
    df.groupby('type')
      .size()
      .reset_index(name='count')
)

# Donut chart of the share of each content type.
fig = px.pie(
    data_frame=grouped_data,
    names='type',
    values='count',
    title='Distribution of Netflix Content Types',
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_layout(
    margin={"l": 20, "r": 20, "t": 60, "b": 20},
    showlegend=True,
)
fig.show()

# Distribution of Netflix content types
This code generates a pie chart using Plotly to illustrate the distribution of Netflix content types. It begins by grouping the DataFrame by 'type' and calculating the count for each type. The pie chart is then created with specific settings such as title, colors, and layout adjustments. Finally, the interactive plot is displayed.

In [None]:
col = "year_added"

# year_added == 0 marks titles whose date could not be parsed; leave them out.
d1 = df[(df[col] > 0) & (df["type"] == "TV Show")]
d2 = df[(df[col] > 0) & (df["type"] == "Movie")]

# Titles per year with each year's share of the total, ordered chronologically.
# The share is computed on the whole column at once; the previous per-row lambda
# re-summed the full column for every row.
tv_shows = d1[col].value_counts().reset_index()
tv_shows['percent'] = 100 * tv_shows['count'] / tv_shows['count'].sum()
tv_shows = tv_shows.sort_values(col)

movies = d2[col].value_counts().reset_index()
movies['percent'] = 100 * movies['count'] / movies['count'].sum()
movies = movies.sort_values(col)


Trend of content added over the years for TV Shows and Movies.

In [None]:
# One line per content type: absolute number of titles added in each year.
trace1 = go.Scatter(
    name="TV Shows",
    x=tv_shows[col],
    y=tv_shows["count"],
    marker={"color": "#a678de"},
)
trace2 = go.Scatter(
    name="Movies",
    x=movies[col],
    y=movies["count"],
    marker={"color": "#6ad49b"},
)

# Horizontal legend placed above the plotting area.
layout = go.Layout(
    title="Content added over the years",
    xaxis={"title": 'Year'},
    yaxis={"title": 'Content'},
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

Percentage of TV shows and movies added over the years. 

In [None]:
# Same series as the count chart, expressed as each year's share of its type's total.
trace3 = go.Scatter(
    name="TV Shows",
    x=tv_shows[col],
    y=tv_shows["percent"],
    line={"color": "#a678de"},
)
trace4 = go.Scatter(
    name="Movies",
    x=movies[col],
    y=movies["percent"],
    line={"color": "#6ad49b"},
)

layout = go.Layout(
    title="Percentage of Content added over the years",
    xaxis={"title": 'Year'},
    yaxis={"title": 'Percentage'},
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)
fig = go.Figure(data=[trace3, trace4], layout=layout)
fig.show()

This code determines the year in which the maximum number of TV shows and movies were added, respectively.

In [None]:
# Year with the highest number of additions for each content type.
# A single `.loc[row, col]` lookup reads the scalar directly; chaining
# `.loc[row][col]` first materialises the whole row as one Series, which mixes
# the integer columns with the float `percent` column and upcasts the year.
max_tv_shows_year = tv_shows.loc[tv_shows['count'].idxmax(), col]
max_movies_year = movies.loc[movies['count'].idxmax(), col]

print(f"The maximum number of TV Shows was added in {int(max_tv_shows_year)}.")
print(f"The maximum number of Movies was added in {int(max_movies_year)}.")


Distribution of release years for added TV shows and movies, with counts represented by different bars for each content type.

In [None]:
col = "release_year"

# Split the catalogue by content type, keeping only rows with a positive release year.
d1 = df[(df[col] > 0) & (df["type"] == "TV Show")]
d2 = df[(df[col] > 0) & (df["type"] == "Movie")]

# Titles per release year plus each year's share, ordered chronologically.
# The share is computed column-wise instead of re-summing the column per row.
tv_shows = d1[col].value_counts().reset_index()
tv_shows['percent'] = 100 * tv_shows['count'] / tv_shows['count'].sum()
tv_shows = tv_shows.sort_values(col)

movies = d2[col].value_counts().reset_index()
movies['percent'] = 100 * movies['count'] / movies['count'].sum()
movies = movies.sort_values(col)

# Bars per release year, one trace per content type.
trace1 = go.Bar(x=tv_shows[col], y=tv_shows["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=movies[col], y=movies["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Release year of added contents", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

Titles and release years of the 15 oldest movies (sorted by release year, ascending).

In [None]:
# Movies are the rows that kept a non-empty `duration`; list the 15 with the
# earliest release year.
movie_rows = df[df['duration'] != ""]
result = (
    movie_rows
    .sort_values(by="release_year", ascending=True)
    .loc[:, ['title', 'release_year']]
    .head(15)
)
result

Titles and release years of the 15 oldest TV shows (sorted by release year, ascending).

In [None]:
# TV shows are the rows with a non-empty `season_count`; list the 15 with the
# earliest release year.
show_rows = df[df['season_count'] != ""]
result = (
    show_rows
    .sort_values(by="release_year", ascending=True)
    .loc[:, ['title', 'release_year']]
    .head(15)
)
result

Displays the distribution of added contents per month

In [None]:
col = 'month_added'

# month_added == 0 marks titles whose date could not be parsed; leave them out.
d1 = df[(df[col] > 0) & (df["type"] == "TV Show")]
d2 = df[(df[col] > 0) & (df["type"] == "Movie")]

# Titles per calendar month plus each month's share, ordered January to December.
# The share is computed column-wise instead of re-summing the column per row.
tv_shows = d1[col].value_counts().reset_index()
tv_shows['percent'] = 100 * tv_shows['count'] / tv_shows['count'].sum()
tv_shows = tv_shows.sort_values(col)

movies = d2[col].value_counts().reset_index()
movies['percent'] = 100 * movies['count'] / movies['count'].sum()
movies = movies.sort_values(col)

trace1 = go.Bar(x=tv_shows[col], y=tv_shows["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=movies[col], y=movies["count"], name="Movies", marker=dict(color="#6ad49b"))

data = [trace1, trace2]
# Title typo fixed ("conent" -> "content").
layout = go.Layout(title="In which month, the content is added the most?", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)

fig.show()

Print the month in which the most TV shows and movies were added.

In [None]:
# Month number (1-12) with the most additions for each content type. The variable
# names still say "year" for compatibility, but here `col` is 'month_added'.
# A single `.loc[row, col]` lookup reads the scalar directly; chaining
# `.loc[row][col]` materialises the mixed-dtype row and upcasts the month to float.
max_tv_shows_year = tv_shows.loc[tv_shows['count'].idxmax(), col]
max_movies_year = movies.loc[movies['count'].idxmax(), col]

# calendar.month_name maps 1..12 to the month names.
print(f"The most of the TV Shows was added in {calendar.month_name[int(max_tv_shows_year)]}.")
print(f"The most of the Movies was added in {calendar.month_name[int(max_movies_year)]}.")

Distribution of movie durations

In [None]:
# `d2` is the Movie subset built in the month cell. Its `duration` values are
# strings (entries mentioning "Season" were blanked to ""), so `fillna(0.0)` could
# never repair a bad value: it either did nothing or would have invented
# zero-minute movies, and a blank string would crash `astype(float)`.
# Convert with `pd.to_numeric` and drop anything that is not a number instead.
x1 = pd.to_numeric(d2['duration'], errors='coerce').dropna().astype(float)

# Histogram of the durations with a fitted normal curve.
fig = ff.create_distplot([x1], ['a'], bin_size=0.7, curve_type='normal', colors=["#6ad49b"])
fig.update_layout(title_text='Distplot with Normal Distribution')
fig.show()

Distribution of the number of seasons for TV shows.

In [None]:
col = 'season_count'

# Number of TV shows per season count (`d1` is the TV Show subset from the month
# cell). The share is computed column-wise instead of re-summing the column per row.
tv_shows = d1[col].value_counts().reset_index()
tv_shows['percent'] = 100 * tv_shows['count'] / tv_shows['count'].sum()

trace1 = go.Bar(x=tv_shows[col], y=tv_shows["count"], name="TV Shows", marker=dict(color="#a678de"))
data = [trace1]
layout = go.Layout(title="Seasons", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

Compare the distribution of ratings for TV shows and movies

In [None]:
col = "rating"

# Titles per rating for each content type (`d1` / `d2` are the TV Show / Movie
# subsets from the month cell), ordered alphabetically by rating label.
# The share is computed column-wise instead of re-summing the column per row.
tv_shows = d1[col].value_counts().reset_index()
tv_shows['percent'] = 100 * tv_shows['count'] / tv_shows['count'].sum()
tv_shows = tv_shows.sort_values(col)

movies = d2[col].value_counts().reset_index()
movies['percent'] = 100 * movies['count'] / movies['count'].sum()
movies = movies.sort_values(col)

trace1 = go.Bar(x=tv_shows[col], y=tv_shows["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=movies[col], y=movies["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
# The title previously read "Content added over the years", copied from an
# earlier chart; this figure shows ratings.
layout = go.Layout(title="Rating distribution of TV Shows and Movies", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

TV-MA: Mature Audience Only. Intended for adults and may be unsuitable for children under 17.

TV-14: This program contains some material that many parents would find unsuitable for children under 14 years of age.

Visualize the distribution of content categories (genres) for movies

In [None]:
col = "listed_in"

# `listed_in` holds a comma-separated genre list per title; count every genre
# once per movie and keep the 50 most frequent (`d2` is the Movie subset).
counter_list = Counter(d2[col].str.split(", ").explode().tolist()).most_common(50)

# Reverse so the most frequent genre ends up at the top of the horizontal chart.
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]

# Trace name and title previously said "TV Shows" / "Content added over the
# years" (copied from other cells); this chart is about movie genres.
trace1 = go.Bar(y=labels, x=values, orientation="h", name="Movies", marker=dict(color="#a678de"))

data = [trace1]
layout = go.Layout(title="Most common movie genres", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

Most common cast members in movies from the United States, India, the United Kingdom, Canada, Spain, and Japan

In [None]:
def country_trace(country, flag="movie"):
    """Build a horizontal bar figure of the most frequent cast members for a country.

    Reads the notebook-level ``df``. A title counts for ``country`` when the
    country name occurs (case-insensitively) anywhere in its ``country`` field;
    titles whose cast is the "No Data" placeholder are ignored.

    Parameters
    ----------
    country : str
        Country name to look for.
    flag : str, default "movie"
        "movie" keeps rows with a non-empty ``duration``; any other value keeps
        rows with a non-empty ``season_count``.

    Returns
    -------
    The Plotly Express bar figure; callers take its first trace via ``.data[0]``.
    """
    has_cast = df["cast"] != "No Data"
    in_country = (
        df['country']
        .fillna("")
        .str.lower()
        .str.contains(country.lower(), regex=False)
    )
    filtered_df = df[has_cast & in_country]

    # The column that is non-empty identifies the kind of title.
    kind_column = "duration" if flag == "movie" else "season_count"
    filtered_df = filtered_df[filtered_df[kind_column] != ""]

    # One entry per (title, cast member); keep the 25 most frequent, minus blanks.
    cast_counts = Counter(filtered_df['cast'].str.split(", ").explode().to_list())
    tags = [(name, total) for name, total in cast_counts.most_common(25) if name != ""]

    # Trailing spaces pad the tick labels away from the bars.
    labels = [name + "  " for name, _ in tags]
    values = [total for _, total in tags]

    # Reverse so the most frequent cast member is drawn at the top.
    labels_desc = labels[::-1]
    values_desc = values[::-1]

    return px.bar(
        y=labels_desc,
        x=values_desc,
        orientation="h",
        labels={'x': 'Count', 'y': 'Cast'},
        title=f'Most Common Cast Members in {country}',
        text=values_desc,
        color=values_desc,
    )

countries = ["United States", "India", "United Kingdom", "Canada", "Spain", "Japan"]

# 2 x 3 grid, one panel per country, filled row by row.
fig = make_subplots(rows=2, cols=3, subplot_titles=countries)
for i, country in enumerate(countries):
    grid_row, grid_col = divmod(i, 3)
    # Only the bar trace of the per-country figure is placed in the grid.
    fig.add_trace(country_trace(country).data[0], row=grid_row + 1, col=grid_col + 1)

fig.update_layout(showlegend=False, height=1500)

Most common cast members in TV shows from the United States and the United Kingdom

In [None]:
titles = ["United States", "United Kingdom"]

# Any flag other than "movie" makes country_trace select TV shows.
traces = [country_trace(title, flag="tv_shows") for title in titles]

# Side-by-side panels, one per country.
fig = make_subplots(rows=1, cols=2, subplot_titles=titles)
for panel, trace in enumerate(traces, start=1):
    fig.add_trace(trace.data[0], row=1, col=panel)

fig.update_layout(height=1000, showlegend=False)
fig.show()

Display the most prolific movie directors from the United States based on the amount of content they have produced

In [None]:
col = "director"

# Movies with a known director whose country field is exactly "United States".
filtered_df = df[(df[col] != "No Data") & (df["type"] == "Movie") & (df["country"] == "United States")]

# A title can list several comma-separated directors; count each one separately
# and keep the ten most frequent.
counter_list = Counter(filtered_df[col].fillna("").str.split(", ").explode().to_list()).most_common(10)
counter_list = [_ for _ in counter_list if _[0] != ""]

# Reverse so the most prolific director ends up at the top of the chart.
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]

# The trace was previously named "TV Shows" (copied from another cell); it shows movies.
trace1 = go.Bar(y=labels, x=values, orientation="h", name="Movies", marker=dict(color="orange"))

data = [trace1]
layout = go.Layout(title="Movie Directors from US with most content", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

Contents of the movie director with the most content.

In [None]:
# `counter_list` is ordered most-frequent first, so its first entry is the top director.
tag = counter_list[0][0]

# Flag titles that list this director. The comparison is made against the
# individual comma-separated names, not the raw string, so a director whose name
# merely contains `tag` as a substring is not matched by accident.
df["relevant"] = df['director'].fillna("").str.split(", ").apply(lambda names: 1 if tag in names else 0)
filtered_df = df[df["relevant"] == 1]
filtered_df[['title', 'release_year', 'listed_in', 'director']]

In [None]:
# Most frequent `listed_in` value among the titles selected in the previous cell.
genre_counts = Counter(filtered_df['listed_in'])
tag = genre_counts.most_common(1)[0][0]

# Flag every title whose genre string contains that value (case-insensitive).
needle = tag.lower()
df["relevant"] = (
    df['listed_in']
    .fillna("")
    .str.lower()
    .str.contains(needle, regex=False)
    .astype("int64")
)
filtered_df = df[df["relevant"] == 1]

# Show the first ten matches whose country field is exactly "United States".
us_only = filtered_df["country"] == "United States"
filtered_df.loc[us_only, ["title", "country", "release_year"]].head(10)