In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df_raw = pd.read_csv(r"C:\Users\user\Desktop\Data-Analytics\1. Netflix Movies and TV Shows\1. Dataset\netflix_titles.csv")


In [3]:
df_raw.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
2553,s2554,TV Show,The Eddy,"Damien Chazelle, Houda Benyamina, Laïla Marrak...","André Holland, Joanna Kulig, Amandla Stenberg,...","United Kingdom, Germany, United States, France","May 8, 2020",2020,TV-MA,1 Season,"International TV Shows, TV Dramas",The owner of a Paris jazz club gets tangled up...
5675,s5676,TV Show,Miss Panda & Mr. Hedgehog,,"Dong-hae Lee, Seung-ah Yoon, Jin-hyuk Choi, So...",South Korea,"December 15, 2016",2012,TV-PG,1 Season,"International TV Shows, Korean TV Shows, Roman...",When a gifted patissier with a gloomy past mee...
7311,s7312,Movie,Little Dragon Maiden,Hua Shan,"Leslie Cheung, Jing-Jing Yung, Chen Kuan-tai, ...",Hong Kong,"August 1, 2018",1983,TV-14,92 min,"Action & Adventure, International Movies, Roma...","Seeking to improve his combat skills, a young ..."
1085,s1086,Movie,Night in Paradise,Park Hoon-jung,"Um Tae-goo, Jeon Yeo-been, Cha Seoung-won, Lee...",South Korea,"April 9, 2021",2021,TV-MA,132 min,"Dramas, International Movies",Hiding out in Jeju Island following a brutal t...
1953,s1954,Movie,My Mother's Wound,Ozan Açıktan,"Ozan Güven, Meryem Uzerli, Belçim Bilgin, Okan...",,"September 25, 2020",2016,TV-MA,115 min,"Dramas, International Movies",After leaving the orphanage where he was raise...


In [4]:
df_raw.shape

(8807, 12)

In [5]:
df_raw.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
missing_values = df_raw.isnull().sum()

In [7]:
df_raw.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Fill missing values with 'Unknown'
def fill_missing_values(df):
    cols_to_fill = ['director', 'cast', 'rating', 'country']
    for col in cols_to_fill:
        df[col] = df[col].fillna('Unknown')
    return df

# Convert 'date_added' to datetime
def convert_date(df):
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
    return df

# Extract year and month from 'date_added'
def extract_date_parts(df):
    df['year_added'] = df['date_added'].dt.year
    df['month_added'] = df['date_added'].dt.month
    return df

# Pipeline
data_pipeline = Pipeline(steps=[
    ('fill_missing', FunctionTransformer(fill_missing_values)),
    ('convert_date', FunctionTransformer(convert_date)),
    ('extract_date', FunctionTransformer(extract_date_parts))
])

# Apply pipeline
df = data_pipeline.fit_transform(df_raw)


In [9]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021.0,9.0
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021.0,9.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021.0,9.0
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021.0,9.0
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021.0,9.0


In [10]:
df.shape

(8807, 14)

Scenario for 1/27 analysis (frequencies analysis)
- top director by frequencies analysis and what mostly they published and whos the cast
- top 10 director and what type the film they produced
- which country the top director released their film the most
- what year the director have direct the most film
- which director have the most duration on tv show
- which director have the most tv show and for movie also
- who is the top directors by the genres 



In [13]:

# Grouping by director and counting the frequencies
director_counts_df = df['director'].value_counts().reset_index()
director_counts_df.columns = ['director', 'count']

# Filtering out the 'Unknown' director entries
director_counts_df_filtered = director_counts_df[director_counts_df['director'] != "Unknown"]

# Getting the top director from 'df'
top_director_df = director_counts_df_filtered.iloc[0]['director']

# Filtering the dataset for the top director's movies/shows
top_director_data_df = df[df['director'] == top_director_df]

# Extracting information about what they have published and the cast
top_director_publications_df = top_director_data_df[['title', 'type', 'cast']]

top_director_df, top_director_publications_df.head()


('Rajiv Chilaka',
                                                  title   type  \
 406                       Chhota Bheem - Neeli Pahaadi  Movie   
 407                              Chhota Bheem & Ganesh  Movie   
 408                 Chhota Bheem & Krishna: Mayanagari  Movie   
 409  Chhota Bheem & Krishna: Pataliputra- City of t...  Movie   
 410                 Chhota Bheem And The Broken Amulet  Movie   
 
                                                   cast  
 406  Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jig...  
 407  Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jig...  
 408  Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jig...  
 409  Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jig...  
 410  Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jig...  )

In [14]:
# Getting the top 10 directors by frequency and analyzing their published content along with the cast

top_10_directors = director_counts_df_filtered.head(10)

# Creating a dictionary to store the analysis for each director
top_10_directors_analysis = {}

for director in top_10_directors['director']:
    # Filtering the dataset for each director's movies/shows
    director_data = df[df['director'] == director]

    # Extracting the most common type of content (Movie/TV Show) and the most common cast members
    most_common_type = director_data['type'].mode()[0]
    most_common_cast = director_data['cast'].mode()[0]

    # Storing the analysis in the dictionary
    top_10_directors_analysis[director] = {
        'Most Common Type': most_common_type,
        'Most Common Cast': most_common_cast,
        'Titles Count': director_data.shape[0]
    }

top_10_directors_analysis


{'Rajiv Chilaka': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jigna Bhardwaj, Rajesh Kava, Mousam, Swapnil',
  'Titles Count': 19},
 'Raúl Campos, Jan Suter': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Sofía Niño de Rivera',
  'Titles Count': 18},
 'Suhas Kadav': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Unknown',
  'Titles Count': 16},
 'Marcus Raboy': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Vir Das',
  'Titles Count': 16},
 'Jay Karas': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Bill Burr',
  'Titles Count': 14},
 'Cathy Garcia-Molina': {'Most Common Type': 'Movie',
  'Most Common Cast': 'Bea Alonzo, Toni Gonzaga, Angel Locsin, Shaina Magdayao, Enchong Dee, Sam Milby, Angeline Quinto, Coney Reyes',
  'Titles Count': 13},
 'Jay Chapman': {'Most Common Type': 'Movie',
  'Most Common Cast': 'D.L. Hughley',
  'Titles Count': 12},
 'Youssef Chahine': {'Most Common Type': 'Movie',
  'Most Common

In [15]:
import plotly.express as px
import pandas as pd

# Assuming you have the 'top_10_directors_analysis' dictionary from the previous analysis
top_directors_visualization_data = pd.DataFrame.from_dict(top_10_directors_analysis, orient='index')
top_directors_visualization_data.reset_index(inplace=True)
top_directors_visualization_data.rename(columns={'index': 'Director'}, inplace=True)

# Creating a bar chart with Plotly
fig = px.bar(top_directors_visualization_data, 
             x='Director', 
             y='Titles Count', 
             title='Top 10 Directors by Number of Titles on Netflix',
             color='Titles Count',
             hover_data=['Most Common Type', 'Most Common Cast'])

fig.show()


In [16]:

# Assuming you have the 'df' DataFrame and 'top_10_directors' from the previous analysis
top_directors_types_data = []
for director in top_10_directors['director']:
    director_data = df[df['director'] == director]
    # Counting the types of content (Movie/TV Show)
    types_count = director_data['type'].value_counts().to_dict()
    for content_type, count in types_count.items():
        top_directors_types_data.append({'Director': director, 'Content Type': content_type, 'Count': count})

# Creating a DataFrame from the extracted data
top_directors_types_df = pd.DataFrame(top_directors_types_data)

# Creating a bar chart with Plotly
fig = px.bar(top_directors_types_df, 
             x='Director', 
             y='Count', 
             color='Content Type',
             title='Top 10 Directors and Types of Content Produced',
             barmode='group')

fig.show()






In [17]:
import plotly.express as px
import pandas as pd

# Assuming you have the 'df' DataFrame and 'top_10_directors' from the previous analysis
top_directors_countries_data = []
for director in top_10_directors['director']:
    director_data = df[df['director'] == director]
    # Counting the countries of release
    countries_count = director_data['country'].value_counts().to_dict()
    for country, count in countries_count.items():
        top_directors_countries_data.append({'Director': director, 'Country': country, 'Count': count})

# Creating a DataFrame from the extracted data
top_directors_countries_df = pd.DataFrame(top_directors_countries_data)

# Creating a bar chart with Plotly

# Assuming you have the top_directors_countries_df DataFrame from the previous analysis
# Creating a bar chart with Plotly with adjusted bar thickness
fig = px.bar(top_directors_countries_df, 
             x='Director', 
             y='Count', 
             color='Country',
             title='Countries Where Top 10 Directors Released Their Films the Most',
             barmode='group')

# Adjusting the gap between bars to make them thicker
fig.update_layout(bargap=0.1)  # Decrease this value to make bars thicker

fig.show()







In [18]:
import plotly.express as px
import pandas as pd

# Assuming you have the top_directors_countries_df DataFrame from the previous analysis

# Initialize a figure
fig = px.choropleth()

# Add a trace for each director
for director in top_directors_countries_df['Director'].unique():
    director_data = top_directors_countries_df[top_directors_countries_df['Director'] == director]
    fig.add_trace(
        px.choropleth(
            director_data,
            locations="Country",
            locationmode='country names',
            color="Count",
            hover_name="Country",
            title=f"Film Distribution of {director}",
            color_continuous_scale=px.colors.sequential.Plasma
        ).data[0]
    )

# Set up the layout for dropdown
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=[dict(label=director,
                          method="update",
                          args=[{"visible": [director == d for d in top_directors_countries_df['Director'].unique()]}])
                    for director in top_directors_countries_df['Director'].unique()]
        )
    ]
)

# Update the layout
fig.update_layout(
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    annotations=[dict(
        text="Select Director:",
        x=0,
        xref="paper",
        y=1.1,
        yref="paper",
        align="left",
        showarrow=False
    )]
)

fig.show()


In [19]:
# Analyzing the years in which the top 10 directors directed the most films

# Preparing data for visualization
top_directors_years_data = []
for director in top_10_directors['director']:
    director_data = df[df['director'] == director]
    # Counting the release years
    years_count = director_data['release_year'].value_counts().to_dict()
    for year, count in years_count.items():
        top_directors_years_data.append({'Director': director, 'Year': year, 'Count': count})

# Creating a DataFrame from the extracted data
top_directors_years_df = pd.DataFrame(top_directors_years_data)

# Aggregating the data to find the most productive year for each director
most_productive_years = top_directors_years_df.groupby('Director')['Year'].apply(lambda x: x.mode()[0]).reset_index()

most_productive_years



Unnamed: 0,Director,Year
0,Cathy Garcia-Molina,2008
1,Jay Chapman,2012
2,Jay Karas,2012
3,Marcus Raboy,2012
4,Martin Scorsese,1967
5,Rajiv Chilaka,2009
6,"Raúl Campos, Jan Suter",2016
7,Steven Spielberg,1975
8,Suhas Kadav,2013
9,Youssef Chahine,1954


In [20]:
import plotly.express as px
import pandas as pd

# Assuming you have the 'most_productive_years' DataFrame from the previous analysis
# Adjusting the visualization: swapping the X and Y axes and adding the most productive year on each bar

# Using a bar chart to visualize the most productive years with swapped axes
fig = px.bar(most_productive_years, 
             x='Year', 
             y='Director', 
             title='Most Productive Year of Top 10 Directors',
             color='Year',
             text='Year',
             orientation='h')

# Adjusting the layout
fig.update_layout(
    xaxis_title="Most Productive Year",
    yaxis_title="Director"
)

fig.show()


In [22]:
# Analyzing which director has the most total duration for TV shows

# Filtering the dataset for TV Shows
tv_shows = df[df['type'] == 'TV Show']

# The duration column for TV Shows is typically in the format 'X Seasons'. We need to extract the numeric part.
tv_shows['duration_numeric'] = tv_shows['duration'].str.extract('(\d+)').astype(int)

# Grouping by director and summing up the duration
director_duration = tv_shows.groupby('director')['duration_numeric'].sum().reset_index()

# Finding the director with the maximum total duration
max_duration_director = director_duration.loc[director_duration['duration_numeric'].idxmax()]

max_duration_director





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



director            Unknown
duration_numeric       4348
Name: 214, dtype: object

In [23]:
import plotly.express as px
import pandas as pd

# Assuming you have the 'director_duration' DataFrame from the previous analysis
# Filter out 'Unknown' director
director_duration_filtered = director_duration[director_duration['director'] != 'Unknown']

# Sort the data to get the top directors
director_duration_sorted = director_duration_filtered.sort_values(by='duration_numeric', ascending=False).head(10)

# Creating a bar chart to visualize the top directors with the most total duration for TV shows
fig = px.bar(director_duration_sorted, 
             x='duration_numeric', 
             y='director', 
             title='Top Directors by Total Duration of TV Shows',
             labels={'duration_numeric': 'Total Seasons', 'director': 'Director'},
             orientation='h')

fig.show()


In [26]:
import plotly.express as px
import pandas as pd

# Prepare the 'tv_shows_count_filtered' DataFrame
tv_shows_count = tv_shows['director'].value_counts().reset_index()
tv_shows_count.columns = ['director', 'tv_show_count']
tv_shows_count_filtered = tv_shows_count[tv_shows_count['director'] != 'Unknown']

# Getting the top 10 directors with the most TV shows
top_10_tv_show_directors = tv_shows_count_filtered.head(10)

# Creating the Plotly visualization for the top 10 directors
fig = px.bar(top_10_tv_show_directors, 
             x='tv_show_count', 
             y='director', 
             title='Top 10 Directors with the Most TV Shows',
             labels={'tv_show_count': 'Number of TV Shows', 'director': 'Director'},
             orientation='h')

# Adjusting the X scale
fig.update_xaxes(dtick=1)

fig.show()


In [27]:
import plotly.express as px
import pandas as pd

# Prepare the 'movies_count_filtered' DataFrame
movies = df[df['type'] == 'Movie']
movies_count = movies['director'].value_counts().reset_index()
movies_count.columns = ['director', 'movie_count']
movies_count_filtered = movies_count[movies_count['director'] != 'Unknown']

# Getting the top 10 directors with the most movies
top_10_movie_directors = movies_count_filtered.head(10)

# Creating the Plotly visualization for the top 10 directors
fig = px.bar(top_10_movie_directors, 
             x='movie_count', 
             y='director', 
             title='Top 10 Directors with the Most Movies',
             labels={'movie_count': 'Number of Movies', 'director': 'Director'},
             orientation='h')

fig.show()


In [34]:
import plotly.express as px
import pandas as pd

# Re-create the 'top_directors_per_genre' DataFrame
# Splitting the 'listed_in' column as each title can be listed in multiple genres
genres = df['listed_in'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).to_frame('genre')

# Joining the genres with the original dataframe to get a row for each genre a title is listed in
df_genres = df.join(genres)

# Grouping by genre and director, and counting the number of titles
genre_director_counts = df_genres.groupby(['genre', 'director']).size().reset_index(name='count')

# Filtering out 'Unknown' directors
genre_director_counts_filtered = genre_director_counts[genre_director_counts['director'] != 'Unknown']

# Finding the top director for each genre
top_directors_per_genre = genre_director_counts_filtered.sort_values(['genre', 'count'], ascending=[True, False]).drop_duplicates('genre')

# Creating the Plotly visualization with larger size
fig = px.bar(top_directors_per_genre, 
             y='genre', 
             x='count', 
             color='director', 
             title='Top Directors in Each Genre',
             labels={'count': 'Number of Titles', 'genre': 'Genre'},
             text='director',
             orientation='h')

# Improve readability and adjust the size of the plot
fig.update_layout(yaxis={'categoryorder':'total ascending'}, height=800, width=1200)

fig.show()




