# Exercises - Netflix Data Analysis

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud

## Import Data

In [None]:
# Read the CSV into `df`, then print its summary statistics and its shape.
file_path = "./exercises-netflix-data-analysis-dataset.csv"
df = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="latin 1",
)
print(
    "\nDescribe:\n\n",
    df.describe(),
)
print(
    "\nShape:\n",
    df.shape,
)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head(n=5)

In [None]:
# Number of rows per category, most frequent first.
df.loc[:, 'category'].value_counts()

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Remove the four columns that the remaining cells do not use.
df = df.drop(
    columns=[
        "season_title",
        "runtime",
        "weekly_views",
        "episode_launch_details",
    ]
)

In [None]:
def uniqueColdata(colname, data=None):
    """Print the distinct-value count and the value frequencies of one column.

    Args:
        colname: Label of the column to summarise.
        data: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so existing ``uniqueColdata(colname)`` calls keep
            working unchanged.

    Returns:
        None. The summary is written to stdout.
    """
    frame = df if data is None else data
    column = frame[colname]
    # unique() keeps NaN as a distinct value while value_counts() drops it by
    # default, so colcount can be one larger than the number of rows listed.
    colcount = len(column.unique())
    coluniqueData = column.value_counts()
    print("colname : ", colname, "colcount : ",colcount ,"coluniqueData : ", coluniqueData)

In [None]:
# Summarise the distinct values of the cumulative-weeks column.
uniqueColdata(colname="cumulative_weeks_in_top_10")

In [None]:
# Show the dtype pandas assigned to each remaining column.
df.dtypes

## Data Analysis

In [None]:
# The 20 rows with the largest weekly_hours_viewed. Rows are ranked
# individually; no grouping by title is applied.
top_20_most_viewed = df[['show_title', 'weekly_hours_viewed']].nlargest(
    n=20,
    columns='weekly_hours_viewed',
)
print(top_20_most_viewed)

In [None]:
# Count the rows in each category and show each category's share as a pie.
category_counts = df['category'].value_counts()
plt.figure(figsize=(10, 6))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=90)
# The slices are row counts per category (cumulative_weeks_in_top_10 is not
# read here), so the title describes entries rather than cumulative weeks.
plt.title('Distribution of Entries by Category')
plt.show()

In [None]:
# For every title take its maximum cumulative_weeks_in_top_10, then keep the
# ten largest values and draw them as a pie.
top_show_counts = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
)
plt.figure(figsize=(12, 8))
plt.pie(
    top_show_counts,
    labels=top_show_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Top 10 Shows with Highest Cumulative Weeks in Top 10')
plt.show()

In [None]:
# Ten titles with the largest maximum cumulative_weeks_in_top_10.
top_shows = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

fig = go.Figure()

for show in top_shows['show_title']:
    show_data = df[df['show_title'] == show]
    # Putting every trace in the same stackgroup makes plotly stack the series
    # and fill beneath them. Without it this figure was a plain line chart
    # despite its "Stacked Area Chart" title.
    fig.add_trace(
        go.Scatter(
            x=show_data['week'],
            y=show_data['cumulative_weeks_in_top_10'],
            mode='lines',
            stackgroup='one',
            name=show,
        )
    )

fig.update_layout(title='Stacked Area Chart of Cumulative Weeks in Top 10 by Show Title', xaxis_title='Week', yaxis_title='Cumulative Weeks in Top 10', showlegend=True)
fig.show()

In [None]:
# Ten titles with the largest maximum cumulative_weeks_in_top_10.
top_shows = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

fig = go.Figure()
for i, show in enumerate(top_shows['show_title']):
    show_data = df.loc[df['show_title'] == show]
    # Every trace except the final one uses fill='tonexty'; the final trace
    # uses fill='tozeroy'.
    if i < len(top_shows) - 1:
        fill_mode = 'tonexty'
    else:
        fill_mode = 'tozeroy'
    fig.add_trace(
        go.Scatter(
            x=show_data['week'],
            y=show_data['cumulative_weeks_in_top_10'],
            mode='lines',
            fill=fill_mode,
            line=dict(width=0.5),
            name=show,
        )
    )

fig.update_layout(
    title='Ribbon Chart of Cumulative Weeks in Top 10 by Show Title',
    xaxis_title='Week',
    yaxis_title='Cumulative Weeks in Top 10',
    showlegend=True,
)
fig.show()

In [None]:
# Ten titles with the largest maximum cumulative_weeks_in_top_10, drawn as a
# treemap whose tile sizes are those maxima.
top_shows = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
fig = px.treemap(
    top_shows,
    path=['show_title'],
    values='cumulative_weeks_in_top_10',
    title='Treemap of Cumulative Weeks in Top 10 by Show Title',
)
fig.show()

In [None]:
# Ten titles with the largest maximum cumulative_weeks_in_top_10, drawn as a
# funnel with one stage per title.
top_shows = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
fig = go.Figure()

fig.add_trace(
    go.Funnel(
        name='Cumulative Weeks in Top 10',
        y=top_shows['show_title'],
        x=top_shows['cumulative_weeks_in_top_10'],
        textinfo="value+percent initial",
    )
)

fig.update_layout(
    title='Funnel Chart of Cumulative Weeks in Top 10 by Show Title',
    xaxis_title='Cumulative Weeks in Top 10',
    yaxis_title='Show Title',
)
fig.show()

In [None]:
# Count the rows in each category and draw the counts as a funnel.
category_counts = df['category'].value_counts()
fig2 = go.Figure()

# The plotted values are row counts per category. No staggered-launch column
# is read in this cell, so the trace name and title describe the counts.
fig2.add_trace(
    go.Funnel(
        name='Entries per Category',
        y=category_counts.index,
        x=category_counts,
        textinfo="value+percent initial",
    )
)

fig2.update_layout(title='Funnel Chart of Entry Counts by Category', xaxis_title='Count', yaxis_title='Category')
fig2.show()

In [None]:
# Ten titles with the largest maximum cumulative_weeks_in_top_10, one line
# trace per title.
top_shows = (
    df.groupby('show_title')['cumulative_weeks_in_top_10']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
fig1 = go.Figure()

for show in top_shows['show_title']:
    show_data = df.loc[df['show_title'] == show]
    fig1.add_trace(
        go.Scatter(
            x=show_data['week'],
            y=show_data['cumulative_weeks_in_top_10'],
            mode='lines',
            name=show,
        )
    )

fig1.update_layout(
    title='Line Chart of Cumulative Weeks in Top 10 by Show Title',
    xaxis_title='Week',
    yaxis_title='Cumulative Weeks in Top 10',
    showlegend=True,
)
fig1.show()

In [None]:
def plot_line_chart(df, column_name):
    """Show a plotly figure with one line per category for a single column.

    Args:
        df: DataFrame with 'category' and 'week' columns plus ``column_name``.
        column_name: Label of the column plotted on the y-axis against 'week'.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()

    categories = df['category'].unique()
    for category in categories:
        in_category = df['category'] == category
        rows = df[in_category]
        trace = go.Scatter(
            x=rows['week'],
            y=rows[column_name],
            mode='lines',
            name=category,
        )
        fig.add_trace(trace)

    layout = dict(
        title=f'Line Chart of {column_name} by Category',
        xaxis_title='Week',
        yaxis_title=column_name,
        showlegend=True,
    )
    fig.update_layout(**layout)
    fig.show()

# Weekly rank over time, one line per category.
plot_line_chart(df=df, column_name='weekly_rank')

In [None]:
# Draw the per-category line chart for each of these columns in turn.
# NOTE(review): 'show_title' looks like a text column; confirm that a line
# chart of it is intended.
for column_name in ('weekly_hours_viewed', 'cumulative_weeks_in_top_10', 'show_title'):
    plot_line_chart(df, column_name)

In [None]:
def plot_line_charts(df, parameter1, parameter2):
    """Show one figure per remaining column, plotted against two x columns.

    Every column of ``df`` other than 'week', 'category', ``parameter1`` and
    ``parameter2`` gets its own figure. Within a figure each category
    contributes two traces: the column against ``parameter1`` (default line
    style) and the column against ``parameter2`` (dashed line).

    Args:
        df: DataFrame with a 'category' column plus the two parameter columns.
        parameter1: Label of the column used as x for the first trace.
        parameter2: Label of the column used as x for the dashed trace.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    skipped = ('week', 'category', parameter1, parameter2)

    for column_name in df.columns:
        if column_name in skipped:
            continue

        fig = go.Figure()

        for category in df['category'].unique():
            category_data = df[df['category'] == category]
            y_values = category_data[column_name]

            first_trace = go.Scatter(
                x=category_data[parameter1],
                y=y_values,
                mode='lines',
                name=f'{parameter1} - {category}',
            )
            second_trace = go.Scatter(
                x=category_data[parameter2],
                y=y_values,
                mode='lines',
                name=f'{parameter2} - {category}',
                line=dict(dash='dash'),
            )
            fig.add_trace(first_trace)
            fig.add_trace(second_trace)

        fig.update_layout(
            title=f'Line Chart of {column_name} by {parameter1} and {parameter2}',
            xaxis_title=f'{parameter1} / {parameter2}',
            yaxis_title=column_name,
            showlegend=True,
        )

        fig.show()

In [None]:
# Plot every remaining column against weekly rank and weekly hours viewed.
plot_line_charts(df=df, parameter1='weekly_rank', parameter2='weekly_hours_viewed')

In [None]:
# Run the paired line charts for each (parameter1, parameter2) combination,
# in the listed order.
for parameter1, parameter2 in (
    ('weekly_rank', 'weekly_hours_viewed'),
    ('category', 'weekly_rank'),
    ('show_title', 'weekly_hours_viewed'),
    ('show_title', 'cumulative_weeks_in_top_10'),
):
    plot_line_charts(df, parameter1, parameter2)

In [None]:
def generate_word_cloud(df, column_name):
    """Render a word cloud built from the values of one column.

    Args:
        df: DataFrame holding the column.
        column_name: Label of the column whose values supply the words.

    Returns:
        None. The image is displayed through matplotlib.
    """
    # Drop missing values first: astype(str) would otherwise turn each NaN
    # into the literal word "nan" and add it to the cloud.
    values = df[column_name].dropna().astype(str)
    text = ' '.join(values)

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {column_name}')
    plt.show()

# Word cloud of the words that appear in the show titles.
generate_word_cloud(df=df, column_name='show_title')