# Netflix data trends using plotly

Source = https://www.kaggle.com/shivamb/netflix-shows

Description = This dataset consists of TV shows and movies available on Netflix as of 2019. The dataset was collected from Flixable, a third-party Netflix search engine.

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import ipywidgets

import plotly.graph_objs as go
import plotly.express as px

In [3]:
# Load the Netflix titles CSV into a DataFrame; the path is relative to the
# current working directory.
netflix_df = pd.read_csv('data/netflix_titles.csv')

In [39]:
## Clean data
def convert_to_list(x):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Args:
        x: A list (returned unchanged), a comma-separated string, or any
            other value (for example a missing-cell placeholder such as NaN).

    Returns:
        ``x`` itself when it is already a list; an empty list for any other
        non-string value; otherwise the whitespace-stripped pieces of ``x``
        with blank pieces dropped.
    """
    if isinstance(x, list):
        # Already converted, so applying the cleaning step twice is harmless.
        return x
    if not isinstance(x, str):
        return []
    # An empty string or a stray separator ("a,, b", "a, b,") would otherwise
    # yield '' items that get counted as a bogus blank entry downstream.
    stripped_pieces = (piece.strip() for piece in x.split(','))
    return [piece for piece in stripped_pieces if piece]
    
# Turn each comma-separated text column into a list of items.
for list_column in ('listed_in', 'country', 'cast'):
    netflix_df[list_column] = netflix_df[list_column].apply(convert_to_list)

# Parse the "date added" text into timestamps, one cell at a time.
netflix_df['date_added'] = netflix_df['date_added'].apply(pd.to_datetime)

## Analysis of most popular category (listed_in)
# Flatten every title's category list and tally the categories, most common first.
category_count = Counter(
    item for sublist in netflix_df.listed_in for item in sublist
).most_common()

# Split the (category, total) pairs into the two columns of the summary table.
category_names = [name for name, _ in category_count]
category_totals = [total for _, total in category_count]
category_df = pd.DataFrame({"type": category_names, "count": category_totals})
  
# Treemap in which each tile is one category, sized by its title count.
fig1 = px.treemap(
    category_df,
    path=['type'],
    values='count',
    title='Treemap of Type',
)
# Drop the generated hover template so the trace falls back to its default hover text.
fig1.update_traces(hovertemplate=None)
fig1.update_layout(hovermode="x")
fig1.show()

# Bar chart of the number of titles in each category.
# plotly express looks `labels` up by DataFrame column name, so the keys must
# be 'type' and 'count'; keys 'x'/'y' match no column and leave the x axis
# titled "type" instead of "category".
fig2 = px.bar(category_df,
             x='type', y='count',
             title = 'Count of Categories',
             labels = {'type': 'category',
                       'count': 'count'})
# Drop the generated hover template so the trace falls back to its default hover text.
fig2.update_traces(hovertemplate=None)
fig2.update_layout(hovermode="x")
fig2.show()

## Category by country
# Distinct country names across all titles, skipping blank names.
countries = {ctry for i in netflix_df['country'] for ctry in i if ctry != ''}

# Zero-count template holding every category seen anywhere in the dataset.
# Built once and copied per country, so each country reports all categories
# (including those with no titles) without rescanning the data per country.
category_template = {content: 0 for i in netflix_df['listed_in'] for content in i}
content_country_dict = {country: dict(category_template) for country in countries}

# Single pass over the titles: credit each of a title's categories to every
# country the title lists.
for title_countries, title_categories in zip(netflix_df['country'], netflix_df['listed_in']):
    # set() so a country repeated within one title is credited only once.
    for country in set(title_countries):
        country_counts = content_country_dict.get(country)
        if country_counts is None:
            # Blank country name that was excluded from `countries`.
            continue
        for val in title_categories:
            country_counts[val] += 1

# Long-format table with one row per (country, category) pair. It is built
# in one go because DataFrame.append was removed in pandas 2.0 and re-copies
# the whole frame on every call.
category_rows = [
    (category, count, country)
    for country, country_counts in content_country_dict.items()
    for category, count in country_counts.items()
]
a = pd.DataFrame(category_rows, columns=['category', 'count', 'country'])

# Restrict the sunburst to a fixed handful of countries.
a_list = [
    'United States',
    'India',
    'United Kingdom',
    'France',
    'Canada',
    'Japan',
    'Spain',
]
a_reduced = a[a['country'].isin(a_list)]

# Inner ring: country; outer ring: category, sized by title count.
fig3 = px.sunburst(
    a_reduced,
    path=['country', 'category'],
    values='count',
    title='Netflix Categories by Country',
)
fig3.update_traces(hovertemplate=None)
fig3.update_layout(hovermode="x")
fig3.show()

## Movie vs TV show by year
def response(change):
    """Redraw the media-type histogram for the year chosen in the dropdown.

    Args:
        change: Change notification passed in by ``rel_year.observe``. It is
            not inspected; the current selection is read from
            ``rel_year.value``.
    """
    selected_year = rel_year.value
    if selected_year == 'ALL':
        filter_df = netflix_df
    else:
        filter_df = netflix_df[netflix_df.release_year == selected_year]
    # The widget figure is `fig4`; no `fig` is defined in this script, so the
    # callback used to raise NameError on the first dropdown change.
    with fig4.batch_update():
        fig4.data[0].x = list(filter_df['type'])

# Dropdown listing 'ALL' followed by every release year, newest first.
rel_year = ipywidgets.Dropdown(
                            options=['ALL'] + sorted(list(netflix_df['release_year'].unique()), reverse=True),
                            value='ALL'
                        )
rel_description = ipywidgets.HTML('Release Year: ')

# Call `response` whenever the dropdown's selected value changes.
rel_year.observe(response, names="value")

trace = go.Histogram(x=netflix_df['type'],
                    name='Type')

# FigureWidget (rather than a plain Figure) so the callback can update it in place.
fig4 = go.FigureWidget(data=[trace],
                    layout=go.Layout(
                        title='Media Type'
                    ))

fig4.update_traces(hovertemplate=None)
fig4.update_layout(hovermode="x")
fig4.update_traces(marker_color=['green', 'red'])

container = ipywidgets.HBox([rel_description, rel_year])
# Show the dropdown row above the histogram it controls.
display(ipywidgets.VBox([container,
                fig4]))

# Create the output directory if needed; exist_ok removes the separate
# existence check.
os.makedirs('results', exist_ok=True)

# (caption, figure) pairs in the order they appear in the report.
report_sections = [
    ("Fig 1 shows the type of movie/TV show by category. The bigger the category the more movies/TV shows.", fig1),
    ("Fig 2 shows the count of categories for the Movies and TV shows.", fig2),
    ("Fig 3 shows Netflix categories by country.", fig3),
    ("Fig 4 shows the count of categories, Movies vs TV Shows.", fig4),
]

# Mode 'w' truncates a report left by an earlier run, which replaces the
# previous "scan the directory, delete, then append" sequence. The encoding
# is explicit so the output does not depend on the platform default.
with open('results/netflix_summary.html', 'w', encoding='utf-8') as f:
    f.write("""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8" />
    <title>Netflix Analysis Output</title>
</head>
<body>
    <h1>
        Netflix Analysis Output
    </h1>

    <p>
        Source = https://www.kaggle.com/shivamb/netflix-shows
    </p>

    <p>
        Description = This dataset consists of tv shows and movies available on Netflix as of 2019. The dataset is collected from Flixable which is a third-party Netflix search engine.
    </p>
""")
    for position, (caption, figure) in enumerate(report_sections):
        f.write("""
    <p>
        {}
    </p>
    """.format(caption))
        # full_html=False writes only the figure's <div>, so the file stays a
        # single HTML document instead of four nested ones. plotly.js is
        # embedded with the first figure only and reused by the rest.
        figure.write_html(f, full_html=False, include_plotlyjs=(position == 0))
    f.write("""
</body>
</html>
""")

VBox(children=(HBox(children=(HTML(value='Release Year: '), Dropdown(options=('ALL', 2020, 2019, 2018, 2017, 2…

yes
