In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Inches, Pt

# Load the dataset
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('./data/netflix_titles.csv')
# Preview the first five rows to check how the columns were parsed.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [9]:
# Express but not realistic cleaning: drop every row that has at least one
# missing value. The explicit copy gives an independent frame, so the column
# assignments made here and in later cells never write into a view.
df = df.dropna().copy()

# Convert duration to minutes for movies.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series (not a one-column DataFrame) for the assignment
# - .loc: select rows and column in one step instead of chained indexing
# Rows that are not movies are absent from the extracted Series and therefore
# get NaN in duration_min after index alignment.
df['duration_min'] = (
    df.loc[df['type'] == 'Movie', 'duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [10]:
# --- Derived columns (added to df in this order: year_added, then genre) ---

# Year in which each title was added.
# format='mixed' is passed to work around an error in the dataset's date_added values.
df['year_added'] = pd.to_datetime(df['date_added'], format='mixed').dt.year

# Genre = the text before the first comma of listed_in
df['genre'] = df['listed_in'].str.split(',').str.get(0)

# --- Summary tables used by the charts ---

# Count of movies vs. TV shows
content_types = df['type'].value_counts()

# Top 10 values of the country column
# NOTE(review): a row whose country string lists several countries would be
# counted as its own value here - confirm whether that is intended.
top_countries = df['country'].value_counts().nlargest(n=10)

# Number of titles added per year, in chronological order
content_by_year = df['year_added'].value_counts().sort_index()

# Top 10 genres
top_genres = df['genre'].value_counts().nlargest(n=10)

In [11]:
# Set the Seaborn style
# Called once at cell level, before any chart is drawn, with the "whitegrid" style.
sns.set(style="whitegrid")

# Function to create and save a chart
def create_chart(data, title, filename, kind='bar', figsize=(10, 6)):
    """
    Plot a pandas Series as a bar or line chart and save it as an image file.

    Parameters:
    data (pandas.Series): Values to plot. The index is used for the x axis and
        the values for the y axis.
    title (str): Title drawn above the chart.
    filename (str): Name of the image file (e.g. 'chart.png'); it is written
        inside './img/slides/'.
    kind (str): 'bar' (default) or 'line'.
    figsize (tuple): Figure size in inches as (width, height).

    Returns:
    None

    Raises:
    ValueError: If kind is neither 'bar' nor 'line'.
    """
    # Validate first so an unsupported kind fails with a clear message before
    # any figure is opened (previously `ax` was left unbound).
    if kind not in ('bar', 'line'):
        raise ValueError(f"Unsupported chart kind {kind!r}; expected 'bar' or 'line'")

    # Make sure the target folder exists so savefig does not fail on a fresh checkout
    output_dir = './img/slides/'
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=figsize)
    try:
        if kind == 'bar':
            sns.barplot(x=data.index, y=data.values, ax=ax)
        else:
            sns.lineplot(x=data.index, y=data.values, ax=ax)

        ax.set_title(title, fontsize=16)
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()
        fig.savefig(output_dir + filename)
    finally:
        # Always release the figure, even if plotting or saving raised
        plt.close(fig)

# Create charts: one (data, title, file name, kind) entry per summary table
chart_specs = [
    (content_types, 'Movies vs. TV Shows', 'content_types.png', 'bar'),
    (top_countries, 'Top 10 Countries Producing Netflix Content', 'top_countries.png', 'bar'),
    (content_by_year, 'Content Added by Year', 'content_by_year.png', 'line'),
    (top_genres, 'Top 10 Genres on Netflix', 'top_genres.png', 'bar'),
]
for chart_data, chart_title, chart_file, chart_kind in chart_specs:
    create_chart(chart_data, chart_title, chart_file, kind=chart_kind)

In [14]:
# Rows for movies only; shared by the two duration charts below
movies = df[df['type'] == 'Movie']

# Distribution of movie durations
# The regex is a raw string ('\d' in a normal literal is an invalid escape
# sequence), and expand=False makes str.extract return a Series, so histplot
# receives one plain variable rather than a one-column DataFrame (which
# seaborn treats as wide-form data keyed by the column label 0).
movie_minutes = movies['duration'].str.extract(r'(\d+)', expand=False).astype(int)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(movie_minutes, kde=True, ax=ax)
ax.set_title('Distribution of Movie Durations')
ax.set_xlabel('Duration (minutes)')
fig.savefig('./img/slides/movie_duration_dist.png')
plt.close(fig)

# Relationship between release year and duration for movies
# (reads the duration_min column created in the cleaning cell)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=movies, x='release_year', y='duration_min', ax=ax)
ax.set_title('Movie Duration vs. Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Duration (minutes)')
fig.savefig('./img/slides/duration_vs_year.png')
plt.close(fig)

# Content ratings distribution, most frequent rating first
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='rating', order=df['rating'].value_counts().index, ax=ax)
ax.set_title('Distribution of Content Ratings')
ax.set_xlabel('Count')
ax.set_ylabel('Rating')
fig.savefig('./img/slides/rating_distribution.png')
plt.close(fig)

In [16]:
# Create a new presentation
# Presentation() is called without a template path; the slide helpers below
# add their slides to this module-level object.
prs = Presentation()

# Define custom colors
# RGBColor arguments are the red, green and blue components.
DARK_BLUE = RGBColor(0, 32, 96)  # applied to slide titles
LIGHT_BLUE = RGBColor(197, 217, 241)  # applied to the slide-master background

# Function to add a title slide
def add_title_slide(title, subtitle):
    """
    Append a slide built from layout 0 to the module-level presentation `prs`
    and fill in its title and subtitle.

    Parameters:
    title (str): Text for the slide's title shape.
    subtitle (str): Text for the slide's placeholder with key 1.

    Returns:
    None
    """
    new_slide = prs.slides.add_slide(prs.slide_layouts[0])

    # Fill in the texts first, then style the title
    heading = new_slide.shapes.title
    heading.text = title
    new_slide.placeholders[1].text = subtitle

    # Customize title font (first paragraph of the title only)
    heading_font = heading.text_frame.paragraphs[0].font
    heading_font.color.rgb = DARK_BLUE
    heading_font.size = Pt(44)

# Function to add a content slide with an image
def add_content_slide(title, image_path):
    """
    Append a slide built from layout 5 to the module-level presentation `prs`,
    set its title and place one image below the title.

    The image is scaled to fit inside an 8 x 5.5 inch area (1 inch from the
    left edge, 1.5 inches from the top) while keeping its aspect ratio, and is
    centered horizontally inside that area.

    Parameters:
    title (str): Text for the slide title.
    image_path (str): Path of the image file to place on the slide.

    Returns:
    None
    """
    slide_layout = prs.slide_layouts[5]
    slide = prs.slides.add_slide(slide_layout)
    slide.shapes.title.text = title

    # Area reserved for the image
    left = Inches(1)
    top = Inches(1.5)
    max_width = Inches(8)
    max_height = Inches(5.5)

    # Insert at native size, then scale uniformly so the picture fits the area.
    # Passing both width and height to add_picture would force the image to
    # exactly 8 x 5.5 in and stretch any image with a different aspect ratio.
    picture = slide.shapes.add_picture(image_path, left, top)
    scale = min(max_width / picture.width, max_height / picture.height)
    picture.width = int(picture.width * scale)
    picture.height = int(picture.height * scale)
    picture.left = int(left + (max_width - picture.width) // 2)

    # Customize title font (first paragraph of the title only)
    title_shape = slide.shapes.title
    title_shape.text_frame.paragraphs[0].font.color.rgb = DARK_BLUE
    title_shape.text_frame.paragraphs[0].font.size = Pt(32)

# Add slides: the title slide first, then one slide per saved chart image
add_title_slide("Netflix Content Analysis", "Insights from the Netflix Movies and TV Shows Dataset")

chart_slides = [
    ("Movies vs. TV Shows", "./img/slides/content_types.png"),
    ("Top 10 Countries Producing Netflix Content", "./img/slides/top_countries.png"),
    ("Content Added by Year", "./img/slides/content_by_year.png"),
    ("Top 10 Genres on Netflix", "./img/slides/top_genres.png"),
    ("Distribution of Movie Durations", "./img/slides/movie_duration_dist.png"),
    ("Movie Duration vs. Release Year", "./img/slides/duration_vs_year.png"),
    ("Distribution of Content Ratings", "./img/slides/rating_distribution.png"),
]
for slide_title, chart_path in chart_slides:
    add_content_slide(slide_title, chart_path)

# Save the presentation
prs.save('Netflix_Content_Analysis.pptx')
print("Presentation saved as Netflix_Content_Analysis.pptx")

Presentation saved as Netflix_Content_Analysis.pptx


In [17]:
# Customize the slide master
# Works on the same `prs` object that already holds the slides added above,
# and saves the result under a second file name at the end of the cell.
slide_master = prs.slide_master

# Set background color
# Switch the master background to a solid fill, then color it LIGHT_BLUE.
background = slide_master.background
fill = background.fill
fill.solid()
fill.fore_color.rgb = LIGHT_BLUE

# Customize title style
# Targets the first paragraph of placeholder 0 on layout 0 (the layout that
# add_title_slide uses).
# NOTE(review): add_title_slide and add_content_slide already set the title
# color and size on each slide - confirm this layout-level font has any
# visible effect on the existing slides.
title_style = slide_master.slide_layouts[0].placeholders[0].text_frame.paragraphs[0].font
title_style.name = 'Arial'
title_style.size = Pt(44)
title_style.color.rgb = DARK_BLUE

# Customize body text style
# Targets the first paragraph of placeholder 1 on layout 1.
# NOTE(review): no slide added in this notebook uses layout 1 (the helpers use
# layouts 0 and 5), so this presumably changes nothing in the saved deck - verify.
body_style = slide_master.slide_layouts[1].placeholders[1].text_frame.paragraphs[0].font
body_style.name = 'Arial'
body_style.size = Pt(18)
body_style.color.rgb = RGBColor(0, 0, 0)

# Save the updated presentation
# Written to a new file; the file saved by the previous cell is left as it was.
prs.save('Netflix_Content_Analysis_Themed.pptx')
print("Themed presentation saved as Netflix_Content_Analysis_Themed.pptx")

Themed presentation saved as Netflix_Content_Analysis_Themed.pptx
