# Netflix Content Trends Analysis

This notebook analyzes Movies vs. TV Shows over the years, genre trends, and country-wise contributions using the attached Netflix dataset.

In [None]:
# 1) Setup: imports and display settings
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Pandas display limits (column width and number of columns shown).
for option_name, option_value in {
    'display.max_colwidth': 120,
    'display.max_columns': 50,
}.items():
    pd.set_option(option_name, option_value)

# Seaborn theme applied to the figures drawn below.
sns.set_theme(style='whitegrid', context='notebook')

# Input file location and the seed for NumPy's global random state.
DATA_PATH = 'Netflix-Dataset.csv'  # ensure this file is in the working directory
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# 2) Read the CSV into the raw frame, report its size, and preview it
df_raw = pd.read_csv(DATA_PATH)
n_rows, n_cols = df_raw.shape
print('Shape:', (n_rows, n_cols))
# Last expression of the cell: rendered as the first three rows.
df_raw.head(3)

In [None]:
# 3) Standardize schema, parse dates, and derive features
def _strip_preserving_missing(values):
    """Strip surrounding whitespace from non-missing entries of a Series.

    Missing entries are returned unchanged. A plain ``astype(str)`` would turn
    them into the literal text 'nan', which hides them from ``isna()`` /
    ``dropna()`` and makes 'nan' show up as a real category downstream.
    """
    return values.where(values.isna(), values.astype(str).str.strip())


# Work on a copy so the raw frame stays available for re-runs.
df = df_raw.copy()

# Rename columns to snake_case used throughout
col_map = {
    'Show_Id':'show_id',
    'Category':'category',            # Movie or TV Show
    'Title':'title',
    'Director':'director',
    'Cast':'cast',
    'Country':'country',              # may contain multiple comma-separated countries
    'Release_Date':'release_date',    # string date like "August 14, 2020"
    'Rating':'rating',
    'Duration':'duration',            # e.g., '93 min' or '4 Seasons'
    'Type':'genres',                  # comma-separated genres/categories
    'Description':'description'
}
df = df.rename(columns=col_map)

# Parse release_date -> release_year.
# Whitespace is stripped first: with errors='coerce', strings that do not match
# the inferred date format (e.g. values with a leading space) become NaT.
df['release_date_parsed'] = pd.to_datetime(
    _strip_preserving_missing(df['release_date']), errors='coerce'
)
df['release_year'] = df['release_date_parsed'].dt.year

# Clean text fields (strip whitespace) while keeping missing values missing,
# so the null overview and the later dropna(subset=...) calls see them.
for c in ['category','title','director','cast','country','rating','duration','genres','description']:
    if c in df.columns:
        df[c] = _strip_preserving_missing(df[c])

# Duration parsing: minutes for Movies, seasons for TV Shows
def extract_int(s):
    """Return the first run of digits found in ``str(s)`` as an int.

    Returns ``np.nan`` when the text contains no digits.
    """
    found = re.search(r'(\d+)', str(s))
    if found is None:
        return np.nan
    return int(found.group(1))

# Parse the leading number of every duration once, then route it by category:
# minutes for rows labelled 'movie', seasons for rows labelled 'tv show'.
duration_value = df['duration'].apply(extract_int)
category_lower = df['category'].str.lower()
df['movie_minutes'] = np.where(category_lower == 'movie', duration_value, np.nan)
df['tv_seasons']    = np.where(category_lower == 'tv show', duration_value, np.nan)

# Drop exact duplicates by unique id if present, else by key fields
if 'show_id' in df.columns:
    df = df.drop_duplicates(subset=['show_id'])
else:
    df = df.drop_duplicates(subset=['title','release_year','category','duration','rating'])

print('Post-clean shape:', df.shape)
# Preview only the columns that exist: the branch above allows 'show_id' to be
# absent, so indexing it unconditionally would raise KeyError in that case.
preview_cols = ['show_id','category','title','release_year','rating','duration','genres','country']
df[[col for col in preview_cols if col in df.columns]].head(5)

In [None]:
# 4) Data quality overview: nulls and basic counts
# Per-column null totals, largest first, shown as a single wide row.
null_totals = df.isna().sum()
nulls = null_totals.sort_values(ascending=False)
display(nulls.to_frame('null_count').T)

# Earliest and latest release year present in the data.
earliest_year = int(df['release_year'].min())
latest_year = int(df['release_year'].max())
print('\nYear range (min, max):', earliest_year, latest_year)

print('Category distribution:')
category_totals = df['category'].value_counts()
display(category_totals.to_frame('count'))

In [None]:
# 5) Movies vs TV Shows over the years
# Count titles per (release_year, category), ignoring rows without a year.
titles_with_year = df.dropna(subset=['release_year'])
year_category_sizes = titles_with_year.groupby(['release_year','category']).size()
movies_tv_by_year = (
    year_category_sizes
      .reset_index(name='count')
      .sort_values(['release_year','category'])
)
display(movies_tv_by_year.head())

# One line per category, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12,6))
sns.lineplot(
    data=movies_tv_by_year,
    x='release_year',
    y='count',
    hue='category',
    marker='o',
    ax=ax,
)
ax.set_title('Movies vs TV Shows by Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

# Save summary
movies_tv_by_year.to_csv('out_movies_tv_by_year.csv', index=False)

In [None]:
# 6) Genre distribution (overall)
# Split comma-separated genres and explode to one row per (title, genre).
# The lambda splits the already-filtered frame instead of reaching back to `df`.
genres_exploded = (
    df.dropna(subset=['genres'])
      .assign(genre_list=lambda frame: frame['genres'].str.split(','))
      .explode('genre_list')
)
genres_exploded['genre_list'] = genres_exploded['genre_list'].str.strip()

# Drop tokens that are missing or blank after stripping (e.g. produced by a
# leading or trailing comma) so they are not counted as a genre.
has_genre = genres_exploded['genre_list'].notna() & (genres_exploded['genre_list'] != '')
genres_exploded = genres_exploded[has_genre.to_numpy()]

genre_counts = genres_exploded['genre_list'].value_counts().reset_index()
genre_counts.columns = ['genre','count']
display(genre_counts.head(20))

plt.figure(figsize=(12,7))
sns.barplot(data=genre_counts.head(15), x='count', y='genre', palette='viridis')
plt.title('Top Genres on Netflix (Overall)')
plt.xlabel('Count')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()

genre_counts.to_csv('out_genre_counts_overall.csv', index=False)

In [None]:
# 7) Genre popularity over time
# Count exploded genre rows per (release_year, genre), skipping rows with no year.
genre_rows_with_year = genres_exploded.dropna(subset=['release_year'])
genres_time = (
    genre_rows_with_year
      .groupby(['release_year','genre_list'])
      .size()
      .reset_index(name='count')
)

# Focus on top 10 overall genres to simplify trend chart
top_genres = genre_counts['genre'].head(10)
is_top_genre = genres_time['genre_list'].isin(top_genres)
genres_time_top = genres_time[is_top_genre]

fig, ax = plt.subplots(figsize=(14,7))
sns.lineplot(
    data=genres_time_top,
    x='release_year',
    y='count',
    hue='genre_list',
    marker='o',
    ax=ax,
)
ax.set_title('Genre Popularity Over Time (Top 10 Genres)')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

genres_time_top.to_csv('out_genre_popularity_over_time.csv', index=False)

In [None]:
# 8) Country-wise contributions (overall)
# Split on comma as there can be multiple countries, then explode to one row
# per (title, country). The lambda splits the already-filtered frame instead of
# reaching back to `df`.
countries_exploded = (
    df.dropna(subset=['country'])
      .assign(country_list=lambda frame: frame['country'].str.split(','))
      .explode('country_list')
)
countries_exploded['country_list'] = countries_exploded['country_list'].str.strip()

# Drop tokens that are missing or blank after stripping (e.g. produced by a
# leading or trailing comma) so they are not counted as a country.
has_country = countries_exploded['country_list'].notna() & (countries_exploded['country_list'] != '')
countries_exploded = countries_exploded[has_country.to_numpy()]

country_counts = countries_exploded['country_list'].value_counts().reset_index()
country_counts.columns = ['country','count']
display(country_counts.head(20))

plt.figure(figsize=(12,7))
sns.barplot(data=country_counts.head(15), x='count', y='country', palette='mako')
plt.title('Top Countries Contributing to Netflix Catalog (Overall)')
plt.xlabel('Count')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

country_counts.to_csv('out_country_counts_overall.csv', index=False)

In [None]:
# 9) Country contributions over time
# Count exploded country rows per (release_year, country), skipping rows with no year.
country_rows_with_year = countries_exploded.dropna(subset=['release_year'])
countries_time = (
    country_rows_with_year
      .groupby(['release_year','country_list'])
      .size()
      .reset_index(name='count')
)

# Focus on top 10 overall countries
top_countries = country_counts['country'].head(10)
is_top_country = countries_time['country_list'].isin(top_countries)
countries_time_top = countries_time[is_top_country]

fig, ax = plt.subplots(figsize=(14,7))
sns.lineplot(
    data=countries_time_top,
    x='release_year',
    y='count',
    hue='country_list',
    marker='o',
    ax=ax,
)
ax.set_title('Country Contributions Over Time (Top 10 Countries)')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

countries_time_top.to_csv('out_country_contributions_over_time.csv', index=False)

In [None]:
# 10) Rating distribution by category
# Keep only rows where both rating and category are present, then count each pair.
rated_titles = df[['rating','category']].dropna()
rating_category_sizes = rated_titles.groupby(['rating','category']).size()
rating_counts = (
    rating_category_sizes
      .reset_index(name='count')
      .sort_values('count', ascending=False)
)
display(rating_counts.head(20))

# Horizontal bars: one bar per rating, split by category.
fig, ax = plt.subplots(figsize=(12,7))
sns.barplot(data=rating_counts, y='rating', x='count', hue='category', ax=ax)
ax.set_title('Content Rating Distribution by Category')
ax.set_xlabel('Count')
ax.set_ylabel('Rating')
fig.tight_layout()
plt.show()

rating_counts.to_csv('out_rating_distribution_by_category.csv', index=False)

In [None]:
# 11) Duration analysis: Movies (minutes) and TV Shows (seasons)
fig, axes = plt.subplots(1, 2, figsize=(14,6))
ax_movies, ax_tv = axes

# Left panel: histogram of the non-missing movie runtimes.
movie_minutes = df['movie_minutes'].dropna()
sns.histplot(movie_minutes, bins=30, ax=ax_movies, color='#4C78A8')
ax_movies.set_title('Movie Durations (Minutes)')
ax_movies.set_xlabel('Minutes')

# Right panel: number of TV shows per season count, ordered by season count.
tv_season_counts = df['tv_seasons'].value_counts().sort_index()
season_labels = tv_season_counts.index.astype(int)
sns.barplot(x=season_labels, y=tv_season_counts.values, ax=ax_tv, color='#F58518')
ax_tv.set_title('TV Show Seasons Count Distribution')
ax_tv.set_xlabel('Seasons')
ax_tv.set_ylabel('Count')

plt.tight_layout()
plt.show()

# One-row table with the mean and median of each duration measure.
minutes_col = df['movie_minutes']
seasons_col = df['tv_seasons']
summary_duration = pd.DataFrame(
    [[minutes_col.mean(), minutes_col.median(), seasons_col.mean(), seasons_col.median()]],
    columns=['movie_minutes_mean', 'movie_minutes_median', 'tv_seasons_mean', 'tv_seasons_median'],
)
display(summary_duration)
summary_duration.to_csv('out_duration_summary.csv', index=False)

In [None]:
# 12) Strategic recommendations scaffold (data-driven notes)
# Prints a short text report: a recent-years category trend, the five most
# frequent genres and countries, and a fixed list of recommendation notes.

print('Strategic Insights (auto-generated scaffold):')

# Trend window: the latest year in the data and up to four years before it,
# never starting earlier than the earliest year present.
release_years = df['release_year']
last_year = int(release_years.max())
start_trend_year = max(int(release_years.min()), last_year - 4)
in_trend_window = (release_years >= start_trend_year) & (release_years <= last_year)
trend = (
    df[in_trend_window]
      .groupby(['release_year','category'])
      .size()
      .reset_index(name='count')
      .sort_values(['category','release_year'])
)
print(f"- Movies vs TV Shows trend {start_trend_year}-{last_year}:\n", trend.to_string(index=False))

# Top 5 genres overall: split, explode, strip, and count in a single chain.
top5_genres = (
    df.dropna(subset=['genres'])
      .assign(genre_list=df['genres'].str.split(','))
      .explode('genre_list')['genre_list']
      .str.strip()
      .value_counts()
      .head(5)
)
print('\n- Top 5 genres overall:')
print(top5_genres.to_string())

# Top 5 contributing countries overall, computed the same way.
top5_countries = (
    df.dropna(subset=['country'])
      .assign(country_list=df['country'].str.split(','))
      .explode('country_list')['country_list']
      .str.strip()
      .value_counts()
      .head(5)
)
print('\n- Top 5 contributing countries overall:')
print(top5_countries.to_string())

# Simple recommendation notes (edit as needed)
print('\nRecommendations:')
recommendation_notes = (
    '* Maintain or expand investment in the top 3-5 genres that show sustained or rising counts over recent years.',
    '* Identify underrepresented but growing genres and consider targeted originals to capture emerging demand.',
    '* Deepen partnerships in top-producing countries; pilot originals in fast-growing, mid-tier countries to diversify supply.',
    '* Balance movie lengths around the median while experimenting with high-engagement runtimes; calibrate TV seasons for completion rates.',
    '* Use rating distribution by category to position family vs mature content based on market and regional demand signals.',
)
for note in recommendation_notes:
    print(note)