# Netflix Content Analysis — Individual Project
End-to-end analysis using **Python, Pandas, NumPy, Matplotlib**.

> If your CSV is stored somewhere else, update `DATA_PATH` in the first code cell. Then run the cells in order, top to bottom.

In [None]:

# Imports
import os, sys, math, textwrap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Matplotlib inline (for Jupyter)
%matplotlib inline

# Input CSV location and the directory that receives every saved figure
DATA_PATH = "netflix_titles.csv"   # Put the Kaggle CSV here
FIG_DIR = "figures"
os.makedirs(FIG_DIR, exist_ok=True)  # no error if the directory already exists

# Environment banner: interpreter / library versions and the resolved figure path
for _label, _value in (
    ("Python version:", sys.version),
    ("Pandas version:", pd.__version__),
    ("Numpy version:", np.__version__),
    ("Figures will be saved to:", os.path.abspath(FIG_DIR)),
):
    print(_label, _value)


## 1. Load the dataset

In [None]:

# read_csv raises FileNotFoundError when DATA_PATH does not point to a file
df = pd.read_csv(filepath_or_buffer=DATA_PATH, low_memory=False)
print("Shape:", df.shape)
# Last expression of the cell: preview of the first five rows
df.head(n=5)


## 2. Standardize columns & basic cleaning

In [None]:

# Column names: trimmed, lower-cased, spaces replaced by underscores
df.columns = [name.strip().lower().replace(" ", "_") for name in df.columns]

# Trim every object-dtype column. astype(str) turns missing values into the
# text "nan", so that text is mapped back to a real NaN afterwards.
text_columns = df.select_dtypes(include="object").columns
for col_name in text_columns:
    trimmed = df[col_name].astype(str).str.strip()
    df[col_name] = trimmed.replace({"nan": np.nan})

# Parse date_added (unparseable values become NaT) and derive year / month
if "date_added" in df.columns:
    added_on = pd.to_datetime(df["date_added"], errors="coerce")
    df["date_added"] = added_on
    df["year_added"] = added_on.dt.year
    df["month_added"] = added_on.dt.month

# Keep the first row for each show_id and report how many rows were dropped
if "show_id" in df.columns:
    before = len(df)
    df = df.drop_duplicates(subset=["show_id"], keep="first")
    after = len(df)
    print(f"Deduplicated by show_id: {before-after} rows removed")

# Ten columns with the most missing values (displayed as the cell result)
missing_per_column = df.isna().sum()
na = missing_per_column.sort_values(ascending=False)
na.to_frame(name="missing_count").head(10)


## 3. Helper transforms: country & genre

In [None]:

def split_explode(frame, col):
    """Return one row per comma-separated value found in ``frame[col]``.

    Rows whose ``col`` value is missing are dropped. Every remaining value is
    split on ",", each piece is stripped of surrounding whitespace, and the
    row is repeated once per piece (all other columns are copied unchanged
    and the original index label is repeated). Empty pieces, produced by
    leading, trailing or doubled commas, are discarded so they are never
    counted as a value of their own.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table; it is not modified.
    col : str
        Name of the column holding comma-separated text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``frame``.
    """
    # A boolean mask (rather than .loc with index labels) keeps the row
    # selection correct even when the index contains repeated labels.
    present = frame[frame[col].notna()].copy()
    present[col] = present[col].astype(str).str.split(",")

    exploded = present.explode(col)
    exploded[col] = exploded[col].str.strip()

    # ", France" or "Japan," would otherwise yield an empty-string entry
    return exploded[exploded[col] != ""]

# One row per (title, country); an empty frame when the column is absent
if "country" in df.columns:
    countries = split_explode(df, "country")
else:
    countries = pd.DataFrame()

# One row per (title, genre); an empty frame when the column is absent
if "listed_in" in df.columns:
    genres = split_explode(df, "listed_in")
else:
    genres = pd.DataFrame()

print("Unique countries:", 0 if countries.empty else countries["country"].nunique())
print("Unique genres:", 0 if genres.empty else genres["listed_in"].nunique())


## 4. Movies vs TV Shows

In [None]:

# Guarded like the other analysis cells so a dataset without a "type" column
# prints a message instead of raising KeyError.
if "type" in df.columns:
    # dropna=False keeps titles with a missing type as their own category
    type_counts = df["type"].value_counts(dropna=False)
    display(type_counts)

    plt.figure()
    type_counts.plot(kind="bar")
    plt.title("Count by Type")
    plt.xlabel("Type")
    plt.ylabel("Count")
    plt.tight_layout()
    # Save before show(): the figure is written to FIG_DIR/type_counts.png
    plt.savefig(os.path.join(FIG_DIR, "type_counts.png"))
    plt.show()
else:
    print("type column not available")


## 5. Content Added by Year

In [None]:

# Same guard as the insights cell: the column must exist and hold at least
# one parsed year, otherwise there is nothing to plot.
if "year_added" in df.columns and df["year_added"].notna().any():
    # groupby skips missing keys by default, so no dropna() is needed and
    # every remaining label is a whole year.
    year_counts = df.groupby("year_added").size()
    # year_added is float when any date failed to parse; integer labels read
    # as 2019 instead of 2019.0 in the table and on the x-axis.
    year_counts.index = year_counts.index.astype(int)
    display(year_counts.tail(10))

    plt.figure()
    year_counts.plot(kind="line", marker="o")
    plt.title("Titles Added by Year")
    plt.xlabel("Year Added")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, "titles_by_year_added.png"))
    plt.show()
else:
    print("year_added not available")


## 6. Top Countries

In [None]:

if countries.empty:
    print("Country data not available")
else:
    # value_counts sorts descending, so the first ten entries are the top ten
    top_countries = countries["country"].value_counts().iloc[:10]
    display(top_countries)

    plt.figure()
    top_countries.plot.bar()
    plt.xlabel("Country")
    plt.ylabel("Count")
    plt.title("Top 10 Countries by Titles")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, "top_countries.png"))
    plt.show()


## 7. Top Genres

In [None]:

if genres.empty:
    print("Genre data not available")
else:
    # value_counts sorts descending, so the first ten entries are the top ten
    top_genres = genres["listed_in"].value_counts().iloc[:10]
    display(top_genres)

    plt.figure()
    top_genres.plot.bar()
    plt.xlabel("Genre")
    plt.ylabel("Count")
    plt.title("Top 10 Genres")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, "top_genres.png"))
    plt.show()


## 8. Ratings Distribution

In [None]:

if "rating" not in df.columns:
    print("rating column not available")
else:
    # Frequency of each rating label, most common first
    rating_freq = df["rating"].value_counts()
    ratings = rating_freq.sort_values(ascending=False)
    display(ratings)

    plt.figure()
    ratings.plot.bar()
    plt.xlabel("Rating")
    plt.ylabel("Count")
    plt.title("Ratings Distribution")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, "ratings_distribution.png"))
    plt.show()


## 9. Duration (Movies) & Seasons (TV)

In [None]:

if "duration" not in df.columns or "type" not in df.columns:
    print("duration/type columns not available")
else:
    # First run of digits in each duration string, stored as a float column
    dur_num = df["duration"].str.extract(r"(\d+)")
    dur_num.columns = ["value"]
    dur_num = dur_num.astype(float)
    df["duration_value"] = dur_num["value"]

    # Case-insensitive split of the catalogue by title type
    type_lower = df["type"].str.lower()
    movies = df[type_lower == "movie"]
    shows = df[type_lower == "tv show"]

    # Movies: histogram of the extracted number (labelled as minutes)
    mv = movies["duration_value"].dropna()
    if len(mv) > 0:
        plt.figure()
        mv.hist(bins=20)
        plt.xlabel("Minutes")
        plt.ylabel("Frequency")
        plt.title("Movie Durations (minutes)")
        plt.tight_layout()
        plt.savefig(os.path.join(FIG_DIR, "movie_duration_hist.png"))
        plt.show()

    # TV shows: histogram of the extracted number (labelled as seasons)
    sv = shows["duration_value"].dropna()
    if len(sv) > 0:
        plt.figure()
        sv.hist(bins=10)
        plt.xlabel("Seasons")
        plt.ylabel("Frequency")
        plt.title("TV Show Seasons")
        plt.tight_layout()
        plt.savefig(os.path.join(FIG_DIR, "tvshow_seasons_hist.png"))
        plt.show()


## 10. Auto-generate a short insights report

In [None]:

# Markdown bullet lines, collected in order and joined into report.md below
insights = []

# 1) Movies vs TV: the dominant type and its share of all typed titles
if "type" in df.columns:
    tc = df["type"].value_counts()
    if len(tc) > 0:
        top_type = tc.idxmax()
        pct = round(tc.max() / tc.sum() * 100, 1)
        insights.append(f"- **Content mix:** {top_type} leads with ~{pct}% of total titles.")

# 2) Recent years: the year in which the most titles were added
if "year_added" in df.columns and df["year_added"].notna().any():
    yr = df["year_added"].value_counts().sort_index()
    if len(yr) > 0:
        peak_year = yr.idxmax()
        insights.append(f"- **Peak addition year:** {int(peak_year)} had the highest number of titles added.")

# 3) Top country: counted on the exploded per-country rows
if not countries.empty:
    country_freq = countries["country"].value_counts()
    top_country = country_freq.idxmax()
    top_country_count = country_freq.max()
    insights.append(f"- **Top producing country:** {top_country} (~{top_country_count} titles).")

# 4) Top genre: counted on the exploded per-genre rows
if not genres.empty:
    genre_freq = genres["listed_in"].value_counts()
    top_genre = genre_freq.idxmax()
    top_genre_count = genre_freq.max()
    insights.append(f"- **Top genre:** {top_genre} (~{top_genre_count} titles).")

# 5) Ratings: the single most frequent rating label
if "rating" in df.columns and df["rating"].notna().any():
    top_rating = df["rating"].value_counts().idxmax()
    insights.append(f"- **Most common rating:** {top_rating}.")

# Header, bullet list and footer are concatenated and written as UTF-8
report = "".join([
    "# Netflix Content Analysis — Quick Insights\n\n",
    "\n".join(insights),
    "\n\n---\nGenerated by notebook.",
])
with open("report.md", "w", encoding="utf-8") as report_file:
    report_file.write(report)

print("Report written to report.md")


## 11. Save a cleaned dataset

In [None]:

# Preferred output columns, in output order; any the frame lacks are skipped
wanted_columns = [
    "show_id", "type", "title", "director", "cast", "country",
    "date_added", "release_year", "rating", "duration", "listed_in",
    "description", "year_added", "month_added", "duration_value",
]

clean = df.copy()
core_cols = [name for name in wanted_columns if name in clean.columns]
clean = clean[core_cols]

# index=False: the row index is not written as an extra CSV column
clean.to_csv("netflix_titles_clean.csv", index=False)
print("Saved netflix_titles_clean.csv with shape:", clean.shape)
