In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [None]:
# Read the raw YouTube ad-revenue CSV into the working DataFrame.
df = pd.read_csv(
    "C:/Users/WELCOME/Desktop/project 3/youtube_ad_revenue_dataset.csv"
)

# Show the first five rows as this cell's output.
df.head(n=5)

In [None]:
# Structural overview of the frame: size, column dtypes, missingness, duplicates.
print("Rows, Cols:", df.shape)
print("\nData types:", df.dtypes, sep="\n")

# Percentage of missing values per column, largest first.
print(
    "\nNull % per column:",
    df.isna().mean().mul(100).round(2).sort_values(ascending=False),
    sep="\n",
)

# Percentage of rows that are exact repeats of an earlier row.
print("\nDuplicate rows %:", round(df.duplicated().mean() * 100, 2), sep="\n")


In [None]:
# Parse the "date" column into datetimes. With errors="coerce", values that
# cannot be parsed become NaT instead of raising.
df["date"] = df["date"].pipe(pd.to_datetime, errors="coerce")


In [None]:
# Engagement rate = (likes + comments) per view: how reactive the audience is.
# A zero in "views" turns the division into +/-inf; those are mapped to NaN
# in the same expression so the column never holds infinities.
df["engagement_rate"] = (
    df["likes"]
    .add(df["comments"])
    .div(df["views"])
    .replace([np.inf, -np.inf], np.nan)
)


In [None]:
# Numeric columns to summarise (features plus the revenue target).
numeric_cols = [
    "views", "likes", "comments",
    "watch_time_minutes", "video_length_minutes",
    "subscribers", "engagement_rate",
    "ad_revenue_usd",
]

# Summary statistics with extra upper-tail percentiles, one row per column.
df[numeric_cols].describe(percentiles=[0.5, 0.9, 0.95, 0.99]).transpose()


In [None]:
# One scatter plot per predictor against ad revenue; each figure is shown
# before the next one is drawn.
for x_col, plot_title in [
    ("views", "Views vs Revenue"),
    ("watch_time_minutes", "Watch Time vs Revenue"),
    ("engagement_rate", "Engagement Rate vs Revenue"),
]:
    plt.scatter(df[x_col], df["ad_revenue_usd"], alpha=0.3)
    plt.xlabel(x_col)
    plt.ylabel("ad_revenue_usd")
    plt.title(plot_title)
    plt.show()


In [None]:
# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated()]

# Missing values are deliberately left as-is at this stage; imputation is
# handled later in the modeling pipeline. The de-duplicated frame is still
# written out as the cleaned snapshot.
df.to_csv(
    "C:/Users/WELCOME/Desktop/project 3/youtube_clean.csv",
    index=False,
)


In [None]:
import importlib.util
import pkgutil  # no longer used here; kept so the name stays bound for any cell that references it
import subprocess
import sys

# Install the modeling dependencies into the kernel's own interpreter when they
# are missing. importlib.util.find_spec replaces pkgutil.find_loader, which is
# deprecated since Python 3.12 and removed in 3.14. Both modules imported below
# are checked, not just sklearn.
if any(importlib.util.find_spec(mod) is None for mod in ("sklearn", "joblib")):
    # Refresh the packaging tools first, then install the packages themselves.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn", "joblib", "pandas", "numpy"])

import sklearn, joblib
print("sklearn:", sklearn.__version__, "| joblib:", joblib.__version__)


In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib

# Relative project layout: where the exported model lives ...
APP_DIR = "../app"
MODEL_PATH = os.path.join(APP_DIR, "model.pkl")

# ... and where the input data is expected.
RAW_PATH = "../data/youtube_raw.csv"      # place the original CSV here under this name
CLEAN_PATH = "../data/youtube_clean.csv"  # NOTE(review): reassigned to an absolute path in a later cell


In [None]:
CLEAN_PATH = "C:/Users/WELCOME/Desktop/project 3/youtube_clean.csv"


# 1) Load the cleaned snapshot from Step 1
df = pd.read_csv(CLEAN_PATH)

# 2) Define feature groups and target
numeric_cols = [
    "views",
    "likes",
    "comments",
    "watch_time_minutes",
    "video_length_minutes",
    "subscribers",
    "engagement_rate",
]

categorical_cols = ["category", "device", "country"]
target_col = "ad_revenue_usd"

# 3) Impute missing values
#    - numeric → median (robust to outliers)
#    - categorical → mode (most frequent)
for c in numeric_cols:
    df[c] = df[c].fillna(df[c].median())

for c in categorical_cols:
    col_modes = df[c].mode(dropna=True)
    # mode() returns an empty Series when the column has no non-null values;
    # indexing it with [0] would raise, so such a column is left untouched.
    if not col_modes.empty:
        df[c] = df[c].fillna(col_modes.iloc[0])

# 4) Basic sanity checks: remaining missing-value percentage per column
null_pct_after = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
print("=== Imputation done ===")
print("Rows, Cols:", df.shape)
print("\nAny remaining NaNs (top 10):")
print(null_pct_after.head(10))

# 5) Quick peek to confirm columns & types
print("\nColumn dtypes (trimmed):")
print(df[numeric_cols + categorical_cols + [target_col]].dtypes)

# 6) Small preview of the imputed data, shown as the cell output
#    (nothing is written to disk here)
preview = df[numeric_cols + categorical_cols + [target_col]].head(12)
preview


In [None]:
# Reload the cleaned snapshot and impute feature columns.
# (This cell repeats the load-and-impute steps of the previous cell.)
df = pd.read_csv(CLEAN_PATH)

numeric_cols = [
    "views","likes","comments",
    "watch_time_minutes","video_length_minutes",
    "subscribers","engagement_rate",
]
categorical_cols = ["category","device","country"]
target_col = "ad_revenue_usd"

# Impute: numeric -> median, categorical -> most frequent value
for c in numeric_cols:
    df[c] = df[c].fillna(df[c].median())
for c in categorical_cols:
    col_modes = df[c].mode(dropna=True)
    # mode() returns an empty Series when the column has no non-null values;
    # indexing it with [0] would raise, so such a column is left untouched.
    if not col_modes.empty:
        df[c] = df[c].fillna(col_modes.iloc[0])

# Sanity check: remaining missing-value percentage per column, and dtypes
null_pct_after = (df.isna().mean()*100).round(2).sort_values(ascending=False)
print("=== Imputation done ===")
print("Rows, Cols:", df.shape)
print("Remaining NaNs (top 10):\n", null_pct_after.head(10))
print("\nDtypes:\n", df[numeric_cols + categorical_cols + [target_col]].dtypes)


In [None]:
# === STEP 3 (Patched): split + preprocessing with imputers ===
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 0) Use YOUR cleaned file path (Windows-safe raw string)
CLEAN_PATH = r"C:\Users\WELCOME\Desktop\project 3\youtube_clean.csv"

# (optional) derive folders for the later model export
PROJ_DIR  = os.path.dirname(CLEAN_PATH)
APP_DIR   = os.path.join(PROJ_DIR, "app")
MODEL_PATH = os.path.join(APP_DIR, "model.pkl")

# 1) Load cleaned snapshot from Day 1
df = pd.read_csv(CLEAN_PATH)

# 2) Feature groups and target
numeric_cols = [
    "views","likes","comments",
    "watch_time_minutes","video_length_minutes",
    "subscribers","engagement_rate",
]
categorical_cols = ["category","device","country"]
target_col = "ad_revenue_usd"

# The pipeline below imputes FEATURES only. A missing target cannot be imputed
# there and scikit-learn regressors reject NaN in y, so rows without a target
# are excluded from X / y. `df` itself is left unchanged; this is a no-op when
# the target column has no missing values.
has_target = df[target_col].notna()
X = df.loc[has_target, numeric_cols + categorical_cols]
y = df.loc[has_target, target_col]

# 3) Split (80/20, reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# 4) Build preprocessors (impute → scale / encode)
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# OneHotEncoder param differs across sklearn versions; handle both
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)  # sklearn >= 1.2
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)         # older versions

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

# 5) Fit preprocessor on TRAIN ONLY (prevents leakage)
preprocess.fit(X_train)

# 6) Transform to confirm shapes
X_train_proc = preprocess.transform(X_train)
X_test_proc  = preprocess.transform(X_test)

# 7) Diagnostics (nice for viva)
# The fitted encoder inside the ColumnTransformer (a fitted copy, not the `ohe` object above).
fitted_ohe = preprocess.named_transformers_["cat"].named_steps["ohe"]
try:
    cat_feature_names = fitted_ohe.get_feature_names_out(categorical_cols)
except AttributeError:
    # Older sklearn releases expose only get_feature_names. Catching just
    # AttributeError keeps any other failure visible instead of masking it.
    cat_feature_names = fitted_ohe.get_feature_names(categorical_cols)

print("=== Preprocessing fitted with imputers ===")
print("Train raw shape:", X_train.shape, " -> transformed:", X_train_proc.shape)
print("Test  raw shape:", X_test.shape,  " -> transformed:", X_test_proc.shape)
print(f"Numeric cols: {len(numeric_cols)} | One-hot cols: {len(cat_feature_names)} | Total transformed cols: {X_train_proc.shape[1]}")

for col, cats in zip(categorical_cols, fitted_ohe.categories_):
    print(f"- {col}: {len(cats)} levels (e.g., {list(cats)[:5]}...)")

# Keep X_train, X_test, y_train, y_test, preprocess, MODEL_PATH in memory for next step


In [None]:
from pathlib import Path

# Project folder that receives the exported comparison table; created if absent.
PROJ_DIR = Path(r"C:\Users\WELCOME\Desktop\project 3")
PROJ_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): `results_df` is not defined in any cell visible in this notebook;
# presumably it comes from a model-comparison step that must run first - confirm.
save_path = PROJ_DIR.joinpath("model_comparison_results.csv")
results_df.to_csv(save_path, index=False)
print("Saved results to:", save_path)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from joblib import dump

# --- paths ---
CLEAN_PATH = r"C:\Users\WELCOME\Desktop\project 3\youtube_clean.csv"
PROJ_DIR   = Path(r"C:\Users\WELCOME\Desktop\project 3")
APP_DIR    = PROJ_DIR / "app"
APP_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = APP_DIR / "model.pkl"

# --- reload full data (lets us fit on 100%) ---
df_full = pd.read_csv(CLEAN_PATH)

numeric_cols = [
    "views","likes","comments",
    "watch_time_minutes","video_length_minutes",
    "subscribers","engagement_rate",
]
categorical_cols = ["category","device","country"]
target_col = "ad_revenue_usd"

# The pipeline imputes features only and regressors reject NaN in y, so rows
# without a target are excluded (no-op when the target has no missing values).
has_target = df_full[target_col].notna()
X_full = df_full.loc[has_target, numeric_cols + categorical_cols]
y_full = df_full.loc[has_target, target_col]

# --- rebuild the same model dict and pick the winner from your results ---
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001, max_iter=10000),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# NOTE(review): `results_df` is not defined in any cell visible here; this
# presumably expects the comparison table sorted best-first - confirm.
best_name = results_df.iloc[0]["model"]  # from your Step 4 table
best_model = models[best_name]

# `preprocess` is from Step 3 (imputers + scaler + OHE). Pipeline.fit re-fits
# every step in place, so passing the Step 3 object directly would overwrite
# its train-only fitted state with statistics from 100% of the data. clone()
# yields an unfitted copy with the same configuration and leaves the original
# untouched.
final_pipe = Pipeline(steps=[("prep", clone(preprocess)), ("model", best_model)])
final_pipe.fit(X_full, y_full)

dump(final_pipe, MODEL_PATH)
print("Best model:", best_name)
print("Saved model to:", MODEL_PATH)
