In [1]:
# 3_model_baseline.ipynb cell — baseline regression pipeline
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error
import joblib

# Resolve the project root: if the kernel's working directory is the
# "notebooks" folder, the root is its parent; otherwise it is the working
# directory itself.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Project sub-directories, all derived from ROOT.
PROC = ROOT.joinpath("data", "processed")
OUT = ROOT.joinpath("outputs", "figures")  # only defined here, not created
MODELS = ROOT.joinpath("models")

# Create the models directory (and any missing parents); no error if it exists.
MODELS.mkdir(parents=True, exist_ok=True)

# Load the model-ready table and derive the design matrix X and target y.
df = pd.read_csv(PROC / "anime_model_ready.csv")

target = "score"

# The target is mandatory; fail early with a readable message instead of a
# bare KeyError further down.
if target not in df.columns:
    raise KeyError(f"'{target}' column not found in anime_model_ready.csv")

# Define features (only the ones actually present in the file are used).
# numeric
features = [c for c in ["episodes", "members"] if c in df.columns]
# categorical
cat_feats = [c for c in ["type", "main_genre"] if c in df.columns]

if not (features or cat_feats):
    raise ValueError(
        "None of the expected feature columns (episodes, members, type, "
        "main_genre) are present in anime_model_ready.csv"
    )

# Rows without a target cannot be used for fitting or scoring (the regressor
# rejects NaN in y), so drop them up front and say so when it happens.
n_missing_target = int(df[target].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with missing '{target}'")

# fillna for simplicity: 0 for numeric features, "Unknown" for categoricals.
# A per-column dict fills only the listed columns and returns a new frame, so
# nothing is assigned into a filtered copy.
fill_values = {**dict.fromkeys(features, 0), **dict.fromkeys(cat_feats, "Unknown")}
df = df.dropna(subset=[target]).fillna(fill_values)

X = df[features + cat_feats]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing per column group: numeric columns are standardized and
# categorical columns are one-hot encoded. handle_unknown="ignore" lets the
# encoder accept categories at predict time that were absent during fit.
num_transform = Pipeline(steps=[("scaler", StandardScaler())])
cat_transform = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

numeric_branch = ("num", num_transform, features)
categorical_branch = ("cat", cat_transform, cat_feats)
pre = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

# Full estimator: preprocessing followed by a Ridge regressor.
model = Pipeline(steps=[("pre", pre), ("reg", Ridge(alpha=1.0))])

# Fit on the training split, then predict the held-out split.
print("Training model with features:", features + cat_feats)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Report each metric on the held-out split, in the same order as before.
for label, metric in (("R²:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, pred))

# Save model and a small results CSV.
# Each destination is named once so the dump and the log line cannot drift apart.
model_path = MODELS / "ridge_baseline.joblib"
preds_path = PROC / "baseline_preds.csv"

joblib.dump(model, model_path)

# y_test still carries the row labels of the source frame; reset them so the
# table is a plain positional pairing of true and predicted values.
preds = pd.DataFrame({"y_true": y_test, "y_pred": pred}).reset_index(drop=True)
preds.to_csv(preds_path, index=False)

# Print the location relative to the project root so the saved cell output
# does not embed a machine-specific absolute path (user name, home directory).
print("Saved model to", model_path.relative_to(ROOT))


Training model with features: ['episodes', 'members', 'type', 'main_genre']
R²: 0.3026624853786617
MAE: 0.6526825716598407
Saved model to /Users/dylandole-vy/projects/anime-globalization-fresh/models/ridge_baseline.joblib
