In [None]:
import ast
import json
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# ============================
# Custom Preprocessor
# ============================
class JSONCountTransformer(BaseEstimator, TransformerMixin):
    """Turn the serialized ``cast`` column into a single count feature.

    ``transform`` reads the ``cast`` column of ``X`` (each cell a string
    holding a serialized list) and returns an ``(n_samples, 1)`` array with
    the number of entries in each list.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the transformer learns nothing."""
        return self

    @staticmethod
    def _loads_requoted(text):
        """Parse ``text`` as JSON after rewriting ``'`` -> ``"`` and ``None`` -> ``null``.

        Kept only as a last resort: the blanket quote rewrite corrupts any
        payload that has an apostrophe inside a value.
        """
        return json.loads(text.replace("'", '"').replace("None", "null"))

    def _count_json_array(self, s):
        """Return the number of entries in the serialized list ``s``.

        Parsers are tried from strictest to loosest: real JSON, then a Python
        literal (single-quoted keys, ``None``), then the legacy quote rewrite.
        Returns 0 when ``s`` is not a string, cannot be parsed by any of them,
        or parses to something other than a list.
        """
        if not isinstance(s, str):
            return 0
        for parse in (json.loads, ast.literal_eval, self._loads_requoted):
            try:
                parsed = parse(s)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # This parser rejected the text; fall through to the next one.
                continue
            return len(parsed) if isinstance(parsed, list) else 0
        return 0

    def transform(self, X):
        """Return an ``(n_samples, 1)`` array of cast counts; ``X`` is not modified."""
        counts = X["cast"].apply(self._count_json_array)
        return counts.to_numpy().reshape(-1, 1)  # 2D, as downstream estimators expect

# ============================
# Load and Clean Data
# ============================
# Load the credits file, keep only the two serialized columns used below,
# and discard any row where either of them is missing.
df = (
    pd.read_csv("credits_with_titles.csv")
    .loc[:, ["cast", "crew"]]
    .dropna()
)

# Compute crew size category for training
def bin_crew_size(s):
    """Map a serialized crew list to a size category.

    Parsers are tried from strictest to loosest: real JSON, then a Python
    literal (single-quoted keys, ``None``), then the legacy rewrite of
    ``'`` -> ``"`` and ``None`` -> ``null``. The rewrite alone corrupts any
    payload that has an apostrophe inside a value, which previously turned
    such rows into NaN.

    Args:
        s: String holding a serialized list.

    Returns:
        0 for 0-10 entries, 1 for 11-30 entries, 2 for more than 30 entries.
        A value that parses to something other than a list counts as zero
        entries. ``np.nan`` when ``s`` is not a string or cannot be parsed.
    """
    if not isinstance(s, str):
        return np.nan

    def _loads_requoted(text):
        # Last-resort parser kept for inputs the two strict parsers reject.
        return json.loads(text.replace("'", '"').replace("None", "null"))

    for parse in (json.loads, ast.literal_eval, _loads_requoted):
        try:
            parsed = parse(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # This parser rejected the text; try the next one.
            continue
        break
    else:
        return np.nan

    n = len(parsed) if isinstance(parsed, list) else 0
    if n <= 10:
        return 0
    if n <= 30:
        return 1
    return 2

# Attach the crew-size category, drop rows whose crew field could not be
# categorised (NaN), and extract the integer label vector.
crew_categories = df["crew"].apply(bin_crew_size)
df = df.assign(crew_size_cat=crew_categories).dropna(subset=["crew_size_cat"])
y = df["crew_size_cat"].astype(int).to_numpy()

# ============================
# Train/Test Split
# ============================
# Split first so that nothing computed below can see the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(df[["cast"]], y, test_size=0.2, random_state=42)

# ============================
# Class Weights
# ============================
# Balanced weights are derived from the training labels only; computing them
# on the full label vector would leak the test-set class distribution into
# training.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

# ============================
# Define Pipeline
# ============================
pipeline = Pipeline([
    ("json_features", JSONCountTransformer()),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weight_dict)),
])

# ============================
# Train Model
# ============================
pipeline.fit(X_train, y_train)

# ============================
# Evaluate
# ============================
# Score the held-out split: plain accuracy plus support-weighted F1.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {weighted_f1:.4f}")

# ============================
# Export Pipeline
# ============================
# Persist the fitted pipeline (feature transformer + classifier) to disk.
# NOTE(review): pickle stores JSONCountTransformer by reference, so the code
# that loads this file presumably needs the same class importable — confirm.
with open("crew_classifier_pipeline.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)


Accuracy: 0.4112
F1 Score: 0.4972
