In [4]:
import sys
import os
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# --- Path setup ---
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.data_loader as dl

# --- Load data ---
df = dl.load_artworks_enriched()

# --- Target column: was this artwork acquired? ---
df["is_acquired"] = df["dateacquired"].notna().astype(int)

# --- Feature selection ---
features = [
    "year", "is_known_artist", "is_male", "is_female",
    "material", "medium", "classification", "country", "era", "department"
]

# --- Prepare modeling DataFrame and drop rows with missing values ---
df_model = df[features + ["is_acquired"]].dropna()
print("Samples after dropna:", len(df_model))

# --- Split into features and target ---
X = df_model[features]
y = df_model["is_acquired"]

# --- Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print("Train size:", len(X_train))
print("Test size:", len(X_test))

# --- Preprocessing pipelines ---
numeric_features = ["year"]
categorical_features = ["material", "medium", "classification", "country", "era", "department"]
boolean_features = ["is_known_artist", "is_male", "is_female"]

preprocessor = ColumnTransformer(transformers=[
    ("num", SimpleImputer(strategy="median"), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("bool", "passthrough", boolean_features)
])

# --- Build pipeline ---
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# --- Train model ---
clf.fit(X_train, y_train)

# --- Predictions and evaluation ---
y_pred = clf.predict(X_test)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# --- Save outputs ---
os.makedirs("../outputs", exist_ok=True)

# Save model and evaluation inputs
joblib.dump(clf, "../outputs/classifier_pipeline.pkl")
joblib.dump(X_test, "../outputs/X_test.pkl")
joblib.dump(y_test, "../outputs/y_test.pkl")
joblib.dump(y_pred, "../outputs/y_pred.pkl")

# Save raw test data for future bias/error analysis
X_test_raw = df.loc[X_test.index]
joblib.dump(X_test_raw, "../outputs/X_test_raw.pkl")




Samples after dropna: 153400
Train size: 107380
Test size: 46020

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.73      0.79      1539
           1       0.99      1.00      0.99     44481

    accuracy                           0.99     46020
   macro avg       0.93      0.86      0.89     46020
weighted avg       0.99      0.99      0.99     46020


Confusion Matrix:

[[ 1120   419]
 [  161 44320]]


['../outputs/X_test_raw.pkl']