# 05 — Vote Prediction

**Model comparison: Baseline vs Model A (Party + TF-IDF)**

Temporal split: train <= 2021, val 2022, test >= 2023.

In [None]:
import pandas as pd
from pathlib import Path

from src.ml.features import load_pairs, get_train_val_test
from src.ml.models import (
    train_baseline_party,
    predict_baseline_party,
    train_model_a,
    predict_model_a,
    evaluate,
)

# Load the pairs (sample=100000) and discard rows that have no "datum"
# value before splitting into train/val/test.
df = load_pairs(sample=100_000)
df = df[df["datum"].notna()]
train, val, test = get_train_val_test(df)

# Focus on Voor/Tegen (majority of votes): apply the same filter to every
# split in one pass instead of repeating it per split.
train, val, test = (
    part[part["vote"].isin(["Voor", "Tegen"])] for part in (train, val, test)
)

# Report how many rows remain in each split after filtering.
print(f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}")

In [None]:
# One metrics row per model, collected so the models can be compared side by
# side afterwards. evaluate() is used here as returning a mapping that
# contains at least the keys 'accuracy' and 'f1_macro'.
results = []

# --- Baseline: party only -------------------------------------------------
model_b = train_baseline_party(train)
pred_b = predict_baseline_party(model_b, val)
r_b = evaluate(val["vote"].values, pred_b)
row_b = {"model": "Baseline (party)"}
row_b.update(r_b)  # metric keys follow the "model" label
results.append(row_b)
print(f"Baseline: acc={r_b['accuracy']:.3f} f1={r_b['f1_macro']:.3f}")

# --- Model A: party + TF-IDF ----------------------------------------------
model_a = train_model_a(train, max_features=3000)
pred_a = predict_model_a(model_a, val)
r_a = evaluate(val["vote"].values, pred_a)
row_a = {"model": "Model A (party+TF-IDF)"}
row_a.update(r_a)  # metric keys follow the "model" label
results.append(row_a)
print(f"Model A:  acc={r_a['accuracy']:.3f} f1={r_a['f1_macro']:.3f}")

In [None]:
# Turn the collected per-model metric rows into a table; the bare name on the
# last line makes the notebook render it as the cell output.
results_df = pd.DataFrame(data=results)
results_df

In [None]:
# Test set evaluation: score the already-fitted models on the test split.
pred_b_test = predict_baseline_party(model_b, test)
pred_a_test = predict_model_a(model_a, test)

print("Test set:")
# Each line prints whatever evaluate() returns for that model, unformatted.
metrics_b_test = evaluate(test["vote"].values, pred_b_test)
print(f"  Baseline: {metrics_b_test}")
metrics_a_test = evaluate(test["vote"].values, pred_a_test)
print(f"  Model A:  {metrics_a_test}")