# 02 — Voting Exploration

**Tweede Kamer Open Data · Speech-to-Vote Project**

Explore voting data: votes per party, Voor/Tegen rates, party cohesion.
**Baseline 1**: How predictable are votes from party alone? (majority-class baseline)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Directory holding the processed parquet tables.
DATA_DIR = Path("../data/processed")

# Read the four tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
stemming, besluit, agendapunt, fractie = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        DATA_DIR / "Stemming.parquet",
        DATA_DIR / "Besluit.parquet",
        DATA_DIR / "Agendapunt.parquet",
        DATA_DIR / "Fractie.parquet",
    )
)

# Row counts as a quick sanity check of what was loaded.
print(
    f"Stemming: {len(stemming):,} votes",
    f"Besluit: {len(besluit):,} decisions",
    f"Agendapunt: {len(agendapunt):,} agenda items",
    sep="\n",
)

## Vote distribution (Voor / Tegen / Niet deelgenomen)

In [None]:
# Tally how often each value of 'Soort' occurs, print the table, and show
# the same counts as a bar chart.
vote_counts = stemming['Soort'].value_counts()
print(vote_counts)

vote_counts.plot.bar(title='Vote distribution')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.tight_layout()
plt.show()

## Join votes with Besluit and Agendapunt

In [None]:
# Keep only the Besluit rows that are referenced by at least one vote.
besluit_ids = stemming['Besluit_Id'].dropna().unique()
besluit_voted = besluit.loc[besluit['Id'].isin(besluit_ids)]

# Attach the decision context to every vote. Besluit columns whose name
# clashes with a Stemming column receive the '_besluit' suffix.
votes_joined = pd.merge(
    stemming,
    besluit_voted.loc[:, ['Id', 'Agendapunt_Id', 'BesluitSoort', 'BesluitTekst', 'StemmingsSoort']],
    how='inner',
    left_on='Besluit_Id',
    right_on='Id',
    suffixes=('', '_besluit'),
)
print(f"Votes with Besluit context: {len(votes_joined):,}")

## Votes per party (ActorFractie)

In [None]:
# One row per ActorFractie, one column per 'Soort' value, plus a row total;
# parties with the most vote records come first.
party_votes = (
    votes_joined.groupby('ActorFractie')['Soort']
    .value_counts()
    .unstack(fill_value=0)
    .assign(total=lambda counts: counts.sum(axis=1))
    .sort_values('total', ascending=False)
)
party_votes.head(15)

In [None]:
# Share of 'Voor' among Voor/Tegen records per party; every other 'Soort'
# value (e.g. Niet deelgenomen) is left out for simplicity.
vt = votes_joined.loc[votes_joined['Soort'].isin(['Voor', 'Tegen'])]
vt_party = (
    vt.groupby('ActorFractie')['Soort']
    .value_counts()
    .unstack(fill_value=0)
    .assign(
        voor_pct=lambda counts: counts['Voor']
        .div(counts['Voor'] + counts['Tegen'])
        .mul(100)
        .round(1)
    )
    .sort_values('Voor', ascending=False)  # ordered by absolute number of Voor records
)
vt_party.head(15)

## Baseline 1: Party-only prediction

**Question**: If we always predict the majority vote of the party, what accuracy do we get?

This is the lower bound that any speech-based model must beat — it measures how much party membership alone explains.

In [None]:
# Per Besluit, per party: what did the party vote? (majority)
# For each vote record: predict party majority, compare to actual

def party_majority_vote(votes_df):
    """Return the most frequent vote per (Besluit_Id, ActorFractie) pair.

    Only records whose 'Soort' is 'Voor', 'Tegen' or 'Niet deelgenomen'
    are considered. The result is a DataFrame with the columns
    'Besluit_Id', 'ActorFractie' and 'party_majority'. When several
    values are equally frequent, the first one returned by
    ``Series.mode()`` is used.
    """
    recognised = ['Voor', 'Tegen', 'Niet deelgenomen']
    relevant = votes_df.loc[votes_df['Soort'].isin(recognised)]

    def _most_common(values):
        modes = values.mode()
        if len(modes) > 0:
            return modes.iloc[0]
        # No mode available: fall back to the group's first value.
        return values.iloc[0]

    grouped = relevant.groupby(['Besluit_Id', 'ActorFractie'])['Soort']
    per_group = grouped.agg(_most_common).reset_index()
    return per_group.rename(columns={'Soort': 'party_majority'})

# Attach the majority vote of each (Besluit_Id, ActorFractie) group to every
# individual vote record.
majority = party_majority_vote(votes_joined)
votes_with_majority = votes_joined.merge(
    majority,
    on=['Besluit_Id', 'ActorFractie'],
    how='left'
)
# Drop records for which no party majority exists. The explicit copy makes
# the result an independent frame, so columns can be added to it later
# without pandas raising SettingWithCopyWarning for writing to a slice.
votes_with_majority = votes_with_majority[votes_with_majority['party_majority'].notna()].copy()

# Accuracy of "always predict the party majority"; guarded against an empty
# frame so the division cannot fail.
correct = (votes_with_majority['Soort'] == votes_with_majority['party_majority']).sum()
total = len(votes_with_majority)
acc = correct / total * 100 if total > 0 else 0

print(f"Baseline 1 (party majority): {correct:,} / {total:,} correct = {acc:.2f}% accuracy")
print(f"\nInterpretation: Party alone explains ~{acc:.0f}% of votes. Speech text must add value beyond this.")

In [None]:
# Flag every record whose vote equals its party's majority vote, then
# summarise per party: hit rate, number of hits and number of records.
votes_with_majority['correct'] = votes_with_majority['Soort'].eq(votes_with_majority['party_majority'])
party_acc = (
    votes_with_majority.groupby('ActorFractie')['correct']
    .agg(accuracy='mean', correct='sum', total='count')
    .sort_values('total', ascending=False)
)
party_acc['accuracy_pct'] = party_acc['accuracy'].mul(100).round(1)
party_acc.head(15)

## Party cohesion over time (if we have date info)

Note: Stemming doesn't have date directly; we'd need to join via Besluit -> Agendapunt -> Activiteit.

In [None]:
# Attach a date to every vote by walking Besluit -> Agendapunt -> Activiteit.
activiteit = pd.read_parquet(DATA_DIR / "Activiteit.parquet")
if 'Datum' in activiteit.columns:
    # Each lookup table is reduced to (key, Datum) and its 'Id' is renamed to
    # the foreign-key name used on the left side before merging. Merging with
    # left_on/right_on while both frames carry an 'Id' column makes pandas
    # rename them to 'Id_x'/'Id_y', after which selecting 'Id' raises KeyError.
    # The empty right-hand suffix keeps the looked-up date under the plain
    # name 'Datum' even if the left frame has a 'Datum' column of its own.
    activiteit_dates = activiteit[['Id', 'Datum']].rename(columns={'Id': 'Activiteit_Id'})
    ap_act = agendapunt.merge(
        activiteit_dates, on='Activiteit_Id', how='left', suffixes=('_agendapunt', '')
    )

    agendapunt_dates = ap_act[['Id', 'Datum']].rename(columns={'Id': 'Agendapunt_Id'})
    besluit_ap = besluit_voted.merge(
        agendapunt_dates, on='Agendapunt_Id', how='left', suffixes=('_besluit', '')
    )

    besluit_dates = besluit_ap[['Id', 'Datum']].rename(columns={'Id': 'Besluit_Id'})
    votes_with_date = votes_joined.merge(
        besluit_dates, on='Besluit_Id', how='left', suffixes=('_stemming', '')
    )

    # Values that cannot be parsed become NaT and therefore a missing year.
    votes_with_date['year'] = pd.to_datetime(votes_with_date['Datum'], errors='coerce').dt.year
    print("Years in data:", votes_with_date['year'].dropna().astype(int).unique()[:20])
else:
    print("No Datum in Activiteit")