# 01b — Data Quality Check

**Speech-to-Vote Pairs Dataset**

How many pairs? Coverage? Class balance? Distribution by party and year.

In [None]:
import pandas as pd
from pathlib import Path

# Directory holding the analysis artifacts, relative to the notebook location.
ANALYSIS_DIR = Path("../data/analysis")
pairs = pd.read_parquet(ANALYSIS_DIR.joinpath("speech_vote_pairs.parquet"))

# Headline size of the dataset, then the number of distinct values per key column.
print(f"Total speech-vote pairs: {len(pairs):,}")
for label, column in (
    ("Unique speakers", "persoon_id"),
    ("Unique parties", "fractie"),
    ("Unique besluiten", "besluit_id"),
):
    print(f"{label}: {pairs[column].nunique():,}")

## Vote distribution

In [None]:
# Import pyplot properly: `__import__('matplotlib.pyplot')` returns the
# top-level `matplotlib` package rather than the `pyplot` submodule, so
# `plt.xticks` / `plt.tight_layout` / `plt.show` would raise AttributeError.
import matplotlib.pyplot as plt

# Tally each vote label once and reuse it for both the table and the chart.
vote_counts = pairs['vote'].value_counts()
print(vote_counts)

vote_counts.plot(kind='bar', title='Vote distribution')
plt.xticks(rotation=0)  # keep the vote labels horizontal
plt.tight_layout()
plt.show()

## Class balance (Voor vs Tegen vs Niet deelgenomen)

In [None]:
# Share of each vote label, expressed as a percentage of all pairs.
pct = pairs['vote'].value_counts(normalize=True).mul(100)
print(pct.round(1))

# Labels that do not occur in the data fall back to 0 via Series.get.
voor_pct = pct.get('Voor', 0)
tegen_pct = pct.get('Tegen', 0)
niet_pct = pct.get('Niet deelgenomen', 0)
print(f"\nImbalance: Voor {voor_pct:.1f}% | Tegen {tegen_pct:.1f}% | Niet {niet_pct:.1f}%")

## Distribution by party

In [None]:
# Pair counts per party; value_counts sorts descending, so the first 15 rows
# are the parties with the most speech-vote pairs.
party_counts = pairs['fractie'].value_counts().iloc[:15]
print(party_counts)

## Distribution by year (if datum available)

In [None]:
# Pairs per calendar year; skipped when the dataset has no `datum` column.
if 'datum' not in pairs.columns:
    print("No datum column")
else:
    # errors='coerce' turns unparseable dates into NaT, which value_counts
    # then leaves out of the tally (dropna defaults to True).
    parsed_dates = pd.to_datetime(pairs['datum'], errors='coerce')
    pairs['year'] = parsed_dates.dt.year
    year_counts = pairs['year'].value_counts().sort_index()
    # Show only the 15 most recent years.
    print(year_counts.tail(15))

## Train/Val/Test splits (if available)

In [None]:
# Report the row count of each split file that exists; missing splits are
# skipped without a message.
for name in ('train', 'val', 'test'):
    split_path = ANALYSIS_DIR / f"{name}.parquet"
    if not split_path.exists():
        continue
    split_df = pd.read_parquet(split_path)
    print(f"{name}: {len(split_df):,} rows")