### Tutorial 3 - Per-Parliament Overview (PDF)

In this tutorial, we create one PDF overview per parliament, including:
- Mean sentiment scores per CAP_category
- Distribution of total speeches
- Avg. + median speech length 
- Word count x topic distribution
- Gini-coefficient of word counts across topics
- Spearman correlation scatterplot of topic share x mean sentiment

**1. Setup**

First, we have to set up the requirements: install and load all necessary packages.

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from scipy.stats import spearmanr
import datetime
import csv
from pathlib import Path

**2. Data Loading & Filtering**

Now, we **read and filter** the data. This code:
- loads the selected columns of the datasets and merges them into a single DataFrame.
- selects only those that were held by Regular MPs (*Members of Parliament*) and filters out the CAP categories "Mix" and "Other". 
- creates separate DataFrames for coalition and opposition party speeches.


In [4]:
# ---- 1. First, we have to increase the CSV field size limit ----
max_int = 2**31 - 1
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = max_int // 10

countries = ["AT", "BA", "BE", "BG", "CZ", "DK", "EE", "ES-CT", "ES-GA", "ES-PV", 
             "FR", "GB", "GR", "HR", "HU", "IS", "IT", "LV", 
             "NL", "NO", "PL", "PT", "RS", "SE", "SI", "TR", "UA"] #change country codes according to your available datasets

base_dir = Path().resolve()

# ---- 2. Choose what columns to read (including CAP and sentiment columns) ----
cols_to_keep = [
    "id", "date", "lang_code", "lang", "speaker_role", "speaker_MP",
    "speaker_minister", "speaker_party", "speaker_party_name", "party_status",
    "party_orientation", "speaker_id", "speaker_name", "speaker_gender",
    "speaker_birth", "word_count", "CAP_category", "sent3_category", "sent6_category", "sent_logit"
]

# ---- 3. Define dtypes to reduce memory ----
dtypes = {
    "id": str,
    "date": str,
    "lang_code": "category",
    "lang": "category",
    "speaker_role": "category",
    "speaker_MP": "category",
    "speaker_minister": "category",
    "speaker_party": "category",
    "speaker_party_name": "category",
    "party_status": "category",
    "party_orientation": "category",
    "speaker_id": "category",
    "speaker_name": "category",
    "speaker_gender": "category",
    "speaker_birth": "Int32",
    "word_count": "Int32",
    "CAP_category": "category",
    "sent3_category": "category",
    "sent6_category": "category",
    "sent_logit": "float32"
}

# ---- 4. Create lists to accumulate filtered chunks ----
all_chunks = []

for country in countries:
    file_path = base_dir / f"ParlaMint-{country}_processed_no_text.tsv"

    # --- 4.1. Read in chunks using pandas.read_csv ----
    for chunk in pd.read_csv(file_path, sep="\t", usecols=cols_to_keep,
                             dtype=dtypes, chunksize=50_000, engine="python"):
        chunk["country"] = country
        chunk["country"] = chunk["country"].astype("category")

        # ---- 4.2. Filter MPs with regular role ----
        filtered_chunk = chunk.query("speaker_MP == 'MP' and speaker_role == 'Regular'")

        # ---- 4.3. Drop rows where CAP_category or sentiment is empty ----
        filtered_chunk = filtered_chunk[
            filtered_chunk["CAP_category"].notna() & (filtered_chunk["CAP_category"] != "") &
            filtered_chunk["sent3_category"].notna() & (filtered_chunk["sent3_category"] != "") &
            filtered_chunk["sent6_category"].notna() & (filtered_chunk["sent6_category"] != "")
        ]

        # ---- 4.4. Accumulate filtered chunks ----
        if not filtered_chunk.empty:
            all_chunks.append(filtered_chunk)

# ---- 5. Concatenate all accumulated chunks into DataFrames ----
filtered_all = pd.concat(all_chunks, ignore_index=True)
del all_chunks
print("All filtered:", filtered_all.shape)

All filtered: (4565042, 21)


**Filter out** the CAP categories **"Mix" and "Other"** (because these labels aren't informative enough for the following analysis)

In [5]:
filtered_all = filtered_all[~filtered_all["CAP_category"].isin(["Mix", "Other"])]
filtered_all["CAP_category"] = filtered_all["CAP_category"].cat.remove_unused_categories()


In [6]:
# Filter for one country of your choice
country = "AT"
country_data = filtered_all[filtered_all["country"] == country].copy()

**3. PDF creation**

In [8]:
# ---- 1. Speech stats per CAP category (avg. + median speech length, total number of speeches + words & mean sentiment) ----
speech_stats = (
    country_data
    .groupby("CAP_category", observed=True)
    .agg(
        avg_speech_len=("word_count", "mean"),      # avg. words per speech
        median_speech_len=("word_count", "median"), # median words per speech
        total_speeches=("word_count", "count"),     # number of speeches
        total_words=("word_count", "sum"),          # total words spoken in that topic
        mean_sent=("sent_logit", "mean")            # average sentiment
    )
    .reset_index()
)

# ---- 1.1. Round numeric summary columns ----
speech_stats["avg_speech_len"] = speech_stats["avg_speech_len"].round(2)
speech_stats["median_speech_len"] = speech_stats["median_speech_len"].round(2)
speech_stats["mean_sent"] = speech_stats["mean_sent"].round(2)

# ---- 2. Aggreate total words and mean sentiment per CAP category ----
agg = (
    country_data
    .groupby("CAP_category", observed=True)
    .agg(mean_sent=("sent_logit", "mean"), total_words=("word_count", "sum"))
    .reset_index()
)

# ---- 2.1. Compute topic share (percentage of total words per topic) ----
if agg["total_words"].sum() > 0:
    agg["topic_share_pct"] = (agg["total_words"] / agg["total_words"].sum()) * 100
else:
    agg["topic_share_pct"] = 0.0

# ---- 2.2. Compute Spearman correlation (topic share vs. mean sentiment) ----
if len(agg) > 1:
    corr_coef, p_value = spearmanr(agg["topic_share_pct"], agg["mean_sent"])
else:
    corr_coef, p_value = (float("nan"), float("nan"))

# ---- 3. Total-level summary for cover page ----
total_speeches = int(country_data.shape[0])
total_words = int(country_data["word_count"].sum())
avg_sent = float(country_data["sent_logit"].mean()) if total_speeches > 0 else float("nan")
median_sent = float(country_data["sent_logit"].median()) if total_speeches > 0 else float("nan")
avg_len = float(country_data["word_count"].mean()) if total_speeches > 0 else float("nan")
median_len = float(country_data["word_count"].median()) if total_speeches > 0 else float("nan")

# ---- 5. Top 3 topics by total words (for the cover page) ----
top3 = agg.sort_values("total_words", ascending=False).head(3)
top3_lines = [
    f"{row['CAP_category']}: {int(row['total_words']):,} words ({row['topic_share_pct']:.1f}%)"
    for _, row in top3.iterrows()
]

# ---- Helper for formatting integers ----
def fmt_num(n):
    try:
        return f"{int(n):,}"
    except Exception:
        return str(n)

date_str = datetime.date.today().strftime("%Y-%m-%d")

# ----- Build cover page figure -----
fig_cover = plt.figure(figsize=(8.27, 11.69))  # A4 portrait
fig_cover.patch.set_facecolor("white")
ax = fig_cover.add_subplot(111)
ax.axis("off")

title = f"ParlaMint Country Analysis — {country}"
subtitle = f"Generated: {date_str}"

left_x = 0.06
y = 0.88
ax.text(left_x, y, title, fontsize=20, weight="bold", transform=fig_cover.transFigure)
ax.text(left_x, y - 0.04, subtitle, fontsize=10, color="gray", transform=fig_cover.transFigure)

# ---- Stats block ----
y0 = y - 0.12
line_height = 0.045
ax.text(left_x, y0, f"Total speeches: {fmt_num(total_speeches)}", fontsize=12, transform=fig_cover.transFigure)
ax.text(left_x, y0 - line_height, f"Total words: {fmt_num(total_words)}", fontsize=12, transform=fig_cover.transFigure)
ax.text(left_x, y0 - 2*line_height, f"Avg sentiment (sent_logit): {avg_sent:.2f}", fontsize=12, transform=fig_cover.transFigure)
ax.text(left_x, y0 - 3*line_height, f"Median sentiment: {median_sent:.2f}", fontsize=12, transform=fig_cover.transFigure)
ax.text(left_x, y0 - 4*line_height, f"Avg speech length (words): {avg_len:.1f}", fontsize=12, transform=fig_cover.transFigure)
ax.text(left_x, y0 - 5*line_height, f"Median speech length: {median_len:.1f}", fontsize=12, transform=fig_cover.transFigure)

# ---- Top topics printed on the cover ----
ax.text(left_x, y0 - 7*line_height, "Top topics (by words):", fontsize=12, weight="bold", transform=fig_cover.transFigure)
for i, line in enumerate(top3_lines):
    ax.text(left_x + 0.02, y0 - (8+i)*line_height, f"{i+1}. {line}", fontsize=11, transform=fig_cover.transFigure)

# ---- Small footer note ----
ax.text(0.06, 0.05, "Note: topic shares are based on total words spoken in each topic.", fontsize=8, color="gray", transform=fig_cover.transFigure)

# ----- 6. Create PDF and append pages (cover + plots + table) -----
pdf_path = f"{country}_speech_analysis.pdf"
with PdfPages(pdf_path) as pdf:
    # ---- 6.1. Cover page ----
    pdf.savefig(fig_cover)
    plt.close(fig_cover)

    # ---- 6.2. Barplot: total words per CAP category ----
    fig1, ax1 = plt.subplots(figsize=(12,6))
    sns.barplot(
        data=agg.sort_values("total_words", ascending=False),
        x="CAP_category",
        y="total_words",
        palette="viridis",
        ax=ax1
    )
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")
    ax1.set_ylabel("Total Word Count")
    ax1.set_xlabel("CAP Category")
    ax1.set_title(f"Total Word Counts Across Topics in {country}")
    fig1.tight_layout()
    pdf.savefig(fig1)
    plt.close(fig1)

    # ---- 6.3. Scatter: topic share vs mean sentiment ----
    fig2, ax2 = plt.subplots(figsize=(8,5))

    num_topics = len(agg["CAP_category"].unique())
    palette = sns.color_palette("hsv", num_topics)  # dynamic palette

    sns.scatterplot(
        data=agg,
        x="topic_share_pct",
        y="mean_sent",
        hue="CAP_category",
        palette=palette,
        s=100,
        ax=ax2
    )
    if not np.isnan(corr_coef):
        sns.regplot(data=agg, x="topic_share_pct", y="mean_sent", scatter=False, ci=None, color="gray", ax=ax2)

    ax2.set_title(f"{country} — Topic Share vs. Mean Sentiment")
    ax2.set_xlabel("Topic Share (%)")
    ax2.set_ylabel("Mean Sentiment (sent_logit)")

    # ---- Place legend outside the plot ----
    handles, labels = ax2.get_legend_handles_labels()
    if handles:
        ax2.legend(handles, labels, title="Topic", bbox_to_anchor=(1.05, 1), loc='upper left')

    fig2.tight_layout()
    pdf.savefig(fig2)
    plt.close(fig2)

    # ---- 7. Table: speech_stats (create display-friendly copy) ----
    table_display = speech_stats.copy()
    table_display["total_speeches"] = table_display["total_speeches"].apply(lambda x: f"{int(x):,}")
    table_display["total_words"] = table_display["total_words"].apply(lambda x: f"{int(x):,}")
    table_display = table_display[["CAP_category", "avg_speech_len", "median_speech_len", "total_speeches", "total_words", "mean_sent"]]

    fig3, ax3 = plt.subplots(figsize=(12, max(6, 0.25*len(table_display))))
    ax3.axis("off")
    tbl = ax3.table(cellText=table_display.values, colLabels=table_display.columns, loc='center', cellLoc='center')
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    tbl.scale(1, 1.2)
    ax3.set_title(f"{country} — Speech Statistics by CAP Category")
    fig3.tight_layout()
    pdf.savefig(fig3)
    plt.close(fig3)

print(f"PDF saved as {pdf_path}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
  ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")


PDF saved as AT_speech_analysis.pdf
