Objective: Gain insight into ["Fraktionszwang"](https://de.wikipedia.org/wiki/Fraktionsdisziplin) (parliamentary party discipline) using voting-behavior data from the Bundestag (bundestag.de) and abgeordnetenwatch.

Fraktionszwang should become evident in how diverse the votes within one party are across different polls: the less diverse the votes, the stronger the party discipline.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import polars as pl
from plotnine import (
    ggplot,
    aes,
    geom_point,
    labs,
    scale_y_continuous,
    facet_wrap,
    theme,
    geom_line,
    scale_color_manual,
)

from bundestag.fine_logging import setup_logging
import logging
from bundestag.paths import get_paths

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)
# Project helper, called with the INFO level — presumably configures log
# handlers/formatting; see bundestag.fine_logging to confirm.
setup_logging(logging.INFO)

# Project path container rooted at "../data" (relative to the working directory).
# NOTE(review): assumes the notebook is started from a folder next to `data/` — confirm.
paths = get_paths("../data")
paths

In [None]:
# Location of the preprocessed bundestag.de votes table (parquet).
file = paths.preprocessed_bundestag / "bundestag.de_votes.parquet"
file

In [None]:
# Load the vote table and preview its first rows.
data_bundestag = pl.read_parquet(source=file)

data_bundestag.head(n=5)

Use a single name for "Die Linke"

In [None]:
# Row count per faction label, before normalising the "Die Linke" spelling.
data_bundestag.get_column("Fraktion/Gruppe").value_counts()

In [None]:
# Map the alternative spelling "DIE LINKE." onto "Die Linke"; every other
# faction label is kept unchanged.
faction_col = pl.col("Fraktion/Gruppe")

data_bundestag = data_bundestag.with_columns(
    pl.when(faction_col == "DIE LINKE.")
    .then(pl.lit("Die Linke"))
    .otherwise(faction_col)
    .alias("Fraktion/Gruppe")
)

In [None]:
# Row count per faction label, after normalising the "Die Linke" spelling.
data_bundestag.get_column("Fraktion/Gruppe").value_counts()

How many things are voted on per day over time?

In [None]:
# For every date: the number of distinct poll numbers (Abstimmnr).
things_per_day_over_time = data_bundestag.group_by("date").agg(
    n=pl.col("Abstimmnr").n_unique()
)
things_per_day_over_time.head()

In [None]:
# Scatter of the number of distinct polls per date.
polls_per_day_plot = (
    ggplot(data=things_per_day_over_time, mapping=aes(x="date", y="n"))
    + geom_point()
    + scale_y_continuous(breaks=list(range(0, 11, 2)))
    + labs(
        x="Date",
        y="# unique Abstimmungsnr",
        title="# Abstimmungsnr per day over time",
    )
)
polls_per_day_plot

How many members vote per poll over time?

In [None]:
# For every (date, poll): the number of distinct "Bezeichnung" values,
# used here as the count of voting members.
members_per_poll_per_day_over_time = data_bundestag.group_by("date", "Abstimmnr").agg(
    n=pl.col("Bezeichnung").n_unique()
)
members_per_poll_per_day_over_time.head()

In [None]:
# Scatter of the member count per poll against the poll date.
members_per_poll_plot = (
    ggplot(data=members_per_poll_per_day_over_time, mapping=aes(x="date", y="n"))
    + geom_point()
    + labs(
        x="Date",
        y="# members",
        title="# Members voting per poll per day over time",
    )
)
members_per_poll_plot

Count of vote types by date, poll and party over time.

In [None]:
# Count distinct members per (date, poll, faction, vote type), ordered by
# those same keys.
vote_count_keys = ["date", "Abstimmnr", "Fraktion/Gruppe", "vote"]

member_votes_per_faction_per_poll_per_day_over_time = (
    data_bundestag.group_by(vote_count_keys)
    .agg(n=pl.col("Bezeichnung").n_unique())
    .sort(vote_count_keys)
)

member_votes_per_faction_per_poll_per_day_over_time.head()

In [None]:
# Turn the counts into shares: each vote type's count divided by the total
# count of its (date, poll, faction) group.
faction_total = pl.col("n").sum().over(["date", "Abstimmnr", "Fraktion/Gruppe"])

member_votes_per_faction_per_poll_per_day_over_time = (
    member_votes_per_faction_per_poll_per_day_over_time.with_columns(
        (pl.col("n") / faction_total).alias("vote share")
    )
)
member_votes_per_faction_per_poll_per_day_over_time.head()

In [None]:
# Vote share per poll over time, one facet per faction, coloured by vote type.
vote_palette = {
    "ja": "green",
    "nein": "red",
    "nichtabgegeben": "grey",
    "Enthaltung": "orange",
}

vote_share_plot = (
    ggplot(
        data=member_votes_per_faction_per_poll_per_day_over_time,
        mapping=aes(x="date", y="vote share", color="vote"),
    )
    + geom_point(alpha=0.3)
    + labs(
        x="Date",
        y="Vote fraction",
        title="Voting shares per poll per day over time",
    )
    + facet_wrap("Fraktion/Gruppe", ncol=1)
    + scale_y_continuous(limits=(0, 1), breaks=[0, 0.25, 0.5, 0.75, 1.0])
    + theme(figure_size=(10, 16), subplots_adjust={"hspace": 0.35})
    + scale_color_manual(
        breaks=list(vote_palette.keys()),
        values=list(vote_palette.values()),
    )
)
vote_share_plot

In [None]:
# Preview of log2(vote share), the ingredient of the Shannon entropy computed
# in the next step. The result goes into its own "log p" column so that the
# displayed "vote share" column keeps its real values, and only the first rows
# are shown instead of the whole frame. Nothing is assigned here.
member_votes_per_faction_per_poll_per_day_over_time.with_columns(
    pl.col("vote share").log(base=2).alias("log p")
).head()

In [None]:
# Shannon entropy (base 2) of the vote-type distribution for every
# (date, poll, faction): -sum(p * log2(p)), where terms with p == 0 count as 0.
poll_faction_keys = ["date", "Abstimmnr", "Fraktion/Gruppe"]
share = pl.col("vote share")
weighted_log_share = (
    pl.when(share > 0).then(share * pl.col("log p")).otherwise(0)
)

entropy_per_poll_faction = (
    member_votes_per_faction_per_poll_per_day_over_time.with_columns(
        share.log(base=2).alias("log p")
    )
    .group_by(poll_faction_keys)
    .agg((-weighted_log_share.sum()).alias("shannon entropy"))
)

entropy_per_poll_faction.head()

In [None]:
# Fixed colour per faction label, reused by the later entropy plots.
faction_palette = {
    "AfD": "blue",
    "BSW": "purple",
    "BÜ90/GR": "green",
    "CDU/CSU": "black",
    "Die Linke": "red",
    "FDP": "yellow",
    "Fraktionslos": "grey",
    "SPD": "salmon",
}
party_colos = scale_color_manual(
    breaks=list(faction_palette.keys()),
    values=list(faction_palette.values()),
)

# Entropy of every poll over time, one facet per faction.
entropy_plot = (
    ggplot(
        data=entropy_per_poll_faction,
        mapping=aes(x="date", y="shannon entropy", color="Fraktion/Gruppe"),
    )
    + geom_point()
    + labs(
        x="Date",
        y="Shannon entropy (smaller = more Fraktionszwang)",
        title="Voting entropy per poll per day over time",
    )
    + facet_wrap("Fraktion/Gruppe", ncol=1)
    + theme(figure_size=(10, 16), subplots_adjust={"hspace": 0.35})
    + party_colos
)
entropy_plot

Now we compute the rolling median of `shannon entropy` over `n_polls_to_average` (= 30) consecutive polls for each `Fraktion/Gruppe`.

In [None]:
# Size of the rolling window, in consecutive polls of one faction.
n_polls_to_average = 30

# Sort by date AND poll number before rolling. The rows come out of a
# `group_by`, whose output order is not guaranteed, and several polls can share
# one date; sorting on "date" alone would leave same-day polls in an arbitrary
# order and make the rolling median differ between runs.
# `.over("Fraktion/Gruppe")` evaluates the rolling median separately within
# each faction, following the sorted row order.
# NOTE(review): with default settings the first `n_polls_to_average - 1` rows
# of each faction are expected to be null — confirm against the polars docs.
entropy_per_poll_faction = entropy_per_poll_faction.sort(
    "date", "Abstimmnr"
).with_columns(
    pl.col("shannon entropy")
    .rolling_median(window_size=n_polls_to_average)
    .over("Fraktion/Gruppe")
    .alias("shannon_entropy_rolling_median")
)
entropy_per_poll_faction.head()

Now let's plot `shannon_entropy_rolling_median` for each `Fraktion/Gruppe` to see the smoothed trend that the rolling median extracts from the noisy per-poll `shannon entropy`.

In [None]:
# One line per faction: rolling median of the per-poll entropy over time.
rolling_entropy_plot = (
    ggplot(data=entropy_per_poll_faction, mapping=aes(x="date", color="Fraktion/Gruppe"))
    + geom_line(mapping=aes(y="shannon_entropy_rolling_median"))
    + labs(
        x="Date",
        y="Shannon entropy (smaller = more Fraktionszwang)",
        title=f"Voting entropy per poll per day over time with rolling median (n={n_polls_to_average})",
    )
    + theme(figure_size=(8, 6), subplots_adjust={"hspace": 0.35})
    + party_colos
)
rolling_entropy_plot

Same analysis as above, but using the abgeordnetenwatch data.

In [None]:
from bundestag.data.transform.abgeordnetenwatch.transform import (
    get_polls_parquet_path,
    get_votes_parquet_path,
    get_mandates_parquet_path,
)

In [None]:
# Legislature ids whose polls, votes and mandates are loaded below.
# NOTE(review): presumably abgeordnetenwatch ids of Bundestag legislative periods — confirm.
legislature_ids = [67, 83, 97, 111, 132, 161]

In [None]:
# Read the polls table of every legislature, tag each row with its
# legislature id, and stack the tables into one frame.
polls_by_legislature = [
    pl.read_parquet(
        get_polls_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
    ).with_columns(legislature_id=legislature_id)
    for legislature_id in legislature_ids
]

polls = pl.concat(polls_by_legislature, how="diagonal_relaxed")
(polls.head(2), polls.tail(2))

In [None]:
# Read the votes table of every legislature, tag each row with its
# legislature id, and stack the tables into one frame.
votes_by_legislature = [
    pl.read_parquet(
        get_votes_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
    ).with_columns(legislature_id=legislature_id)
    for legislature_id in legislature_ids
]

votes = pl.concat(votes_by_legislature, how="diagonal_relaxed")
(votes.head(2), votes.tail(2))

In [None]:
# Read the mandates table of every legislature, tag each row with its
# legislature id, and stack the tables into one frame.
mandates_by_legislature = [
    pl.read_parquet(
        get_mandates_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
    ).with_columns(legislature_id=legislature_id)
    for legislature_id in legislature_ids
]

mandates = pl.concat(mandates_by_legislature, how="diagonal_relaxed")
(mandates.head(2), mandates.tail(2))

In [None]:
# Left-join votes onto polls, then mandates onto the result, so that every
# poll row is kept even when no matching vote or mandate exists.
polls_with_votes = polls.join(votes, on=["legislature_id", "poll_id"], how="left")
data_abgeordnetenwatch = polls_with_votes.join(
    mandates, on=["legislature_id", "mandate_id"], how="left"
)

In [None]:
# Parse the "poll_date" strings (YYYY-MM-DD) into a proper "date" column.
data_abgeordnetenwatch = data_abgeordnetenwatch.with_columns(
    pl.col("poll_date").str.to_date(format="%Y-%m-%d").alias("date")
)

In [None]:
# Distinct party labels before normalisation.
data_abgeordnetenwatch.get_column("party").unique().to_list()

In [None]:
# Collapse party-name variants onto one label each:
#   "DIE LINKE", "Die Linke. (Gruppe)" -> "Die Linke"
#   "DIE GRÜNEN"                       -> "BÜNDNIS 90/DIE GRÜNEN"
# All other labels are kept unchanged.
party_col = pl.col("party")
is_linke_variant = party_col.is_in(pl.lit(["DIE LINKE", "Die Linke. (Gruppe)"]))
is_gruene_variant = party_col.is_in(pl.lit(["DIE GRÜNEN"]))

data_abgeordnetenwatch = data_abgeordnetenwatch.with_columns(
    party=pl.when(is_linke_variant)
    .then(pl.lit("Die Linke"))
    .when(is_gruene_variant)
    .then(pl.lit("BÜNDNIS 90/DIE GRÜNEN"))
    .otherwise(party_col)
)

In [None]:
# Distinct party labels after normalisation.
data_abgeordnetenwatch.get_column("party").unique().to_list()

In [None]:
# For every date: the number of distinct poll ids.
things_per_day_over_time = data_abgeordnetenwatch.group_by("date").agg(
    n=pl.col("poll_id").n_unique()
)
things_per_day_over_time.head()

In [None]:
# Scatter of the number of distinct poll ids per date.
polls_per_day_plot = (
    ggplot(data=things_per_day_over_time, mapping=aes(x="date", y="n"))
    + geom_point()
    + scale_y_continuous(breaks=list(range(0, 11, 2)))
    + labs(
        x="Date",
        y="# unique poll_ids",
        title="# poll_ids per day over time",
    )
)
polls_per_day_plot

In [None]:
# For every (date, poll): the number of distinct mandate ids that appear.
members_per_poll_per_day_over_time = data_abgeordnetenwatch.group_by(
    "date", "poll_id"
).agg(n=pl.col("mandate_id").n_unique())
members_per_poll_per_day_over_time.head()

In [None]:
# Scatter of the mandate count per poll against the poll date.
members_per_poll_plot = (
    ggplot(data=members_per_poll_per_day_over_time, mapping=aes(x="date", y="n"))
    + geom_point()
    + labs(
        x="Date",
        y="# members",
        title="# Members voting per poll per day over time",
    )
)
members_per_poll_plot

In [None]:
# Preview the joined abgeordnetenwatch table.
data_abgeordnetenwatch.head(n=5)

In [None]:
# Count distinct mandates per (date, poll, party, vote type), ordered by
# those same keys.
vote_count_keys = ["date", "poll_id", "party", "vote"]

member_votes_per_faction_per_poll_per_day_over_time = (
    data_abgeordnetenwatch.group_by(vote_count_keys)
    .agg(n=pl.col("mandate_id").n_unique())
    .sort(vote_count_keys)
)

member_votes_per_faction_per_poll_per_day_over_time.head()

In [None]:
# Turn the counts into shares: each vote type's count divided by the total
# count of its (date, poll, party) group.
party_total = pl.col("n").sum().over(["date", "poll_id", "party"])

member_votes_per_faction_per_poll_per_day_over_time = (
    member_votes_per_faction_per_poll_per_day_over_time.with_columns(
        (pl.col("n") / party_total).alias("vote share")
    )
)
member_votes_per_faction_per_poll_per_day_over_time.head()

In [None]:
# Vote share per poll over time, one facet per party, coloured by vote type.
vote_palette = {
    "yes": "green",
    "no": "red",
    "no_show": "grey",
    "abstain": "orange",
}

vote_share_plot = (
    ggplot(
        data=member_votes_per_faction_per_poll_per_day_over_time,
        mapping=aes(x="date", y="vote share", color="vote"),
    )
    + geom_point(alpha=0.3)
    + labs(
        x="Date",
        y="Vote fraction",
        title="Voting shares per poll per day over time",
    )
    + facet_wrap("party", ncol=1)
    + scale_y_continuous(limits=(0, 1), breaks=[0, 0.25, 0.5, 0.75, 1.0])
    + theme(figure_size=(10, 16), subplots_adjust={"hspace": 0.35})
    + scale_color_manual(
        breaks=list(vote_palette.keys()),
        values=list(vote_palette.values()),
    )
)
vote_share_plot

In [None]:
# Shannon entropy (base 2) of the vote-type distribution for every
# (date, poll, party): -sum(p * log2(p)), where terms with p == 0 count as 0.
poll_party_keys = ["date", "poll_id", "party"]
share = pl.col("vote share")
weighted_log_share = (
    pl.when(share > 0).then(share * pl.col("log p")).otherwise(0)
)

entropy_per_poll_faction = (
    member_votes_per_faction_per_poll_per_day_over_time.with_columns(
        share.log(base=2).alias("log p")
    )
    .group_by(poll_party_keys)
    .agg((-weighted_log_share.sum()).alias("shannon entropy"))
)

entropy_per_poll_faction.head()

In [None]:
# Fixed colour per party label (abgeordnetenwatch naming), reused by the
# rolling-median plot further down.
party_palette = {
    "AfD": "blue",
    "BSW (Gruppe)": "purple",
    "BÜNDNIS 90/DIE GRÜNEN": "green",
    "CDU/CSU": "black",
    "Die Linke": "red",
    "FDP": "yellow",
    "Fraktionslos": "grey",
    "SPD": "salmon",
}
party_colos = scale_color_manual(
    breaks=list(party_palette.keys()),
    values=list(party_palette.values()),
)

# Entropy of every poll over time, one facet per party.
entropy_plot = (
    ggplot(
        data=entropy_per_poll_faction,
        mapping=aes(x="date", y="shannon entropy", color="party"),
    )
    + geom_point()
    + labs(
        x="Date",
        y="Shannon entropy (smaller = more Fraktionszwang)",
        title="Voting entropy per poll per day over time",
    )
    + facet_wrap("party", ncol=1)
    + theme(figure_size=(10, 16), subplots_adjust={"hspace": 0.35})
    + party_colos
)
entropy_plot

In [None]:
# Size of the rolling window, in consecutive polls of one party.
n_polls_to_average = 30

# Sort by date AND poll id before rolling. The rows come out of a `group_by`,
# whose output order is not guaranteed, and several polls can share one date;
# sorting on "date" alone would leave same-day polls in an arbitrary order and
# make the rolling median differ between runs.
# `.over("party")` evaluates the rolling median separately within each party,
# following the sorted row order.
# NOTE(review): with default settings the first `n_polls_to_average - 1` rows
# of each party are expected to be null — confirm against the polars docs.
entropy_per_poll_faction = entropy_per_poll_faction.sort(
    "date", "poll_id"
).with_columns(
    pl.col("shannon entropy")
    .rolling_median(window_size=n_polls_to_average)
    .over("party")
    .alias("shannon_entropy_rolling_median")
)
entropy_per_poll_faction.head()

In [None]:
# One line per party: rolling median of the per-poll entropy over time.
# The window size in the title is taken from `n_polls_to_average` so the label
# always matches the window actually used (it was hard-coded as "n=10" while
# the window is 30).
(
    ggplot(entropy_per_poll_faction, aes(x="date", color="party"))
    + geom_line(aes(y="shannon_entropy_rolling_median"))
    + labs(
        title=f"Voting entropy per poll per day over time with rolling median (n={n_polls_to_average})",
        x="Date",
        y="Shannon entropy (smaller = more Fraktionszwang)",
    )
    + theme(figure_size=(8, 6), subplots_adjust={"hspace": 0.35})
    + party_colos
)

In [None]:
import math

# Negative natural log of 1/5.
# NOTE(review): presumably a reference value for a uniform distribution over
# five outcomes; note the entropies above use log base 2, not the natural
# log — confirm what this is meant to be compared against.
uniform_probability = 1 / 5
-math.log(uniform_probability)