# Predicting votes
> Let's see how well the votes of politicians in polls can be predicted.

**The strategy**:
- first: only include a politician id and a poll id as features 
- second: include text features based on the poll title and/or description

**TL;DR**
- using only politician id and poll id we find an 88% accuracy (over validation given random split) => individual outcome is highly associated with votes of others in the same poll

**TODO**:
- combine poll title and description for feature generation
- try transformer based features
- visualise the most incorrectly predicted polls and politicians

In [None]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before
# each cell is executed, so changes to the project package are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from fastai.tabular.all import (
    Categorify,
    CategoryBlock,
    Normalize,
    TabularPandas,
    tabular_learner,
)
import polars as pl
from bundestag.fine_logging import setup_logging
import logging
from bundestag.paths import get_paths
from bundestag.data.transform.abgeordnetenwatch.transform import (
    get_polls_parquet_path,
    get_votes_parquet_path,
    get_mandates_parquet_path,
)
from bundestag.ml.poll_clustering import SpacyTransformer, clean_text
from bundestag.ml.vote_prediction import (
    poll_splitter,
    plot_predictions,
    get_embeddings,
    get_poll_proponents,
    plot_poll_embeddings,
    plot_politician_embeddings,
)
from plotnine import scale_color_manual
from functools import partial

# Logger for this notebook; the project's logging setup is called with DEBUG level.
logger = logging.getLogger(__name__)
setup_logging(logging.DEBUG)

# Build the project's path collection from the "../data" directory and display it.
paths = get_paths("../data")
paths

## Setup

Loading preprocessed dataframes (see `03_abgeordnetenwatch.ipynb`). First, let's load the votes.

In [None]:
# Legislature id shared by the votes, mandates and polls files loaded in this notebook.
legislature_id = 111
# Path of the preprocessed votes parquet file for that legislature (displayed as cell output).
file = get_votes_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
file

In [None]:
# Read the votes into a polars DataFrame and preview the first rows.
df_all_votes = pl.read_parquet(file)
df_all_votes.head()

Loading further info on politicians

In [None]:
# Rebind `file` to the preprocessed mandates parquet file of the same legislature.
file = get_mandates_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
file

In [None]:
# Read the mandates into a polars DataFrame.
df_mandates = pl.read_parquet(file)

Loading data on polls (description, title and so on)

In [None]:
# Rebind `file` to the preprocessed polls parquet file of the same legislature.
file = get_polls_parquet_path(legislature_id, paths.preprocessed_abgeordnetenwatch)
file

In [None]:
# Read the polls into a polars DataFrame and preview the first three rows.
df_polls = pl.read_parquet(file)
df_polls.head(3)

## Modelling using only poll and politician ids as features

### Split into train and validation

Creating train / valid split

In [None]:
# Alternative kept for reference: a purely random split with 20% validation data.
# splits = RandomSplitter(valid_pct=0.2)(df_all_votes)
# Poll-based split with valid_pct=0.2 (presumably whole polls are held out for
# validation — confirm in `poll_splitter`).
splits = poll_splitter(df_all_votes, valid_pct=0.2)
splits

Setting target variable and count frequencies

In [None]:
# Name of the target column used by all models in this notebook.
y_col = "vote"
# Print how often each target value occurs.
print(f"target values: {df_all_votes[y_col].value_counts()}")

### Training

Final data preprocessing for training

In [None]:
# Convert the polars votes frame to pandas before handing it to TabularPandas.
votes_pd = df_all_votes.to_pandas()

# Only the two identifiers are used as (categorical) features; the vote is the
# categorical target. Rows are assigned to train/valid via the precomputed splits.
to = TabularPandas(
    votes_pd,
    procs=[Categorify],
    cat_names=["politician name", "poll_id"],
    y_names=[y_col],
    y_block=CategoryBlock,
    splits=splits,
)

# Batches of 512 rows for training and validation.
dls = to.dataloaders(bs=512)

Finding the learning rate for training

In [None]:
# Create a tabular learner on the dataloaders and run the learning-rate finder;
# its result is displayed and reused for training (`lrs.valley`).
learn = tabular_learner(dls)
lrs = learn.lr_find()
lrs

Training the artificial neural net

In [None]:
# Train for 5 epochs with the one-cycle schedule at the learning rate suggested by the finder.
learn.fit_one_cycle(5, lrs.valley)

### Inspecting predictions

In [None]:
# Preview the mandates table before it is used for inspecting predictions.
df_mandates.head()

In [None]:
# Disabled step: would keep the original `party` column as `party_original` and
# replace `party` by the last element of its list value. The first variant is
# written for polars, the last two lines are an older pandas variant.
# df_mandates = df_mandates.with_columns(**{
#     "party_original": pl.col("party"),
#     "party": pl.col("party").list.last()
# })
# df_mandates.head()
# df_mandates["party_original"] = df_mandates["party"].copy()
# df_mandates["party"] = df_mandates["party"].apply(lambda x: x[-1])

In [None]:
# Inspect the learner's predictions; `splits` here was computed on `df_all_votes`,
# the same frame that is passed in (presumably evaluated on the validation part —
# see `plot_predictions`).
plot_predictions(learn, df_all_votes, df_mandates, df_polls, splits)

accuracy:
- random split: 88% 
- poll based split: ~50%, politician embedding itself insufficient to reasonably predict vote

### Inspecting resulting embeddings

In [None]:
# Move the model to the CPU; the returned model is displayed as the cell output.
learn.model.cpu()

In [None]:
# Extract the embeddings from the trained learner; the result is indexed by
# feature name below ("politician name", "poll_id").
embeddings = get_embeddings(learn)
embeddings

In [None]:
# Derive per-poll proponent information from votes and mandates and preview it
# (exact content is defined by `get_poll_proponents` — not shown in this notebook).
proponents = get_poll_proponents(df_all_votes, df_mandates)
proponents.head()

In [None]:
# Inspect the embedding table of the "politician name" feature.
embeddings["politician name"]

In [None]:
# Convert the embedding tables to polars frames. For poll_id the first row is
# dropped (presumably the placeholder category added during categorification —
# confirm), and the remaining ids are cast to int.
embeddings_pl = {
    "politician name": pl.DataFrame(
        {
            column: embeddings["politician name"][column]
            for column in (
                "politician name__emb_component_0",
                "politician name__emb_component_1",
                "politician name",
            )
        }
    ),
    "poll_id": pl.DataFrame(
        {
            **{
                column: embeddings["poll_id"][column].values[1:]
                for column in (
                    "poll_id__emb_component_0",
                    "poll_id__emb_component_1",
                )
            },
            "poll_id": [
                int(raw_id) for raw_id in embeddings["poll_id"]["poll_id"].values[1:]
            ],
        }
    ),
}

# One fixed colour per party / parliamentary group, shared by both embedding plots.
color_by_party = {
    "AfD": "blue",
    "BSW": "purple",
    "DIE GRÜNEN": "green",
    "CDU/CSU": "black",
    "DIE LINKE": "red",
    "FDP": "yellow",
    "fraktionslos": "grey",
    "SPD": "salmon",
}
party_colors = scale_color_manual(
    breaks=list(color_by_party),
    values=list(color_by_party.values()),
)

plot_poll_embeddings(df_all_votes, df_polls, embeddings_pl, df_mandates, party_colors)

In [None]:
# Plot the politician embeddings using the same party colour scale as the poll plot.
plot_politician_embeddings(df_all_votes, df_mandates, embeddings_pl, party_colors)

Embedding scatter plots after PCA:
- poll based split => mandates form two groups
- random split => polls and mandates each form 2-3 groups

## Modelling using `poll_title`-based features

### LDA topic weights as features

In [None]:
# Text column to model, name of the derived token column and number of LDA topics.
source_col = "poll_title"
nlp_col = f"{source_col}_nlp_processed"
num_topics = 8

st = SpacyTransformer()

# Clean every title with the transformer's spaCy pipeline; the result is stored
# as a list-of-strings column named `nlp_col`.
clean_with_spacy = partial(clean_text, nlp=st.nlp)
df_polls_lda = df_polls.with_columns(
    pl.col(source_col)
    .map_elements(clean_with_spacy, return_dtype=pl.List(pl.String))
    .alias(nlp_col)
)

# Fit the LDA topic model on the cleaned titles.
st.fit_lda(df_polls_lda[nlp_col].to_list(), num_topics=num_topics)

# Add the text features produced by the fitted model.
df_polls_lda = df_polls_lda.pipe(st.transform, col=nlp_col)

# Preview the result.
display(df_polls_lda.head())

In [None]:
# Preview the votes table before the LDA features are joined onto it.
df_all_votes.head()

In [None]:
# Keep only the poll id and the LDA feature columns, then attach them to every
# vote of the corresponding poll.
title_lda_features = df_polls_lda.select(["poll_id", *st.nlp_cols])
df_input = df_all_votes.join(title_lda_features, on="poll_id")
df_input.head()

In [None]:
# Recompute the poll-based split on the joined frame; the resulting indices
# refer to rows of `df_input`.
splits = poll_splitter(df_input, valid_pct=0.2)
splits

In [None]:
# Convert the joined polars frame to pandas before handing it to TabularPandas.
title_input_pd = df_input.to_pandas()

# poll_id is left out of the categorical features here; each poll is described
# by its LDA feature columns instead, which are normalized as continuous inputs.
to = TabularPandas(
    title_input_pd,
    procs=[Categorify, Normalize],
    cat_names=["politician name"],
    cont_names=st.nlp_cols,
    y_names=[y_col],
    y_block=CategoryBlock,
    splits=splits,
)

# Batches of 512 rows for training and validation.
dls = to.dataloaders(bs=512)

In [None]:
# Fresh learner on the title-feature dataloaders, followed by the learning-rate finder.
learn = tabular_learner(dls)
lrs = learn.lr_find()
lrs

In [None]:
# Train for 5 one-cycle epochs at the learning rate suggested by the finder
# (a fixed rate of 2e-2 appears to have been used here previously).
learn.fit_one_cycle(5, lrs.valley)

In [None]:
# Pass `df_input` rather than `df_all_votes`: `splits` was computed on `df_input`,
# so its row indices are only guaranteed to line up with that frame (the join
# that builds `df_input` may drop or reorder rows). `df_input` carries every
# column of `df_all_votes` plus the LDA features the learner was trained on.
plot_predictions(learn, df_input, df_mandates, df_polls, splits)

poll_id split:
- politician name + poll_id + 10 lda topics based on poll title do not improve the accuracy
- politician name + <s>poll_id</s> + 5 lda topics based on poll title: ~49%
- politician name + <s>poll_id</s> + 10 lda topics based on poll title: ~57%
- politician name + <s>poll_id</s> + 25 lda topics based on poll title: ~45%

## Modelling using `poll_description`-based features

### LDA topic weights as features

In [None]:
# Text column to model, name of the derived token column and number of LDA topics.
source_col = "poll_description"
nlp_col = f"{source_col}_nlp_processed"
num_topics = 25

st = SpacyTransformer()

# Clean every description with the transformer's spaCy pipeline; the result is
# stored as a list-of-strings column named `nlp_col`.
clean_with_spacy = partial(clean_text, nlp=st.nlp)
df_polls_lda = df_polls.with_columns(
    pl.col(source_col)
    .map_elements(clean_with_spacy, return_dtype=pl.List(pl.String))
    .alias(nlp_col)
)

# Fit the LDA topic model on the cleaned descriptions.
st.fit_lda(df_polls_lda[nlp_col].to_list(), num_topics=num_topics)

# Add the text features produced by the fitted model.
df_polls_lda = st.transform(df_polls_lda, col=nlp_col)

# Preview the result.
display(df_polls_lda.head())

In [None]:
# Keep only the poll id and the LDA feature columns, then attach them to every
# vote of the corresponding poll.
description_lda_features = df_polls_lda.select(["poll_id", *st.nlp_cols])
df_input = df_all_votes.join(description_lda_features, on="poll_id")
df_input.head()

In [None]:
# Recompute the poll-based split on the joined frame; the resulting indices
# refer to rows of `df_input`.
splits = poll_splitter(df_input, valid_pct=0.2)
splits

In [None]:
# Convert the joined polars frame to pandas before handing it to TabularPandas.
description_input_pd = df_input.to_pandas()

# poll_id is left out of the categorical features here; each poll is described
# by its LDA feature columns instead, which are normalized as continuous inputs.
to = TabularPandas(
    description_input_pd,
    procs=[Categorify, Normalize],
    cat_names=["politician name"],
    cont_names=st.nlp_cols,
    y_names=[y_col],
    y_block=CategoryBlock,
    splits=splits,
)

# Batches of 512 rows for training and validation.
dls = to.dataloaders(bs=512)

In [None]:
# Fresh learner on the description-feature dataloaders, followed by the learning-rate finder.
learn = tabular_learner(dls)
lrs = learn.lr_find()
lrs

In [None]:
# Train for 5 one-cycle epochs at the learning rate suggested by the finder
# (a fixed rate of 2e-2 appears to have been used here previously).
learn.fit_one_cycle(5, lrs.valley)

In [None]:
# Pass `df_input` rather than `df_all_votes`: `splits` was computed on `df_input`,
# so its row indices are only guaranteed to line up with that frame (the join
# that builds `df_input` may drop or reorder rows). `df_input` carries every
# column of `df_all_votes` plus the LDA features the learner was trained on.
plot_predictions(learn, df_input, df_mandates, df_polls, splits)

poll_id split:
- politician name + <s>poll_id</s> + 5 lda topics based on poll description: ~51%
- politician name + <s>poll_id</s> + 10 lda topics based on poll description: ~53%
- politician name + <s>poll_id</s> + 20 lda topics based on poll description: ~56%
- politician name + <s>poll_id</s> + 25 lda topics based on poll description: ~59%