# 1 Setup

## 1.1 Setup Notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

from if_license_plates_could_talk import data

## 1.2 Load Data

In [11]:
# Load the raw table through the project's `data` package; the summary
# printed in the next cell shows that this returns a pandas DataFrame.
df_raw = data.db.get_data()

In [20]:
# Print a concise summary of the raw frame: index range, number of columns,
# dtype counts and memory usage.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 473 entries, 0 to 472
Columns: 201 entries, kreis_key to ew_winner_2019
dtypes: bool(1), float64(194), object(6)
memory usage: 743.2+ KB


In [21]:
# Sanity check: the 2019 winner column must be present in the raw data.
# Membership is tested directly on the column index.
"ew_winner_2019" in df_raw.columns

True

# 2 Preprocessing

## 2.1 Filter Dataset / Handle missing values

In [23]:
def _split_year_suffix(col):
    """Split a column name into ``(base, year)``.

    Returns ``(base, "YYYY")`` when ``col`` ends in an underscore followed by
    exactly four digits (e.g. ``"income_pp_2018"`` -> ``("income_pp", "2018")``),
    otherwise ``(col, None)``.
    """
    base, sep, suffix = col.rpartition("_")
    if sep and base and len(suffix) == 4 and suffix.isdigit():
        return base, suffix
    return col, None


def _year_snapshot(frame, year, static_cols, yearly_bases):
    """Return ``static_cols`` plus the ``<base>_<year>`` columns of ``frame``.

    The year suffix is stripped from the selected yearly columns so that
    snapshots of different years share the same column names.

    Raises:
        KeyError: if ``frame`` lacks a ``<base>_<year>`` column for any base.
    """
    renames = {f"{base}_{year}": base for base in yearly_bases}
    missing = [name for name in renames if name not in frame.columns]
    if missing:
        raise KeyError(f"columns missing for year {year}: {missing}")
    return frame[static_cols + list(renames)].rename(columns=renames)


# Drop rows without a `kreis_name`; copy so the assignments below do not
# touch `df_raw`.
df_relevant = df_raw.dropna(subset=["kreis_name"]).copy()

# TODO: get real income data for 2019. Until then the 2018 values are reused
# as a stand-in so that a complete 2019 snapshot can be built.
df_relevant["income_pp_2019"] = df_relevant["income_pp_2018"]
df_relevant["income_2019"] = df_relevant["income_2018"]

# Classify columns by their `_YYYY` suffix. `dict.fromkeys` de-duplicates the
# base names while keeping first-seen order, so the resulting column order is
# the same on every run (a `set` of strings iterates in hash order, which can
# change between kernel sessions).
parsed_cols = [_split_year_suffix(col) for col in df_relevant.columns]
time_dep_cols = list(dict.fromkeys(base for base, year in parsed_cols if year is not None))
time_indep_cols = [col for col, year in parsed_cols if year is None]

print("--- Zeitabhängige Spalten ---")
print(time_dep_cols)

print("--- Zeitunabhängige Spalten ---")
print(time_indep_cols)

# Full column lists per year, kept as named variables for later cells.
cols_14 = time_indep_cols + [f"{col}_2014" for col in time_dep_cols]
cols_19 = time_indep_cols + [f"{col}_2019" for col in time_dep_cols]

# One frame per year with identical, suffix-free column names.
df_14 = _year_snapshot(df_relevant, 2014, time_indep_cols, time_dep_cols)
df_19 = _year_snapshot(df_relevant, 2019, time_indep_cols, time_dep_cols)


--- Zeitabhängige Spalten ---
['violence', 'ew_vot_abs_die_linke', 'crimes', 'fraud_pp', 'fraud', 'population_density', 'ew_winner', 'drug', 'prop_no_haupt', 'crimes_pp', 'ew_eli', 'income_pp', 'ew_vot_rel_alternative_fuer_deutschland', 'ew_invalid', 'ew_vot_abs_christlich_demokratische_union_deutschlands', 'theft', 'income', 'ew_valid', 'ew_vot_abs_alternative_fuer_deutschland', 'ew_vot_rel_die_linke', 'ew_vot_rel_christlich_demokratische_union_deutschlands', 'ew_vot_abs_sozialdemokratische_partei_deutschlands', 'ew_vot_rel_buendnis_90_die_gruenen', 'ew_vot_abs_freie_demokratische_partei', 'ew_vot_abs_buendnis_90_die_gruenen', 'ew_vot_rel_freie_demokratische_partei', 'ew_vot', 'theft_pp', 'population', 'ew_vot_rel_sozialdemokratische_partei_deutschlands', 'drug_pp', 'prop_abitur', 'violence_pp']
--- Zeitunabhängige Spalten ---
['kreis_key', 'kreis_name', 'east', 'border_vic', 'hh_ges', 'hh_avg', 'bl_key']


## 2.2 Concat election results in 2014 and 2019

In [24]:
# Stack the 2014 and 2019 snapshots row-wise; `ignore_index=True` gives the
# result a fresh 0..n-1 index.
# NOTE(review): after stacking, no column records which year a row came from.
# Confirm that later steps do not need it.
df_election = pd.concat([df_14, df_19], ignore_index=True)

In [25]:
# Display the stacked frame (the notebook renders a truncated view showing
# the first and last rows).
df_election

Unnamed: 0,kreis_key,kreis_name,east,border_vic,hh_ges,hh_avg,bl_key,violence,ew_vot_abs_die_linke,crimes,...,ew_vot_abs_freie_demokratische_partei,ew_vot_abs_buendnis_90_die_gruenen,ew_vot_rel_freie_demokratische_partei,ew_vot,theft_pp,population,ew_vot_rel_sozialdemokratische_partei_deutschlands,drug_pp,prop_abitur,violence_pp
0,09361,Amberg,False,86.207442,20571.0,2.017355,9.0,459.0,417.0,3073.0,...,291.0,892.0,0.025278,11512.0,0.059787,41564.0,0.245049,0.007194,33.7,0.011043
1,09561,Ansbach,False,199.570294,19488.0,2.045823,9.0,463.0,512.0,2993.0,...,247.0,1529.0,0.021799,11331.0,0.058710,39925.0,0.268291,0.004959,39.8,0.011597
2,09661,Aschaffenburg,False,167.815474,33108.0,2.043977,9.0,815.0,688.0,5699.0,...,680.0,2560.0,0.039738,17112.0,0.071655,68006.0,0.230189,0.007029,39.3,0.011984
3,09761,Augsburg,False,99.620492,138889.0,1.913406,9.0,4074.0,3395.0,23188.0,...,1870.0,10932.0,0.027328,68429.0,0.060184,278827.0,0.204606,0.005369,32.6,0.014611
4,08211,Baden-Baden,False,26.433758,26192.0,1.965715,8.0,450.0,533.0,4243.0,...,1089.0,2464.0,0.058370,18657.0,0.073378,53177.0,0.235997,0.003855,42.6,0.008462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797,09479,Wunsiedel im Fichtelgebirge,False,39.065404,36375.0,2.160137,9.0,566.0,690.0,4045.0,...,789.0,3531.0,0.024914,31669.0,0.028018,72917.0,0.153747,0.008009,31.0,0.007762
798,09679,Würzburg,False,226.974374,68900.0,2.363628,9.0,499.0,1841.0,3720.0,...,3078.0,17730.0,0.035524,86646.0,0.010020,162068.0,0.111892,0.003030,10.5,0.003079
799,08417,Zollernalbkreis,False,72.872614,80123.0,2.331341,8.0,996.0,2017.0,6562.0,...,6408.0,14378.0,0.077914,82245.0,0.021856,189149.0,0.112615,0.002728,18.2,0.005266
800,14524,Zwickau,True,47.481052,167404.0,1.988363,14.0,1954.0,18838.0,14892.0,...,7799.0,11191.0,0.048904,159475.0,0.033728,316267.0,0.088936,0.002409,32.6,0.006178


## 2.3 Compute regional winners

In [26]:
# Count how often each party is the regional winner, and compare the number
# of rows that have a winner against the total number of rows.
winner_counts = df_election["ew_winner"].value_counts()

print("--- Zeilen ---")
print("Gesamt: ", len(df_election))
print("--- Gewinner ---")
print("Gesamt: ", winner_counts.sum())
winner_counts

--- Zeilen ---
Gesamt:  802
--- Gewinner ---
Gesamt:  802


christlich_demokratische_union_deutschlands    604
sozialdemokratische_partei_deutschlands        110
buendnis_90_die_gruenen                         50
alternative_fuer_deutschland                    31
die_linke                                        7
Name: ew_winner, dtype: int64

## 2.4 Drop election data

In [34]:
# Election columns all carry the "ew_" prefix (see the column listing printed
# in section 2.1). Matching on the prefix instead of the bare substring "ew"
# avoids dropping an unrelated column that merely contains those letters.
no_election_cols = [col for col in df_election.columns if not col.startswith("ew_")]

# `.copy()` makes `df` an independent frame, so later assignments to it
# neither touch `df_election` nor trigger SettingWithCopyWarning.
df = df_election[no_election_cols].copy()


In [35]:
# Show the first five rows of the stacked frame.
# NOTE(review): this displays `df_election`, which still contains the "ew_"
# columns; to inspect the result of the previous cell, `df.head()` is
# presumably what was intended. Confirm.
df_election.head()

Unnamed: 0,kreis_key,kreis_name,east,border_vic,hh_ges,hh_avg,bl_key,violence,ew_vot_abs_die_linke,crimes,...,ew_vot_abs_freie_demokratische_partei,ew_vot_abs_buendnis_90_die_gruenen,ew_vot_rel_freie_demokratische_partei,ew_vot,theft_pp,population,ew_vot_rel_sozialdemokratische_partei_deutschlands,drug_pp,prop_abitur,violence_pp
0,9361,Amberg,False,86.207442,20571.0,2.017355,9.0,459.0,417.0,3073.0,...,291.0,892.0,0.025278,11512.0,0.059787,41564.0,0.245049,0.007194,33.7,0.011043
1,9561,Ansbach,False,199.570294,19488.0,2.045823,9.0,463.0,512.0,2993.0,...,247.0,1529.0,0.021799,11331.0,0.05871,39925.0,0.268291,0.004959,39.8,0.011597
2,9661,Aschaffenburg,False,167.815474,33108.0,2.043977,9.0,815.0,688.0,5699.0,...,680.0,2560.0,0.039738,17112.0,0.071655,68006.0,0.230189,0.007029,39.3,0.011984
3,9761,Augsburg,False,99.620492,138889.0,1.913406,9.0,4074.0,3395.0,23188.0,...,1870.0,10932.0,0.027328,68429.0,0.060184,278827.0,0.204606,0.005369,32.6,0.014611
4,8211,Baden-Baden,False,26.433758,26192.0,1.965715,8.0,450.0,533.0,4243.0,...,1089.0,2464.0,0.05837,18657.0,0.073378,53177.0,0.235997,0.003855,42.6,0.008462


# 3 Model

## 3.1 OneVsRest Classifier

In [30]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import Pipeline

In [31]:
# Fixed-seed random number generator (seed 42) so that any step it is passed
# to gives the same result on every run.
rndState = np.random.RandomState(42)

In [None]:
# The original cell read `Pipeline[()]`, which subscripts the class instead of
# calling it and raises a TypeError. A Pipeline is built by calling
# `Pipeline(steps=[(name, estimator), ...])`.
# NOTE(review): the steps below are a placeholder baseline inferred from the
# section title (one-vs-rest classification); replace the scaler and base
# estimator with the intended model.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000))),
    ]
)