In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os    
from chardet import detect
import re
import scipy as stats
import camelot
from sklearn.linear_model import LinearRegression

# Collecting variable descriptions
containing variable names, description and source for each dataset

In [2]:
election_variable_description = pd.read_csv('data\\bundeswahlleiter\\election_variable_description.csv'
                                            ,sep=';',
                                            )
creditreform_variable_description = pd.read_csv('data\\creditreform\\creditreform_variable_description.csv'
                                            ,sep=';',
                                            )
structural_data_variable_description = pd.read_csv('data\\bundeswahlleiter\\struk_variable_descriptives.csv'
                                            ,sep=','
                                            )
bka_variable_description = pd.read_csv('data\\bka\\bka_variable_description.csv'
                                            ,sep=';'
                                            )
variable_description = pd.concat([election_variable_description
                                  ,creditreform_variable_description
                                  ,structural_data_variable_description
                                  ,bka_variable_description]
                                 ,sort=False
                                )
variable_description.to_excel('data\\variable_description.xlsx',index=False)

# Reading and merging data sets

In [3]:
# Reading data on electoral outcome 
# See: data\bundeswahlleiter\bundeswahlleiter_reader.ipynb
election = pd.read_csv('data\\bundeswahlleiter\\election.csv')

election_sel = ['Nr', 'region', 'subregion', 'vot19_14', 'turnout14', 'turnout19', 'turnout19_14']
election = election[election_sel]
# Reading Data for overindebtedness 
# See: data\creditreform\creditreform_reader.ipynb
creditreform = pd.read_csv('data\\creditreform\\creditreform.csv')



df = election.merge(creditreform, how='outer', left_on='region', right_on='region')#, indicator=True

# skip national level 
df_state = df[df['subregion']==99]
df = df[df['subregion']!=99]
df.set_index('Nr',inplace=True)

# getting meta_columns to front
meta_columns_names = ['region', 'subregion', 'state']
df_meta = df[meta_columns_names]
df = df.drop(columns=df_meta.columns)
df = pd.concat([df_meta,df],axis=1)

# reading structural data
structural_data = pd.read_csv('data\\bundeswahlleiter\\strukturdaten.csv',index_col=0,sep=';')
df = df.merge(structural_data, left_index=True, right_index=True,how='left')

# reading crime data
bka = pd.read_csv('data\\bka\\bka.csv')
bka = bka.set_index('region')
df = df.merge(bka, left_index=True,right_index=True,how='left')

# some of the entries are inf
df = df.replace([np.inf,-np.inf],np.nan)

df.to_csv('data\\df.csv',index=True,sep=';')
df.head()

Unnamed: 0_level_0,region,subregion,state,vot19_14,turnout14,turnout19,turnout19_14,state_abbrev,debt_2013,debt_2014,...,f_crime_2015,total_suspects_2014,foreign_suspects_2014,f_crime_2014,total_suspects_2013,foreign_suspects_2013,f_crime_2013,total_suspects_2012,foreign_suspects_2012,f_crime_2012
Nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,"Flensburg, Stadt",1.0,Schleswig-Holstein,-0.003027,0.3574,0.56292,0.20552,KS,16.41,16.4,...,32.5,233.0,66.0,28.3,207.0,45.0,21.7,277.0,42.0,15.2
1002,"Kiel, Landeshauptstadt",1.0,Schleswig-Holstein,0.060316,0.402589,0.588603,0.186015,KS,12.04,12.03,...,26.4,463.0,80.0,17.3,439.0,113.0,25.7,490.0,95.0,19.4
1003,"Lübeck, Hansestadt",1.0,Schleswig-Holstein,0.551817,0.376398,0.546124,0.169726,KS,15.25,15.59,...,23.4,512.0,109.0,21.3,509.0,85.0,16.7,526.0,101.0,19.2
1004,"Neumünster, Stadt",1.0,Schleswig-Holstein,2.392481,0.453649,0.482205,0.028556,KS,16.61,16.94,...,31.6,241.0,59.0,24.5,241.0,54.0,22.4,247.0,45.0,18.2
1051,Dithmarschen,1.0,Schleswig-Holstein,3.374651,0.397193,0.544108,0.146915,K,12.52,12.8,...,12.2,167.0,19.0,11.4,151.0,25.0,16.6,227.0,23.0,10.1


# Selecting important variables and fitting a model

In [95]:
#X = df.iloc[:,]
y = df.vot19_14

X = df.drop(labels=['vot19_14','region','state_abbrev','subregion'], axis=1)
#X = X.set_index('Nr')

numeric = X.dtypes != object
numeric_features = list(X.columns[numeric])
categoric_features = list(X.columns[~numeric])

X = X[categoric_features+numeric_features]

In [97]:
#numeric_features

In [98]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categoric_features)
    ])

In [17]:
X_pre = preprocessor.fit_transform(X)

In [149]:
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('poly', PolynomialFeatures(1)),
                      ('feature_selection', SelectFromModel(
                          estimator=RandomForestRegressor(max_depth=2,
                                                random_state=0
                                                ,n_estimators=1000
                                               ),
                      threshold='mean'
                      )),
                      #max_depth=3
                      #('feature_selection', SelectFromModel(LassoCV(cv=5, random_state=0))),
                      ('regressor',
                       LassoCV(cv=5, random_state=0)
                       #LinearRegression()
                      )])

In [150]:
reg.fit(X,y)
print(reg.score(X,y))

predict = reg.predict(X)
reg_res = reg.named_steps['regressor']

0.7437588811931254


In [174]:
feature_names = [a for a in list(X.columns) if a != 'state']

# add new onehot encoder variable names 
feature_names_cat = reg.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names(categoric_features)
feature_names_cat = list(feature_names_cat)
feature_names.extend(feature_names_cat)

# add new poly names
feature_names_poly = reg.named_steps.poly.get_feature_names(feature_names)
#reg.named_steps.poly.get_params()
#len(feature_names_poly)
#feature_names_poly

from itertools import compress
features_pre_bool = reg.named_steps.feature_selection.get_support()
list(compress(feature_names_poly,list(features_pre_bool)))

['turnout19_14',
 'age_18_24_2017',
 'ag4_60_74_2017',
 'child_day_care_2018',
 'business_reg_2017',
 'dwellings_new_2012',
 'empl_manuf_2012']

In [153]:
#Getting the feature importances of the regressors  
feature_importance = reg.named_steps.feature_selection.estimator_.feature_importances_
feature_importance.shape

(162,)

In [178]:
feat_imp_df = pd.DataFrame({'feature':feature_names_poly,'importance':feature_importance})
feat_imp_df[feat_imp_df.importance>feat_imp_df.importance.mean()
           ].sort_values(by='importance', ascending=False)#.shape

Unnamed: 0,feature,importance
19,age_18_24_2017,0.491582
41,child_day_care_2018,0.413714
42,business_reg_2017,0.021315
116,empl_manuf_2012,0.014462
3,turnout19_14,0.009586
106,dwellings_new_2012,0.007987
22,ag4_60_74_2017,0.007955


In [179]:
len([a for a in reg_res.coef_ if a>0])

3

In [159]:
feat_imp_df.importance.mean()

0.006172839506172839

In [180]:
feat_imp_df#[feat_imp_df.coefficient>0]

Unnamed: 0,feature,importance
0,1,0.000000
1,turnout14,0.000000
2,turnout19,0.000000
3,turnout19_14,0.009586
4,debt_2013,0.000237
5,debt_2014,0.000000
6,debt_2015,0.000275
7,debt_2016,0.000639
8,debt_2017,0.000454
9,debt_2018,0.000253


In [110]:
# getting rescaled coefficients following the lines of 
# https://stackoverflow.com/questions/31029340/
# how-to-adjust-scaled-scikit-learn-logicistic-regression-coeffs-to-score-a-non-sc/38836670
# retrieving scales from pipe was not straight forward

scaler = reg.named_steps.preprocessor.transformers[0][1].get_params()['scaler']

#scaler.scales_ does not work therefore:
scaler_x = StandardScaler().fit(X[numeric_features]).scale_[feature_mask[1:]] # here again, the first value is not, this is the categorical value

len(scaler_x)

  return self.partial_fit(X, y)


NameError: name 'feature_mask' is not defined

In [None]:
#reg_res.feature_importances_
coefficients_rescaled = reg_res.coef_[1:]/scaler_x
pd.DataFrame({'Regressor':coefficient_names, 'Coefficients':coefficients_rescaled})
#reg_res.feature_importances_[[a!=0 for a in reg_res.feature_importances_]]
#reg_res.coef_

In [None]:
np.array(X.columns).reshape(1, -1).shape
#X.shape

In [161]:
reg_res.coef_[1:]

array([-1.31048183,  0.88979463,  2.16184925, -0.70593208, -0.19359235,
        0.13915398])