# Congressional Voting

**Kaggle: 184.702 TU ML WS 20**

**Goal: Predict the party of a congress member.**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Get the Data

In [None]:
# Load the competition's learning (.lrn) CSV into a DataFrame.
data = pd.read_csv(
    './184702-tu-ml-ws-20-congressional-voting/CongressionalVotingID.shuf.lrn.csv'
)

In [None]:
# Display the loaded frame as the cell output.
data

### Basic Data Information

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
data.info()

In [None]:
# Summary statistics restricted to the object-typed columns.
data.describe(include='object')

### Missing Data

**Replace 'unknown' with NaN so that pandas recognises it as a missing value**

In [None]:
# Turn the literal string "unknown" into NaN, modifying the frame in place.
data.replace(to_replace="unknown", value=np.nan, inplace=True)

**Number of missing values per feature**

In [None]:
# Count the null cells in every column (sum runs down the rows by default).
missing_values_feature = data.isna().sum()
missing_values_feature

In [None]:
# Bar chart of the per-column missing counts, with the x tick labels rotated by 90 degrees.
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
feature_names = missing_values_feature.index.to_list()
plt.bar(feature_names, missing_values_feature.values)

**Percentage of missing values per column**

In [None]:
# Percentage of null cells in each column, relative to the number of rows.
percent_missing = data.isna().sum() * 100 / len(data)
missing_value_data_columns = percent_missing.to_frame(name='percent_missing (%)')
# Sorted copy for display (worst columns first); the unsorted frame keeps the column order.
sort_data = missing_value_data_columns.sort_values(by='percent_missing (%)', ascending=False)
sort_data

**Remove columns that have 30% (?) or more missing values**

In [None]:
# Keep only the columns whose missing percentage is strictly below 30.
below_threshold = missing_value_data_columns['percent_missing (%)'] < 30
to_keep = missing_value_data_columns.index[below_threshold].tolist()
data = data[to_keep]
data

**Missing data per row**

In [None]:
# Uncomment to show every row of the table below.
#pd.set_option('display.max_rows', None)
# Percentage of null cells in each row. DataFrame.count(axis=1) returns the
# number of non-null cells per row in a single vectorised call, replacing the
# Python-level lambda that was previously applied to every row.
percent_missing = (1 - data.count(axis=1) / len(data.columns)) * 100
missing_value_data_rows = percent_missing.to_frame(name='percent_missing (%)')
# sort_values returns a new frame, so missing_value_data_rows keeps its
# original row order; sort_data lists the rows with the most gaps first.
sort_data = missing_value_data_rows.sort_values('percent_missing (%)', ascending=False)
sort_data

**Remove rows that have 30% (?) or more missing values**

In [None]:
# Uncomment to limit table output to 10 rows again.
#pd.set_option('display.max_rows', 10)
# Rows whose missing percentage is 30 or higher are dropped.
too_sparse = missing_value_data_rows['percent_missing (%)'] >= 30
to_exclude = missing_value_data_rows[too_sparse]
data = data.drop(index=to_exclude.index)
# Renumber the remaining rows consecutively, starting at 1.
data.index = np.arange(1, len(data) + 1)
data.shape

**Decision: Replace missing values with the mode (most frequent value) of each column**

In [None]:
# Fill the gaps of every object-typed column with that column's most frequent value.
object_columns = data.select_dtypes(include=['object']).columns
for column_name in object_columns:
    # mode() returns a Series; label 0 is its first entry.
    most_frequent = data[column_name].mode()[0]
    data[column_name] = data[column_name].fillna(most_frequent)

data

**Check that no missing data remains**

In [None]:
# Heatmap of the null mask; any remaining missing cell shows up as a differently coloured cell.
null_mask = data.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

**Replace 'n' and 'y' with the numeric values 0 and 1**

In [None]:
# Encode the vote strings as integers, in place: "n" becomes 0, then "y" becomes 1.
vote_encoding = {"n": 0, "y": 1}
for vote_label, vote_code in vote_encoding.items():
    data.replace(vote_label, vote_code, inplace=True)

### Exploratory Data Analysis

**Class**

In [None]:
# Count plot of the 'class' column, each bar labelled with its percentage of all rows.
sns.set_style('darkgrid')
ax = sns.countplot(x=data['class'])

total = len(data['class'])

for bar in ax.patches:
    bar_height = bar.get_height()
    # Centre the label horizontally on the bar and place it 3 units above its top.
    label_x = bar.get_x() + bar.get_width() / 2.
    label_y = bar_height + 3
    share = 100 * bar_height / total
    ax.text(label_x, label_y, '{:.1f}%'.format(share), ha="center")

### Remove identifiers

In [None]:
# Set the ID column aside as its own single-column frame, then remove it from data.
id_column = 'ID'
idsTrain = data[id_column].to_frame()
data = data.drop(columns=id_column)

...