# Congressional Voting

**Kaggle: 184.702 TU ML WS 20**

**Goal: Predict the party of a congress member.**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.feature_selection import SelectPercentile, chi2, SelectFromModel
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn import tree
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import time

### Get the Data

In [None]:
# Load the learning (.lrn) split of the competition data into a DataFrame.
data = pd.read_csv('./184702-tu-ml-ws-20-congressional-voting/CongressionalVotingID.shuf.lrn.csv')

In [None]:
data

### Basic Data Information

In [None]:
# Column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics restricted to the object (text) columns.
data.describe(include = 'object')

### Remove identifiers

In [None]:
# Set the identifier column aside as its own one-column frame, then take it
# out of the table so it is not used as a feature.
idsTrain = data[['ID']]
data = data.drop(columns=['ID'])

### Missing Data

**Replace 'unknown' with a recognised missing-value marker (NaN)**

In [None]:
# Turn the literal string "unknown" into NaN so the isnull()/count() calls
# below treat it as missing.
data.replace("unknown", np.nan, inplace = True)

**Missing data per feature**

In [None]:
missing_values_feature = data.isnull().sum(axis=0)
missing_values_feature

In [None]:
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
plt.bar(missing_values_feature.axes[0].to_list(), missing_values_feature.values)

**Missing data per column**

In [None]:
# Share of NaN entries per column, expressed as a percentage.
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_data_columns = percent_missing.to_frame(name='percent_missing (%)')
# Sorted copy for display, columns with the most gaps first.
sort_data = missing_value_data_columns.sort_values('percent_missing (%)', ascending=False)
sort_data

**Remove columns that have more than 30% (?) of missing values**

In [None]:
# Keep only the columns whose missing share is below 30 %.
below_threshold = missing_value_data_columns['percent_missing (%)'] < 30
to_keep = missing_value_data_columns.index[below_threshold].tolist()
data = data[to_keep]
data

**Missing data per row**

In [None]:
#pd.set_option('display.max_rows', None)
# Share of NaN entries per row, in percent; count() only counts non-missing cells.
filled_per_row = data.count(axis=1)
percent_missing = (1 - filled_per_row / len(data.columns)) * 100
missing_value_data_rows = percent_missing.to_frame(name='percent_missing (%)')
# Sorted copy for display, rows with the most gaps first.
sort_data = missing_value_data_rows.sort_values('percent_missing (%)', ascending=False)
sort_data

**Remove rows that have more than 50% (?) of missing values**

In [None]:
#pd.set_option('display.max_rows', 10)
# Rows with at least half of their values missing are discarded.
too_sparse = missing_value_data_rows['percent_missing (%)'] >= 50
to_exclude = missing_value_data_rows[too_sparse]
data = data.drop(index=to_exclude.index)
# Relabel the remaining rows 1..n.
data.index = np.arange(1, len(data) + 1)
data.shape

**Decision: Replace missing values with each column's mode (its most frequent value)**

In [None]:
# Fill the gaps of every object (text) column with that column's most
# frequent value.
object_columns = data.select_dtypes(include=['object']).columns.tolist()
for column_name in object_columns:
    most_frequent = data[column_name].mode()[0]
    data[column_name] = data[column_name].fillna(most_frequent)

data

**Check no missing data**

In [None]:
# Heatmap of the boolean isnull() mask: any remaining NaN cell would show up
# as a contrasting mark.
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')

**Replace 'n' and 'y' with numeric values (0 and 1)**

In [None]:
# Encode the votes numerically in a single pass: "n" -> 0, "y" -> 1.
vote_codes = {"n": 0, "y": 1}
data.replace(vote_codes, inplace = True)

### Exploratory Data Analysis

**Class**

In [None]:
sns.set_style('darkgrid')
ax = sns.countplot(x = data['class'])

total = len(data['class'])

# Write each bar's share of all rows, centred just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    share_label = '{:.1f}%'.format(100 * bar_height/total)
    ax.text(label_x, bar_height + 3, share_label, ha="center")

### Target Split

In [None]:
# Separate the target column from the feature columns.
y = data['class']
X = data.drop(columns=['class'])

### Feature Selection

**Choose one of the methods:**
1. SelectPercentile (chi2)
2. SelectFromModel (LinearSVC)
3. SelectFromModel (LogisticRegression)
4. SelectFromModel (ExtraTreesClassifier)

In [None]:
# 0 matches none of the numbered methods, so no feature selector is built.
featureselection_method = 0

In [None]:
# Build the (still unfitted) feature selector for the chosen method.
# With featureselection_method == 0 no branch matches and no selector is created.
if featureselection_method == 1:
    # Univariate chi-squared scoring, keeping the best-scoring half of the features.
    selection = SelectPercentile(chi2, percentile=50)
elif featureselection_method == 2:
    clf = LinearSVC()
    #clf = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X, y)
    selection = SelectFromModel(clf, prefit=False)
elif featureselection_method == 3:
    clf = LogisticRegression()
    #clf = LogisticRegression(C=0.2, penalty="l2", dual=False, max_iter=200).fit(X, y)
    selection = SelectFromModel(clf, prefit=False)
elif featureselection_method == 4:
    # Not fitted here: with prefit=False, SelectFromModel fits its own clone of
    # the estimator, so a fit at this point would be discarded (and it would be
    # inconsistent with the other SelectFromModel branches).
    clf = ExtraTreesClassifier(n_estimators=50)
    selection = SelectFromModel(clf, prefit=False)

# Disabled: applying the selector and inspecting which columns survive.
#X_transformed = selection.fit_transform(X, y)
#selection.estimator_.feature_importances_
#columns = np.asarray(X.columns.values)
#support = np.asarray(selection.get_support())
#columns_with_support = columns[support]
#X_transformed.shape

### Scaling

**Choose one of the methods:**
1. Standardization (StandardScaler)
2. Standardization (RobustScaler)
3. MinMaxScaler
4. Normalization

In [None]:
# 0 matches none of the numbered methods, so no scaler is applied.
scaling_method = 0

In [None]:
# Scaler class behind each numbered option; any other value means "no scaling".
scaler_classes = {
    1: StandardScaler,
    2: RobustScaler,
    3: MinMaxScaler,
    4: Normalizer,
}

if scaling_method in scaler_classes:
    scaler = scaler_classes[scaling_method]()
    # The scaler is fitted on X: the feature-selected matrix X_transformed is
    # never created (its assignment is commented out), so referring to it
    # raised a NameError for every non-zero option.
    X_scaled = scaler.fit_transform(X)

In [None]:
# Continue with the unscaled features; this assignment replaces whatever
# X_scaled the scaling cell may have produced.
X_scaled = X

### Outliers Detection

**Using mathematical function Z-Score**

In [None]:
z = np.abs(stats.zscore(X_scaled))
threshold = 3
outliers_rows = np.where(z > threshold)
print(np.where(z > threshold))
# The first array contains the list of row numbers and second array respective column numbers

In [None]:
len(set(outliers_rows[0]))

### Data Preparation

**Remove the identified outliers**

In [None]:
#X_prepared = X_scaled[(np.abs(stats.zscore(X_scaled)) < threshold).all(axis=1)]
#X_prepared = X_prepared.to_numpy()
#X_prepared.shape

In [None]:
y = y.drop(outliers_rows[0])
y = y.to_numpy()
y.shape

In [None]:
X_prepared = X
X_prepared = X_prepared.to_numpy()

### Train Dataset Split

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.30)

### K-Fold Cross Validation

In [None]:
n_folds = 5

# A fixed seed makes kf.split() return the same shuffled folds on every call,
# so every classifier is scored on identical splits and the ranking is
# reproducible between runs. With random_state=None each call reshuffled.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

### Models Prediction

**Types:**
- Linear Classifiers: Logistic Regression, Naive Bayes Classifier
- Nearest Neighbor
- Support Vector Machines
- Decision Trees
- Random Forest
- Neural Networks

In [None]:
# Candidate models keyed by display name, all with default settings except
# the perceptron's raised iteration limit.
d = {
    "Logistic Regression": LogisticRegression(),
    "Gaussian Naive Bayes": GaussianNB(),
    "KNearest Neighbors": KNeighborsClassifier(),
    "SVM rbf": SVC(),
    "SGD Classifier": SGDClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Multi-layer Perceptron Classifier": MLPClassifier(max_iter=1000),
}

In [None]:
# Mean cross-validated accuracy per model; scoreList[k] belongs to nameList[k].
scoreList = []
nameList = []

from sklearn.model_selection import cross_val_score

for name, clf in d.items():
    print("\n--------------",name,"---------------\n")
    start = time.time()
    i = 0
    fold_scores = []
    mislabeled_points = 0
    for i, (train_index, test_index) in enumerate(kf.split(X_prepared), start=1):
        X_train, X_test = X_prepared[train_index], X_prepared[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        score = accuracy_score(y_test, predictions)
        fold_errors = (y_test != predictions).sum()
        print("Fold", i, 'Accuracy: {0:.2%}'.format(score),
              "-> %d mislabeled points of %d total points"% (fold_errors, X_test.shape[0]))
        fold_scores.append(score)
        mislabeled_points = mislabeled_points + fold_errors

    # Per-model summary, reported once the last fold has been scored.
    final_score = sum(fold_scores)
    if i == n_folds:
        end = time.time()
        scoreList.append(final_score/n_folds)
        nameList.append(name)
        print('Final Accuracy: {0:.2%} -> Time: {1:.3} seconds'.format(final_score/n_folds, end - start),
              "-> %d mislabeled points of %d total points\n"% (mislabeled_points, X_prepared.shape[0]))

In [None]:
# (score, name) pairs ordered from highest to lowest score.
ranking = sorted(zip(scoreList, nameList), reverse=True)
print("\nClassifiers from best to worst:")
for position, (mean_accuracy, classifier_name) in enumerate(ranking, start=1):
    print(position, ') {0:35} Score: {1:.2%}'.format(classifier_name, mean_accuracy))

### Hyperparameter optimization

In [None]:
# ranking is ordered best-first and holds (score, name) pairs, so this picks
# the name of the top-scoring classifier.
bestClassifierName = ranking[0][1]

In [None]:
# Grid-search space for every candidate model, keyed by its name in `d`.
# An empty dict means "evaluate the estimator's current settings only".
param_grids = {
    # Split by penalty: only liblinear and saga support 'l1'. Crossing 'l1'
    # with newton-cg, lbfgs or sag produced fits that fail inside the search.
    'Logistic Regression': [
        {
            'penalty': ['l1'],
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'saga'],
            'max_iter': [100, 1000],
        },
        {
            'penalty': ['l2'],
            'C': [0.1, 1, 10],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'max_iter': [100, 1000],
        },
    ],
    'Gaussian Naive Bayes': {},
    'KNearest Neighbors': {
        'n_neighbors': [3, 5, 7, 11, 13, 15, 17, 25, 30, 50],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'SVM rbf': {
        'class_weight': ['balanced', None],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'linear'],
    },
    'SGD Classifier': {},
    'Decision Tree': {},
    'Random Forest': {
        'n_estimators': [10, 20, 30, 50, 100, 200, 1000],
        'max_depth': [1, 10, 20, None],
        'bootstrap': [True, False],
    },
    'Multi-layer Perceptron Classifier': {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'max_iter': [200, 1000],
        'alpha': 10.0 ** -np.arange(3, 7),
        'hidden_layer_sizes': [10, 20, 50, 100],
    },
}

param_grid = param_grids[bestClassifierName]
# No sampling distributions are defined for any model, so the randomized
# search has nothing to vary beyond the estimator's current settings.
param_randomized = {}

**GridSearchCV**

In [None]:
# Exhaustive search over param_grid. It is trained and scored on the
# X_train/X_test split left over from the last cross-validation fold.
start = time.time()
clf_gridsearch = GridSearchCV(
    estimator=d.get(bestClassifierName),
    param_grid=param_grid,
    verbose=0,
).fit(X_train, y_train)
print(clf_gridsearch.best_params_)
predictions = clf_gridsearch.predict(X_test)
score_gridsearch = accuracy_score(y_test, predictions)
end = time.time()
elapsed_gridsearch = end - start
print(bestClassifierName,
      "GridSearchCV - Accuracy: %0.3f" % score_gridsearch,
      "- Time: %0.2f" % elapsed_gridsearch,
      "seconds")

**RandomizedSearchCV**

In [None]:
# Randomized search over param_randomized. It is trained and scored on the
# X_train/X_test split left over from the last cross-validation fold.
start = time.time()
clf_randomizedsearch = RandomizedSearchCV(
    estimator=d.get(bestClassifierName),
    param_distributions=param_randomized,
    random_state=0,
).fit(X_train, y_train)
print(clf_randomizedsearch.best_params_)
predictions = clf_randomizedsearch.predict(X_test)
score_randomizedsearch = accuracy_score(y_test, predictions)
end = time.time()
elapsed_randomizedsearch = end - start
print(bestClassifierName,
      "RandomizedSearchCV - Accuracy: %0.3f" % score_randomizedsearch,
      "- Time: %0.2f" % elapsed_randomizedsearch,
      "seconds")

In [None]:
# Keep the refitted estimator of whichever search scored higher on the
# held-out fold; on a tie the randomized search wins.
winning_search = (
    clf_gridsearch
    if score_gridsearch > score_randomizedsearch
    else clf_randomizedsearch
)
clf = winning_search.best_estimator_

**Best classifier after hyperparameter tuning**

In [None]:
# Display the selected estimator.
clf

### Test Data

In [None]:
# Load the test (.tes) split of the competition data.
testData = pd.read_csv('./184702-tu-ml-ws-20-congressional-voting/CongressionalVotingID.shuf.tes.csv')

In [None]:
testData

**Missing Data**

In [None]:
# Mark 'unknown' answers as missing.
testData.replace("unknown", np.nan, inplace = True)

# Fill each object (text) column's gaps with that column's most frequent value.
text_columns = testData.select_dtypes(include=['object']).columns.tolist()
for column_name in text_columns:
    most_frequent = testData[column_name].mode()[0]
    testData[column_name] = testData[column_name].fillna(most_frequent)

# Encode the votes numerically: "n" -> 0, "y" -> 1.
testData.replace({"n": 0, "y": 1}, inplace = True)

In [None]:
# Inspect the test frame after imputation and numeric encoding.
testData

**Remove identifiers**

In [None]:
# Set the identifier column aside for the submission file, then take it out
# of the test features.
idsTest = testData[['ID']]
testData = testData.drop(columns=['ID'])

**Cross-validated predictions of the best classifier on all training data**

In [None]:
# Check that the feature matrix and the target have the same number of rows.
X_prepared.shape

In [None]:
y.shape

In [None]:
# Out-of-fold predictions of the best estimator (clf) for every training row,
# followed by overall accuracy and the per-class report.
start = time.time()
predictions = cross_val_predict(estimator=clf, X=X_prepared, y=y, cv=5)
score = accuracy_score(y, predictions)
end = time.time()
elapsed_training = end - start
print('Accuracy Training Data: {0:.2%} - Time: {1:.3} seconds\n'.format(score, elapsed_training))

#print(confusion_matrix(y, predictions))
training_report = classification_report(y, predictions)
print(training_report)

### Final Prediction

In [None]:
# Use exactly the feature columns of the training matrix, in the same order
# and as a bare array like X_prepared. testData still holds every column of
# the CSV, including any that were dropped from the training data for having
# too many missing values.
test_features = testData[X.columns].to_numpy()

# So far clf has only been fitted on the training part of one CV fold (the
# hyperparameter searches used X_train, and cross_val_predict fits clones),
# so refit it on all prepared training rows before the final prediction.
clf.fit(X_prepared, y)
predictions = clf.predict(test_features)

In [None]:
# Inspect the predicted labels for the test rows.
predictions

### Join IDs to create Submission Dataset

In [None]:
# Wrap the predicted labels in a one-column frame named 'Class'.
predictions = pd.DataFrame(predictions, columns=['Class']) 

In [None]:
# Place the test IDs and the predicted labels side by side (column-wise concat).
result = pd.concat([idsTest,predictions], axis=1)

In [None]:
result

In [None]:
# Write the submission file without the frame's index column.
result.to_csv('submission.csv', index=False)