In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [3]:
# The raw file has no header row, so the column names are supplied here:
# the party label first, followed by the sixteen recorded votes.
column_names = [
    'target',
    'handicapped-infants',
    'water-project-cost-sharing',
    'adoption-of-the-budget-resolution',
    'physician-fee-freeze',
    'el-salvador-aid',
    'religious-groups-in-schools',
    'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras',
    'mx-missile',
    'immigration',
    'synfuels-corporation-cutback',
    'education-spending',
    'superfund-right-to-sue',
    'crime',
    'duty-free-exports',
    'export-administration-act-south-africa',
]

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data',
    header=None,
    names=column_names,
)

# Name of the label column, reused by later cells.
target = 'target'

df.head()

Unnamed: 0,target,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [4]:
# Missing votes appear as '?' in the raw data. Convert them to NaN so that
# pandas' missing-value tooling (isna / dropna) can see them.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df = df.replace('?', np.nan)

# Number of missing entries per column.
df.isna().sum()

target                                      0
handicapped-infants                        12
water-project-cost-sharing                 48
adoption-of-the-budget-resolution          11
physician-fee-freeze                       11
el-salvador-aid                            15
religious-groups-in-schools                11
anti-satellite-test-ban                    14
aid-to-nicaraguan-contras                  15
mx-missile                                 22
immigration                                 7
synfuels-corporation-cutback               21
education-spending                         31
superfund-right-to-sue                     25
crime                                      17
duty-free-exports                          28
export-administration-act-south-africa    104
dtype: int64

In [5]:
# Report the row count before and after discarding every row that contains
# at least one missing value.
print('Number of rows before removing rows with missing values:', len(df))

df = df.dropna()

print('Number of rows after removing rows with missing values:', len(df))

Number of rows before removing rows with missing values: 435
Number of rows after removing rows with missing values: 232


In [6]:
# Separate the label column from the feature columns.
y = df[target]
X = df.drop(columns=target)

In [7]:
# Print the value distribution of every feature column, one block per column.
for column_name in X.columns:
    print(column_name + ':')
    print(X[column_name].value_counts(), end='\n\n')

handicapped-infants:
n    136
y     96
Name: handicapped-infants, dtype: int64

water-project-cost-sharing:
n    125
y    107
Name: water-project-cost-sharing, dtype: int64

adoption-of-the-budget-resolution:
y    123
n    109
Name: adoption-of-the-budget-resolution, dtype: int64

physician-fee-freeze:
n    119
y    113
Name: physician-fee-freeze, dtype: int64

el-salvador-aid:
y    128
n    104
Name: el-salvador-aid, dtype: int64

religious-groups-in-schools:
y    149
n     83
Name: religious-groups-in-schools, dtype: int64

anti-satellite-test-ban:
y    124
n    108
Name: anti-satellite-test-ban, dtype: int64

aid-to-nicaraguan-contras:
y    119
n    113
Name: aid-to-nicaraguan-contras, dtype: int64

mx-missile:
n    119
y    113
Name: mx-missile, dtype: int64

immigration:
y    128
n    104
Name: immigration, dtype: int64

synfuels-corporation-cutback:
n    152
y     80
Name: synfuels-corporation-cutback, dtype: int64

education-spending:
n    124
y    108
Name: education-spending, 

In [8]:
# One-hot encode every feature column; each column expands into one indicator
# column per observed value (e.g. `<feature>_n` and `<feature>_y`).
X = pd.get_dummies(X, columns=list(X.columns))
X.head()

Unnamed: 0,handicapped-infants_n,handicapped-infants_y,water-project-cost-sharing_n,water-project-cost-sharing_y,adoption-of-the-budget-resolution_n,adoption-of-the-budget-resolution_y,physician-fee-freeze_n,physician-fee-freeze_y,el-salvador-aid_n,el-salvador-aid_y,...,education-spending_n,education-spending_y,superfund-right-to-sue_n,superfund-right-to-sue_y,crime_n,crime_y,duty-free-exports_n,duty-free-exports_y,export-administration-act-south-africa_n,export-administration-act-south-africa_y
5,1,0,0,1,0,1,1,0,0,1,...,1,0,0,1,0,1,0,1,0,1
8,1,0,0,1,1,0,0,1,0,1,...,0,1,0,1,0,1,1,0,0,1
19,0,1,0,1,0,1,1,0,1,0,...,1,0,1,0,1,0,0,1,0,1
23,0,1,0,1,0,1,1,0,1,0,...,1,0,1,0,1,0,0,1,0,1
25,0,1,1,0,0,1,1,0,1,0,...,1,0,1,0,1,0,0,1,0,1


In [9]:
# Class counts of the label column before it is encoded.
y.value_counts()

democrat      124
republican    108
Name: target, dtype: int64

In [10]:
# Map the string labels to integer codes, then show how many rows carry each code.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
pd.Series(y, name=target).value_counts()

0    124
1    108
Name: target, dtype: int64

In [11]:
# Balance the two classes by randomly duplicating minority-class rows.
# `fit_resample` is the supported sampler API; the older `fit_sample` alias was
# deprecated in imbalanced-learn 0.4 and later removed, so it fails on current
# releases.
# NOTE(review): resampling before cross-validation means duplicated rows can end
# up in both the training and validation folds, which may inflate the CV scores.
# Consider moving the sampler into an imblearn Pipeline so it runs per fold.
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(X, y)

# Class counts after resampling.
pd.DataFrame(data=y, columns=[target])[target].value_counts()

1    124
0    124
Name: target, dtype: int64

In [12]:
# Candidate classifiers keyed by a short name; random_state=0 is used wherever
# a seed is passed.
clfs = dict(
    lr=LogisticRegression(random_state=0),
    mlp=MLPClassifier(random_state=0),
    dt=DecisionTreeClassifier(random_state=0),
    rf=RandomForestClassifier(random_state=0),
    svc=SVC(random_state=0),
    knn=KNeighborsClassifier(),
    gnb=GaussianNB(),
    gbc=GradientBoostingClassifier(random_state=0),
)

In [13]:
# PCA sizes to try: a quarter, a half, and all of the feature columns.
n_features = X.shape[1]
n_components = [n_features // 4, n_features // 2, n_features]

# pipe_clfs[name][n_component] -> Pipeline(scaler [, PCA], classifier)
pipe_clfs = {}

for name, clf in clfs.items():
    pipe_clfs[name] = {}
    for n_component in n_components:
        steps = [('StandardScaler', StandardScaler())]
        # PCA is only inserted when it actually reduces the dimensionality;
        # the full-width variant goes straight from scaling to the classifier.
        if n_component < n_features:
            steps.append(('PCA', PCA(n_components=n_component, random_state=0)))
        steps.append(('clf', clf))
        pipe_clfs[name][n_component] = Pipeline(steps)

In [14]:
# Hyperparameter grids keyed by classifier name; filled in by the cells below.
param_grids = dict()

In [15]:
# logistic regression parameter grid

# Regularisation strengths: powers of ten from 10**-4 up to 10**4.
C_range = [10 ** exponent for exponent in range(-4, 5)]

# The multinomial sub-grid searches the same solvers minus 'liblinear'.
ovr_solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
multinomial_solvers = [solver for solver in ovr_solvers if solver != 'liblinear']

param_grid = [
    dict(clf__multi_class=['ovr'],
         clf__solver=ovr_solvers,
         clf__C=C_range),
    dict(clf__multi_class=['multinomial'],
         clf__solver=multinomial_solvers,
         clf__C=C_range),
]

param_grids['lr'] = param_grid

In [16]:
# multi-layer perceptron parameter grid: hidden-layer size crossed with activation
param_grid = [
    dict(clf__hidden_layer_sizes=[10, 100, 200],
         clf__activation=['identity', 'logistic', 'tanh', 'relu']),
]

param_grids['mlp'] = param_grid

In [17]:
# decision tree parameter grid: split and leaf size thresholds
param_grid = [
    dict(clf__min_samples_split=[2, 10, 30],
         clf__min_samples_leaf=[1, 10, 30]),
]

param_grids['dt'] = param_grid

In [18]:
# random forest parameter grid: forest size plus split and leaf size thresholds
param_grid = [
    dict(clf__n_estimators=[2, 10, 30],
         clf__min_samples_split=[2, 10, 30],
         clf__min_samples_leaf=[1, 10, 30]),
]

param_grids['rf'] = param_grid

In [19]:
# SVC parameter grid: penalty C, gamma setting, and kernel type
param_grid = [
    dict(clf__C=[0.01, 0.1, 1, 10, 100],
         clf__gamma=['auto', 'scale'],
         clf__kernel=['linear', 'poly', 'rbf', 'sigmoid']),
]

param_grids['svc'] = param_grid

In [20]:
# KNN parameter grid: neighbourhood sizes 1 through 10
param_grid = [dict(clf__n_neighbors=[k for k in range(1, 11)])]

param_grids['knn'] = param_grid

In [21]:
# GNB parameter grid: variance smoothing at 10**-10, 10**-9 and 10**-8
smoothing_values = [10 ** exponent for exponent in range(-10, -7)]
param_grid = [dict(clf__var_smoothing=smoothing_values)]

param_grids['gnb'] = param_grid

In [24]:
# GBC parameter grid
# The loss is intentionally left at the estimator's default instead of being
# pinned to "deviance": that name was deprecated in scikit-learn 1.1 and removed
# in 1.3 (renamed "log_loss"), so listing it makes every fit fail on current
# releases. The default is the same logistic loss under either name, so the
# set of searched models is unchanged on both old and new versions.
# NOTE(review): this grid has 7 * 3 * 12 * 12 = 3024 combinations per pipeline,
# which makes the search below expensive.
param_grid = [{
    "clf__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "clf__max_depth": [3, 5, 8],
    # Fractions of the training samples, 12 evenly spaced values in [0.1, 0.5].
    "clf__min_samples_split": np.linspace(0.1, 0.5, 12),
    "clf__min_samples_leaf": np.linspace(0.1, 0.5, 12),
}]

param_grids['gbc'] = param_grid

In [25]:
# Hyperparameter Tuning
# Each entry is a list of [best_score_, best_params_, best_estimator_]
best_score_param_estimators = []

# One seeded, shuffled, stratified 10-fold splitter shared by every search.
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Run one grid search per (classifier, PCA size) pipeline.
for name in pipe_clfs:
    for n_component in n_components:
        gs = GridSearchCV(estimator=pipe_clfs[name][n_component],
                          param_grid=param_grids[name],
                          scoring='accuracy',
                          n_jobs=-1,
                          cv=cv_splitter)
        gs.fit(X, y)

        # Record the winner of this search.
        best_score_param_estimators.append(
            [gs.best_score_, gs.best_params_, gs.best_estimator_]
        )



In [26]:
# Select the best model: order the results from highest to lowest best_score_
best_score_param_estimators.sort(key=lambda entry: entry[0], reverse=True)

# Show the top-ranked pipeline
print(best_score_param_estimators[0][2])

Pipeline(memory=None,
     steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=16, random_state=0,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))])


In [32]:
# Display the full list of [best_score_, best_params_, best_estimator_] entries.
best_score_param_estimators

[[0.9717741935483871,
  {'clf__activation': 'logistic', 'clf__hidden_layer_sizes': 10},
  Pipeline(memory=None,
       steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=16, random_state=0,
    svd_solver='auto', tol=0.0, whiten=False)), ('clf', MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
         beta_...=True, solver='adam', tol=0.0001,
         validation_fraction=0.1, verbose=False, warm_start=False))])],
 [0.9717741935483871,
  {'clf__min_samples_leaf': 1, 'clf__min_samples_split': 30},
  Pipeline(memory=None,
       steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=30,
        