## Preprocessing

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

# loadarff returns a (records, metadata) pair; only the records (index 0) are
# turned into a DataFrame.
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0])

# Replace the raw byte-string `class` column with a boolean `bankruptcy`
# target (True where the label is b'1'). Popping removes `class` and the
# assignment appends the new column at the end of the frame.
df['bankruptcy'] = df.pop('class') == b'1'

# Give the 64 leading columns short positional names X01..X64; the target
# stays last.
df.columns = ['X%02d' % idx for idx in range(1, 65)] + ['bankruptcy']

In [2]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. `SimpleImputer` is its replacement: it always imputes
# column-wise (so the old `axis=0` argument no longer exists) and the
# missing-value marker is the float NaN rather than the string 'NaN'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# The frame mixes numeric feature columns with the boolean `bankruptcy`
# column, so cast to float explicitly instead of relying on implicit
# conversion of a mixed-dtype array. The label column is kept in the matrix
# so that `imputed_data` still has the target as its last column.
# NOTE(review): the imputer is fitted on the full data set, before the
# train/test split performed later in the notebook, so test rows contribute
# to the column means. Consider moving imputation into the pipelines.
imputed_data = imr.fit_transform(df.values.astype(float))

In [3]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X = imputed_data[:, :-1]
y = imputed_data[:, -1]

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

## PCA (3 components) + logistic regression: pipeline with grid search and 10-fold cross-validation to find the optimal hyperparameters

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Pipeline: standardise the features, project onto 3 principal components,
# then fit a logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=3),
    LogisticRegression(random_state=1),
)

# Candidate values for the logistic regression `C` parameter.
param_range_lr = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid_lr = [{'logisticregression__C': param_range_lr}]

# Exhaustive search over the grid, scored by accuracy with 10-fold
# cross-validation on the training split, using all available cores.
# NOTE(review): accuracy can be uninformative if the classes are heavily
# imbalanced -- confirm the class balance before trusting this score.
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

print(gs_lr.best_score_)
print(gs_lr.best_params_)

0.946454625036
{'logisticregression__C': 0.01}


## PCA (3 components) + decision tree: pipeline with grid search and 10-fold cross-validation to find the optimal hyperparameters

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Pipeline: standardise the features, project onto 3 principal components,
# then fit a decision tree.
pipe_dtree = make_pipeline(
    StandardScaler(),
    PCA(n_components=3),
    DecisionTreeClassifier(random_state=1),
)

# Tree depths to try; the split criterion is pinned to Gini impurity.
param_range_dtree = [2, 4, 6, 8, 10, 12, 14]
param_grid_dtree = [
    {
        'decisiontreeclassifier__max_depth': param_range_dtree,
        'decisiontreeclassifier__criterion': ['gini'],
    }
]

# Exhaustive search over the grid, scored by accuracy with 10-fold
# cross-validation on the training split, using all available cores.
gs_dtree = GridSearchCV(
    estimator=pipe_dtree,
    param_grid=param_grid_dtree,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

print(gs_dtree.best_score_)
print(gs_dtree.best_params_)

0.947475926466
{'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_depth': 2}


## PCA (3 components) + SVM: pipeline with grid search and 10-fold cross-validation to find the optimal hyperparameters

In [12]:
# Pipeline: standardise the features, project onto 3 principal components,
# then fit a support vector classifier.
pipe_svc = make_pipeline(
    StandardScaler(),
    PCA(n_components=3),
    SVC(random_state=1),
)

# The same candidate values are reused for both C and gamma.
param_range_svc = [0.01, 0.1, 1]

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over C and gamma.
param_grid_svc = [
    {
        'svc__C': param_range_svc,
        'svc__kernel': ['linear'],
    },
    {
        'svc__C': param_range_svc,
        'svc__gamma': param_range_svc,
        'svc__kernel': ['rbf'],
    },
]

# Exhaustive search over both sub-grids, scored by accuracy with 10-fold
# cross-validation on the training split, using all available cores.
gs_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

print(gs_svc.best_score_)
print(gs_svc.best_params_)

0.947475926466
{'svc__C': 0.01, 'svc__kernel': 'linear'}
