In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.impute import SimpleImputer

# Read the ARFF file; only the first element of the returned tuple (the records) is used.
data = arff.loadarff('../hw1/4year.arff')
df = pd.DataFrame(data[0])

# The raw 'class' column holds byte strings; b'1' becomes True in 'bankruptcy'.
df['bankruptcy'] = (df['class'] == b'1')
del df['class']
# Rename the columns to X01..X64, with the boolean target last.
df.columns = ['X{0:02d}'.format(k) for k in range(1, 65)] + ['bankruptcy']

# Replace NaNs with the per-column mean. df.values includes the 'bankruptcy'
# column, so the target ends up as the last column of X_imp.
# NOTE(review): the imputer is fitted on every row, before any train/test split
# is taken from X_imp -- consider fitting it on the training portion only.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_imp = imp_mean.fit_transform(df.values)

# Keep the summary as the cell's last expression so it is actually displayed;
# placed mid-cell its result was computed and thrown away.
df.describe()

In [3]:


# The last column of the imputed matrix is the target; all preceding columns are features.
y = X_imp[:, -1]
X = X_imp[:, :-1]

# 70/30 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [4]:
# Fit the scaler on the training split only, then apply the same
# fitted transform to both the training and the test split.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [5]:
from sklearn.decomposition import PCA

# Fit a 3-component projection on the standardised training data,
# then project the training and test splits with that same fitted PCA.
pca = PCA(n_components=3).fit(X_train_std)
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train_std, X_test_std))


Apply logistic regression (LR), SVM, and decision tree classifiers below.

In [6]:
# Empty container; fitted grid-search objects are appended to it by later cells.
estimators = list()

In [7]:
from sklearn.pipeline import Pipeline

# Pipeline: standardise -> project onto 3 principal components -> logistic regression.
# Because preprocessing lives inside the pipeline, GridSearchCV refits it on the
# training part of every CV fold rather than on the full training set.
steps = [('scaler', StandardScaler()), ('pca', PCA(n_components=3)), ('estimator', LogisticRegression())]
estimator = Pipeline(steps)
# Show the tunable parameter names; the 'estimator__' prefix addresses the final step.
print(estimator.get_params().keys())

# Candidate values for LogisticRegression's C parameter.
param_grid = {'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(estimator, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score measured on the training set, so label it accordingly.
print('the best paras are :{}\nand the mean CV score is:{}'.format(grid.best_params_, grid.best_score_))
print('Test accuracy:', grid.score(X_test, y_test))

# Keep the fitted search; `grid` is reassigned by the next model cell.
estimators.append(grid)
pd.DataFrame(grid.cv_results_)

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'pca', 'estimator', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'estimator__C', 'estimator__class_weight', 'estimator__dual', 'estimator__fit_intercept', 'estimator__intercept_scaling', 'estimator__l1_ratio', 'estimator__max_iter', 'estimator__multi_class', 'estimator__n_jobs', 'estimator__penalty', 'estimator__random_state', 'estimator__solver', 'estimator__tol', 'estimator__verbose', 'estimator__warm_start'])
the best paras are :{'estimator__C': 0.001}
and the training score are:0.9470381562426848
Test accuracy: 0.9462219196732471


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.351622,0.181391,0.004579,0.002591,0.001,{'estimator__C': 0.001},0.947522,0.946064,0.947522,0.947522,0.947445,0.945985,0.947445,0.948905,0.944526,0.947445,0.947038,0.001143,1
1,0.201127,0.113257,0.002995,0.000705,0.01,{'estimator__C': 0.01},0.947522,0.944606,0.946064,0.947522,0.944526,0.945985,0.947445,0.948905,0.944526,0.947445,0.946455,0.001465,2
2,0.420845,0.363503,0.004816,0.003083,0.1,{'estimator__C': 0.1},0.947522,0.944606,0.946064,0.947522,0.944526,0.945985,0.947445,0.948905,0.944526,0.947445,0.946455,0.001465,2
3,0.345148,0.143777,0.00834,0.004857,1.0,{'estimator__C': 1},0.947522,0.944606,0.946064,0.947522,0.944526,0.945985,0.947445,0.948905,0.944526,0.947445,0.946455,0.001465,2
4,0.211764,0.12166,0.004065,0.004311,10.0,{'estimator__C': 10},0.947522,0.944606,0.946064,0.947522,0.944526,0.945985,0.947445,0.948905,0.944526,0.947445,0.946455,0.001465,2
5,0.16549,0.040361,0.003201,0.001051,100.0,{'estimator__C': 100},0.947522,0.944606,0.946064,0.947522,0.944526,0.945985,0.947445,0.948905,0.944526,0.947445,0.946455,0.001465,2


In [None]:
from sklearn.pipeline import Pipeline

# Pipeline: standardise -> project onto 3 principal components -> support vector classifier.
# Because preprocessing lives inside the pipeline, GridSearchCV refits it on the
# training part of every CV fold rather than on the full training set.
steps = [('scaler', StandardScaler()), ('pca', PCA(n_components=3)), ('estimator', SVC())]
estimator = Pipeline(steps)
# Show the tunable parameter names; the 'estimator__' prefix addresses the final step.
print(estimator.get_params().keys())

# Search over two kernels and two values of SVC's C parameter.
param_grid = {'estimator__kernel': ('linear', 'rbf'), 'estimator__C': [0.01, 0.1]}

grid = GridSearchCV(estimator, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score measured on the training set, so label it accordingly.
print('the best paras are :{}\nand the mean CV score is:{}'.format(grid.best_params_, grid.best_score_))
print('Test accuracy:', grid.score(X_test, y_test))

# Keep the fitted search; `grid` is reassigned by the next model cell.
estimators.append(grid)
pd.DataFrame(grid.cv_results_)

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'pca', 'estimator', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'estimator__C', 'estimator__break_ties', 'estimator__cache_size', 'estimator__class_weight', 'estimator__coef0', 'estimator__decision_function_shape', 'estimator__degree', 'estimator__gamma', 'estimator__kernel', 'estimator__max_iter', 'estimator__probability', 'estimator__random_state', 'estimator__shrinking', 'estimator__tol', 'estimator__verbose'])


In [None]:
from sklearn.pipeline import Pipeline

# Pipeline: standardise -> project onto 3 principal components -> decision tree.
# The tree is seeded so that tie-breaking between equally good splits is
# reproducible across runs, matching the seeded train/test split.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('estimator', DecisionTreeClassifier(random_state=0)),
]
estimator = Pipeline(steps)
# Show the tunable parameter names; the 'estimator__' prefix addresses the final step.
print(estimator.get_params().keys())

# 2 criteria x 20 depths x 5 leaf sizes = 200 candidates, each evaluated with 10-fold CV.
param_grid = {
    'estimator__criterion': ('gini', 'entropy'),
    'estimator__max_depth': np.arange(1, 21),
    'estimator__min_samples_leaf': [1, 5, 10, 50, 100],
}

grid = GridSearchCV(estimator, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score measured on the training set, so label it accordingly.
print('the best paras are :{}\nand the mean CV score is:{}'.format(grid.best_params_, grid.best_score_))
print('Test accuracy:', grid.score(X_test, y_test))

# Keep the fitted search alongside any others collected in `estimators`.
estimators.append(grid)
pd.DataFrame(grid.cv_results_)

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'pca', 'estimator', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'estimator__ccp_alpha', 'estimator__class_weight', 'estimator__criterion', 'estimator__max_depth', 'estimator__max_features', 'estimator__max_leaf_nodes', 'estimator__min_impurity_decrease', 'estimator__min_impurity_split', 'estimator__min_samples_leaf', 'estimator__min_samples_split', 'estimator__min_weight_fraction_leaf', 'estimator__presort', 'estimator__random_state', 'estimator__splitter'])
