In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('./data/대장암 데이터.csv', engine='python')

## drop

In [4]:
drop_feature = ['ob_rt', 'analge', 'exercise', 'crc_hx']
for col in drop_feature:
    data.drop(col, axis=1, inplace=True)

## 결측치 처리

In [5]:
fill_mean_cols=['height','hct','plt','hs_crp','ft3','ferritin']
fill_mfv_cols=['ob_yn','bp_13580','alc_freq','rdw','ca','sbp','dbp']

for col in fill_mean_cols:
    data[col].fillna(data[col].mean(),inplace=True)
    
for col in fill_mfv_cols:
    data[col].fillna(data[col].mode()[0],inplace=True)

## Sampling

In [6]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]

In [7]:
from imblearn.under_sampling import RandomUnderSampler as down_sampler
from imblearn.over_sampling import RandomOverSampler as up_sampler

down=down_sampler(random_state=42)
up=up_sampler(random_state=42)

X_up,y_up=up.fit_resample(X,y)
X_down,y_down=down.fit_resample(X,y)

In [8]:
print('UP sampling : 1-{} 0-{}'.format(X_up[X_up[:,-1]==1].shape,X_up[X_up[:,-1]==0].shape))
print('Down sampling : 1-{} 0-{}'.format(X_down[X_down[:,-1]==1].shape,X_down[X_down[:,-1]==0].shape))

UP sampling : 1-(6341, 55) 0-(132411, 55)
Down sampling : 1-(84, 55) 0-(1836, 55)


In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

X_up_train,X_up_test,y_up_train,y_up_test=train_test_split(X_up,y_up,test_size=0.3,random_state=42,stratify=y_up)
X_down_train,X_down_test,y_down_train,y_down_test=train_test_split(X_down,y_down,test_size=0.3,random_state=42,stratify=y_down)

## PCA

In [10]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca=PCA(n_components=20)

## Model

In [11]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from tqdm import tqdm

# Support Vector Machine
- 선형, 비선형, 회귀, 이상치 탐색 문제에 사용할 수 있다
- 복잡한 분류 문제, 작거나 중간 크기의 데이터셋에 적합

## Large Margin Classification
SVM의 Decision Boundary는 클래스를 나누면서 제일 가까운 훈련 샘플로부터 가능한 멀리 떨어져 있다.

![](./img/large_margin_classification.JPG)
## 특성의 스케일에 민감함
![](./img/scale.JPG)
## Soft Margin Classification
- Hard Margin Classification
  - **모든** 샘플이 올바르게 분류.
  - 데이터가 선형적으로 구분될 수 있어야 함
  - 이상치에 민감
- Soft Margin Classification
  - 도로의 폭을 넓게 유지하면서
  - Margin 오류 사이의 적절한 균형
 '
![](./img/outlier.JPG)
## 라지 마진 VS 마진 오류   
- 하이퍼파라미터 C값이 작으면 폭이 커지며 마진 오류도 커진다

![](./img/margin.JPG)
## 비선형 분류  
![](./img/non-linear.JPG)

# 엑스트라 트리
- Random Forest에서 각 노드는 무작위로 서브셋을 만들어 사용
- 보통의 Decision Tree는 최적의 임계값을 찾는다
- Extra Tree는 무작위로 분할한 다음 최상의 분할을 선택

# 앙상블 학습
## 투표 기반 분류기
![](./img/ensemble.JPG)
### 직접 투표(Hard Voting)
- 각 분류기의 예측을 모아서 가장 많이 선택된 클래스를 예측

### 간접 투표(Soft Voting)
- 모든 분류기가 클래스의 확률을 예측할 수 있을 때
- 개별 분류기의 예측을 평균내어 확률이 가장 높은 클래스를 예측
- 확률이 높은 투표에 비중을 둬서 hard voting 보다 성능이 높다.


### Upsampling & PCA

In [12]:
#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
#     gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
#     svm.SVC(probability=True),
#     svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy', 'MLA Test Accuracy', 'MLA Precision', 'MLA Recall']
MLA_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0
for alg in tqdm(MLA):
    pipeline=Pipeline([('pca',pca),('MLA',alg)])
    pipeline.fit(X_up_train, y_up_train)
    y_up_pred = pipeline.predict(X_up_test)
    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    MLA_compare.loc[row_index, 'MLA Train Accuracy'] = pipeline.score(X_up_train, y_up_train)
    MLA_compare.loc[row_index, 'MLA Test Accuracy'] = metrics.accuracy_score(y_up_test, y_up_pred)
    MLA_compare.loc[row_index, 'MLA Precision'] = metrics.precision_score(y_up_test, y_up_pred)
    MLA_compare.loc[row_index, 'MLA Recall'] = metrics.recall_score(y_up_test, y_up_pred)

    row_index+=1

MLA_compare.sort_values(by = ['MLA Test Accuracy'], ascending = False, inplace = True)
MLA_compare

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [01:41<00:00,  5.17s/it]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Precision,MLA Recall
2,ExtraTreesClassifier,"{'bootstrap': False, 'class_weight': None, 'cr...",1.0,1.0,1.0,1.0
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",1.0,0.999808,0.999616,1.0
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999918,0.99779,0.995599,1.0
15,ExtraTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",1.0,0.99152,0.983322,1.0
14,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",1.0,0.990799,0.981931,1.0
12,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.983434,0.97564,0.953544,1.0
3,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.794978,0.792702,0.813891,0.758949
18,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.7902,0.786552,0.806317,0.754288
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.74043,0.737159,0.753622,0.704704
7,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...",0.704085,0.701413,0.696808,0.713112


### Downsampling & PCA

In [13]:
MLA_down_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0
for alg in tqdm(MLA):
    pipeline=Pipeline([('pca',pca),('MLA',alg)])
    pipeline.fit(X_down_train, y_down_train)
    y_down_pred = pipeline.predict(X_down_test)
    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_down_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_down_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    MLA_down_compare.loc[row_index, 'MLA Train Accuracy'] = pipeline.score(X_down_train, y_down_train)
    MLA_down_compare.loc[row_index, 'MLA Test Accuracy'] = metrics.accuracy_score(y_down_test, y_down_pred)
    MLA_down_compare.loc[row_index, 'MLA Precision'] = metrics.precision_score(y_down_test, y_down_pred)
    MLA_down_compare.loc[row_index, 'MLA Recall'] = metrics.recall_score(y_down_test, y_down_pred)

    row_index+=1

MLA_down_compare.sort_values(by = ['MLA Test Accuracy'], ascending = False, inplace = True)
MLA_down_compare

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:01<00:00,  9.44it/s]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Precision,MLA Recall
3,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.886161,0.706597,0.717949,0.680556
18,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.857887,0.703125,0.714286,0.677083
5,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': 'warn',...",0.68378,0.701389,0.688312,0.736111
16,LinearDiscriminantAnalysis,"{'n_components': None, 'priors': None, 'shrink...",0.695685,0.699653,0.686084,0.736111
7,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...",0.695685,0.699653,0.686084,0.736111
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.989583,0.680556,0.732143,0.569444
11,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.674851,0.666667,0.74,0.513889
10,BernoulliNB,"{'alpha': 1.0, 'binarize': 0.0, 'class_prior':...",0.667411,0.664931,0.655738,0.694444
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.771577,0.661458,0.676806,0.618056
2,ExtraTreesClassifier,"{'bootstrap': False, 'class_weight': None, 'cr...",1.0,0.657986,0.682731,0.590278


## Ensemble

In [16]:
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )
vote_est = [
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc',ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),

    ('lr', linear_model.LogisticRegressionCV()),
    
#     ('bnb', naive_bayes.BernoulliNB()),
#     ('gnb', naive_bayes.GaussianNB()),
    
    ('knn', neighbors.KNeighborsClassifier()),
    
#     ('svc', svm.SVC(probability=True)),
    
    ('xgb', XGBClassifier())

]


#Hard Vote or majority rules
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
vote_hard_cv = model_selection.cross_validate(vote_hard, X_up, y_up, cv  = cv_split,)
vote_hard.fit(X_up, y_up)

print("Hard Voting Training w/bin score mean: {:.2f}". format(vote_hard_cv['train_score'].mean()*100)) 
print("Hard Voting Test w/bin score mean: {:.2f}". format(vote_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_hard_cv['test_score'].std()*100*3))
print('-'*10)


#Soft Vote or weighted probabilities
vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, X_up, y_up, cv  = cv_split)
vote_soft.fit(X_up, y_up)

print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)





Hard Voting Training w/bin score mean: 92.57
Hard Voting Test w/bin score mean: 92.19
Hard Voting Test w/bin score 3*std: +/- 1.06
----------






Soft Voting Training w/bin score mean: 99.94
Soft Voting Test w/bin score mean: 99.67
Soft Voting Test w/bin score 3*std: +/- 0.12
----------


In [17]:
# Down_sampling
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
vote_hard_cv = model_selection.cross_validate(vote_hard, X_down, y_down, cv  = cv_split)
vote_hard.fit(X_down, y_down)

print("Hard Voting Training w/bin score mean: {:.2f}". format(vote_hard_cv['train_score'].mean()*100)) 
print("Hard Voting Test w/bin score mean: {:.2f}". format(vote_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_hard_cv['test_score'].std()*100*3))
print('-'*10)

vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, X_down, y_down, cv  = cv_split)
vote_soft.fit(X_down, y_down)

print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)







Hard Voting Training w/bin score mean: 93.39
Hard Voting Test w/bin score mean: 70.83
Hard Voting Test w/bin score 3*std: +/- 6.95
----------








Soft Voting Training w/bin score mean: 99.30
Soft Voting Test w/bin score mean: 70.78
Soft Voting Test w/bin score 3*std: +/- 5.61
----------


In [18]:
import time

## Grid Search

In [21]:
#WARNING: Running is very computational intensive and time expensive.
#Code is written for experimental/developmental purposes and not production ready!


#Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]


grid_param = [
            [{
            #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator, #default=50
            'learning_rate': grid_learn, #default=1
            #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R
            'random_state': grid_seed
            }],
       
    
#             [{
#             #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
#             'n_estimators': grid_n_estimator, #default=10
#             'max_samples': grid_ratio, #default=1.0
#             'random_state': grid_seed
#              }],

    
            [{
            #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators': grid_n_estimator, #default=10
            'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth, #default=None
            'random_state': grid_seed
             }],


#             [{
#             #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
#             #'loss': ['deviance', 'exponential'], #default=’deviance’
#             'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
#             'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
#             #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
#             'max_depth': grid_max_depth, #default=3   
#             'random_state': grid_seed
#              }],

    
#             [{
#             #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
#             'n_estimators': grid_n_estimator, #default=10
#             'criterion': grid_criterion, #default=”gini”
#             'max_depth': grid_max_depth, #default=None
#             'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
#             'random_state': grid_seed
#              }],
    
#             [{    
#             #GaussianProcessClassifier
#             'max_iter_predict': grid_n_estimator, #default: 100
#             'random_state': grid_seed
#             }],
        
    
#             [{
#             #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
#             'fit_intercept': grid_bool, #default: True
#             #'penalty': ['l1','l2'],
#             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
#             'random_state': grid_seed
#              }],
            
    
#             [{
#             #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
#             'alpha': grid_ratio, #default: 1.0
#              }],
    
    
#             #GaussianNB - 
#             [{}],
    
#             [{
#             #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
#             'n_neighbors': [1,2,3,4,5,6,7], #default: 5
#             'weights': ['uniform', 'distance'], #default = ‘uniform’
#             'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
#             }],
            
    
#             [{
#             #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
#             #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
#             #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#             'C': [1,2,3,4,5], #default=1.0
#             'gamma': grid_ratio, #edfault: auto
#             'decision_function_shape': ['ovo', 'ovr'], #default:ovr
#             'probability': [True],
#             'random_state': grid_seed
#              }],

    
#             [{
#             #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
#             'learning_rate': grid_learn, #default: .3
#             'max_depth': [1,2,4,6,8,10], #default 2
#             'n_estimators': grid_n_estimator, 
#             'seed': grid_seed  
#              }]
            ]

start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip (vote_est, grid_param):
    
    start = time.perf_counter()        
    best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = 'roc_auc')
    best_search.fit(X_up, y_up)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))
    clf[1].set_params(**best_param) 


run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))

print('-'*10)

KeyboardInterrupt: 