## eXtreme Gradient Boosting (XGBoost)

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable across runs.
np.random.seed(1)

First, load the list of dataset names; the processed data for each dataset are read from disk later, inside `read_data`.

In [3]:
# Names of the datasets to evaluate, read as strings from a text file.
# The commented-out assignments are alternative selections (a single dataset
# or a different list file) kept for quick experiments.
#data_list = ['1paradox']
#data_list = np.loadtxt('data_list.txt',dtype='str')
data_list = np.loadtxt('data_list_30sets.txt',dtype='str')
#data_list = ['9coag']

print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival'
 '29parkinson' '30paradox2' '31renal' '33svr' '35pcos' '36probiotic'
 '101kidney' '102breast_cancer' '103diabetes_niddk'
 '104diabetic_retinopathy']


In [4]:
def read_data(data_id):
    """Load one dataset by index and return a scaled 50/50 train/test split.

    The dataset name is looked up in the module-level ``data_list``; the file
    ``data_processed_median.dat`` of that dataset is read with the last column
    taken as the label and all other columns as features.

    Parameters
    ----------
    data_id : int
        Position of the dataset name inside ``data_list``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features are min-max scaled with a scaler fitted on the training
        half only, so no test-set statistics leak into training.
    """
    data_name = data_list[data_id]
    print('data_name:',data_name)

    table = np.loadtxt('../classification_data/%s/data_processed_median.dat'%data_name)
    features, labels = table[:,:-1], table[:,-1]

    # make_data_balance is a project helper; presumably it equalises the
    # class counts (the counts printed below let the reader verify that).
    features, labels = make_data_balance(features,labels)
    print(np.unique(labels,return_counts=True))

    # Fixed seeds keep the shuffle and the split repeatable.
    features, labels = shuffle(features, labels, random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(
        features, labels, test_size=0.5, random_state=1)

    # Fit the scaler on the training half, then apply it to both halves.
    scaler = MinMaxScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    """Tune an XGBoost classifier by grid search and score it on the test split.

    A 4-fold cross-validated grid search over ``max_depth``,
    ``min_child_weight``, ``reg_lambda`` (L2) and ``reg_alpha`` (L1) is fitted
    on the training split (6 * 4 * 6 * 6 = 864 candidate settings). The best
    estimator found is then evaluated on the held-out test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices of the training and test splits.
    y_train, y_test
        Matching class labels. Precision and recall use scikit-learn's
        default positive label (1).

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, precision, recall, f1)`` measured on the test
        split. ``f1`` is 0.0 when precision and recall are both 0.
    """
    # NOTE(review): recent xgboost releases reportedly require class labels
    # encoded as 0..K-1 -- confirm the encoding of y_train against the
    # installed xgboost version.
    model = XGBClassifier(n_estimators=100,tree_method = 'auto')

    # Hyperparameter grid explored by the search.
    hyper_parameters = dict(
        max_depth = [2,4,6,8,10,12],
        min_child_weight = [0.2,0.5,0.8,1.],
        reg_lambda = np.logspace(-4,1,num=6),   # L2 regularization term
        reg_alpha = np.logspace(-4,1,num=6),    # L1 regularization term
    )

    # The `iid` keyword was removed from GridSearchCV in scikit-learn 0.24;
    # passing it (even as 'deprecated') raises TypeError on current releases,
    # and omitting it is equivalent on older ones.
    search = GridSearchCV(model, hyper_parameters, cv=4)
    search.fit(X_train, y_train)

    print('best_hyper_parameters:',search.best_params_)

    best_estimator = search.best_estimator_

    # Hard-label metrics.
    y_test_pred = best_estimator.predict(X_test)
    acc = accuracy_score(y_test,y_test_pred)
    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test,y_test_pred)

    # ROC AUC from the predicted probability of the second class (column 1).
    p_test_pred = best_estimator.predict_proba(X_test)[:,1]
    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)

    # Harmonic mean of precision and recall; guard the 0/0 case that occurs
    # when the model yields no true positives, which would otherwise raise
    # or produce NaN and contaminate downstream averages.
    pr_sum = precision + recall
    f1 = 2*precision*recall/pr_sum if pr_sum > 0 else 0.0

    return acc,roc_auc,precision,recall,f1

In [6]:
# Evaluate every dataset in turn and collect one score per dataset and metric.
n_data = len(data_list)
acc, roc_auc, precision, recall, f1_score = (np.zeros(n_data) for _ in range(5))

for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)
    scores = measure_performance(X_train,X_test,y_train,y_test)

    # measure_performance returns (acc, roc_auc, precision, recall, f1).
    (acc[data_id], roc_auc[data_id], precision[data_id],
     recall[data_id], f1_score[data_id]) = scores

    print(data_id,acc[data_id],roc_auc[data_id],precision[data_id],recall[data_id],f1_score[data_id])
    

data_name: 1paradox
(array([-1.,  1.]), array([60, 60]))
best_hyper_parameters: {'max_depth': 4, 'min_child_weight': 0.2, 'reg_alpha': 0.0001, 'reg_lambda': 10.0}
0 0.8 0.899529964747356 0.7037037037037037 0.8260869565217391 0.76
data_name: 2peptide
(array([-1.,  1.]), array([23, 23]))
best_hyper_parameters: {'max_depth': 2, 'min_child_weight': 0.2, 'reg_alpha': 0.0001, 'reg_lambda': 0.0001}
1 1.0 1.0 1.0 1.0 1.0
data_name: 3stigma
(array([-1.,  1.]), array([2725, 2725]))
best_hyper_parameters: {'max_depth': 2, 'min_child_weight': 0.2, 'reg_alpha': 0.0001, 'reg_lambda': 0.0001}
2 1.0 1.0 1.0 1.0 1.0
data_name: 4nki
(array([-1.,  1.]), array([77, 77]))
best_hyper_parameters: {'max_depth': 4, 'min_child_weight': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0}
3 0.8311688311688312 0.9028925619834711 0.8974358974358975 0.7954545454545454 0.8433734939759037
data_name: 5mental
(array([-1.,  1.]), array([147, 147]))
best_hyper_parameters: {'max_depth': 2, 'min_child_weight': 0.8, 'reg_alpha': 10.0

In [7]:
# Report each metric averaged over all evaluated datasets.
for label, metric_values in (
    ('acc_mean:', acc),
    ('roc_mean:', roc_auc),
    ('precision:', precision),
    ('recall:', recall),
    ('f1_score:', f1_score),
):
    print(label, metric_values.mean())

acc_mean: 0.8495535486942711
roc_mean: 0.883857625349843
precision: 0.855791652446917
recall: 0.8468876895453802
f1_score: 0.849533111431746


In [8]:
# Persist the per-dataset scores: one row per metric, in the order
# roc_auc, acc, precision, recall, f1_score; one column per dataset.
np.savetxt(
    'result_XGB.dat',
    (roc_auc, acc, precision, recall, f1_score),
    fmt='%f',
)