In [1]:
import pandas as pd
import warnings
import os
warnings.filterwarnings("ignore")

In [2]:
X_balance_scale = pd.read_csv('data/X_balance_scale', delimiter=',')
y_balance_scale = pd.read_csv('data/y_balance_Scale', delimiter=',')

X_car_eval = pd.read_csv('data/X_car_eval', delimiter=',')
y_car_eval = pd.read_csv('data/y_car_eval', delimiter=',')

X_contraceptive = pd.read_csv('data/X_contraceptive', delimiter=',')
y_contraceptive = pd.read_csv('data/y_contraceptive', delimiter=',')

X_dermatology = pd.read_csv('data/X_dermatology', delimiter=',')
y_dermatology = pd.read_csv('data/y_dermatology', delimiter=',')

X_glass = pd.read_csv('data/X_glass', delimiter=',')
y_glass = pd.read_csv('data/y_glass', delimiter=',')

X_hayes = pd.read_csv('data/X_hayes', delimiter=',')
y_hayes = pd.read_csv('data/y_hayes', delimiter=',')

X_heart = pd.read_csv('data/X_heart', delimiter=',')
y_heart = pd.read_csv('data/y_heart', delimiter=',')

X_new_thyroid = pd.read_csv('data/X_new_thyroid', delimiter=',')
y_new_thyroid = pd.read_csv('data/y_new_thyroid', delimiter=',')

X_page = pd.read_csv('data/X_page', delimiter=',')
y_page = pd.read_csv('data/y_page', delimiter=',')

X_pen_based = pd.read_csv('data/X_pen_based', delimiter=',')
y_pen_based = pd.read_csv('data/y_pen_based', delimiter=',')

X_shuttle = pd.read_csv('data/X_shuttle', delimiter=',')
y_shuttle = pd.read_csv('data/y_shuttle', delimiter=',')

X_vertebra = pd.read_csv('data/X_vertebra', delimiter=',')
y_vertebra = pd.read_csv('data/y_vertebra', delimiter=',')

X_wine = pd.read_csv('data/X_wine', delimiter=',')
y_wine = pd.read_csv('data/y_wine', delimiter=',')

X_yeast = pd.read_csv('data/X_yeast', delimiter=',')
y_yeast = pd.read_csv('data/y_yeast', delimiter=',')

X_fars = pd.read_csv('data/X_fars', delimiter=',')
y_fars = pd.read_csv('data/y_fars', delimiter=',')

In [3]:
X_list = [
    X_balance_scale, X_car_eval, X_contraceptive,
    X_dermatology, X_glass, X_hayes, X_heart,
    X_new_thyroid, X_page, X_pen_based, X_shuttle,
    X_vertebra, X_wine, X_yeast, X_fars
]

y_list = [
    y_balance_scale, y_car_eval, y_contraceptive,
    y_dermatology, y_glass, y_hayes, y_heart,
    y_new_thyroid, y_page, y_pen_based, y_shuttle,
    y_vertebra, y_wine, y_yeast, y_fars
]

name_list = [
    'Balance Scale', 'Car Evaluation', 'Contraceptive',
    'Dermatology', 'Glass', 'Hayes', 'Heart',
    'New Thyroid', 'Page', 'Pen Based', 'Shuttle',
    'Vertebra', 'Wine', 'Yeast', 'FARS'
]

In [4]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from adacost import *


from utils import *
from tqdm import tqdm

In [5]:
clf_dict = {
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'AdaBoost_SAMME': AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.3, max_features=2, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(verbose=0), 
    'RUSBoostClassifier': RUSBoostClassifier(n_estimators=200, random_state=42),
}

Define different base learner for each appoarch

In [6]:
base_learners_dict = {
    'AdaBoost': [DecisionTreeClassifier(max_depth=1),
                 DecisionTreeClassifier(max_depth=3),
                 RandomForestClassifier(n_estimators=100)],
    'AdaBoost_SAMME': [DecisionTreeClassifier(max_depth=1),
                       DecisionTreeClassifier(max_depth=3),
                       RandomForestClassifier(n_estimators=100)],
    'GradientBoosting': [DecisionTreeClassifier(max_depth=3),
                         RandomForestClassifier(n_estimators=100),
                         LogisticRegression()],
    'XGBoost': [XGBClassifier(max_depth=3, n_estimators=100),
                XGBClassifier(max_depth=5, n_estimators=200),
                XGBClassifier(max_depth=3, n_estimators=200, booster='gblinear')],
    'CatBoost': [CatBoostClassifier(verbose=0),
                 CatBoostClassifier(depth=3, iterations=100),
                 CatBoostClassifier(depth=5, iterations=200)],
    'RUSBoostClassifier': [DecisionTreeClassifier(max_depth=1),
                            RandomForestClassifier(n_estimators=100),
                            LogisticRegression()],
    'AdaCost' : [DecisionTreeClassifier(max_depth=1),
                RandomForestClassifier(n_estimators=100),
                LogisticRegression()]
}

In [7]:
clf_score_dict = {}

for clf_name, clf in tqdm(clf_dict.items()):
    score = {}
    base_learners = base_learners_dict[clf_name]
    
    for i, (X, y) in enumerate(zip(X_list, y_list)):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        for n , base_learner in enumerate(base_learners):
            clf_b = clf.base_estimator_ = base_learner  
            clf_b.fit(X_train, y_train) 
        
            y_preds = clf_b.predict(X_test) 
            y_scores = clf_b.predict_proba(X_test)

            score_p = metric_list(y_test, y_preds, y_scores, len(y.iloc[:,0].unique()))
            score[f'{name_list[i]}_{type(base_learner).__name__}_{n}'] = score_p
    
    df = pd.DataFrame.from_dict(score, orient='index', columns=['accuracy', 'precision', 'recall', 'f1','Macro-Averaged AUPRC',
                                                            'F_measure Beta=10','Modified mcc','MCC metirc','Gmean'])
    clf_score_dict[clf_name] = df

  0%|          | 0/6 [00:00<?, ?it/s]

 67%|██████▋   | 4/6 [01:44<01:00, 30.16s/it]

Learning rate set to 0.5
0:	learn: 0.8114115	total: 699us	remaining: 69.2ms
1:	learn: 0.6795393	total: 1.21ms	remaining: 59.4ms
2:	learn: 0.5699800	total: 1.79ms	remaining: 57.9ms
3:	learn: 0.4982700	total: 2.21ms	remaining: 53.1ms
4:	learn: 0.4594548	total: 2.63ms	remaining: 50.1ms
5:	learn: 0.4276718	total: 3.04ms	remaining: 47.6ms
6:	learn: 0.4057498	total: 3.48ms	remaining: 46.3ms
7:	learn: 0.3971774	total: 3.9ms	remaining: 44.8ms
8:	learn: 0.3782458	total: 4.33ms	remaining: 43.8ms
9:	learn: 0.3622065	total: 4.78ms	remaining: 43ms
10:	learn: 0.3521202	total: 5.22ms	remaining: 42.2ms
11:	learn: 0.3423632	total: 5.62ms	remaining: 41.2ms
12:	learn: 0.3270549	total: 6.02ms	remaining: 40.3ms
13:	learn: 0.3142112	total: 6.41ms	remaining: 39.4ms
14:	learn: 0.3068842	total: 6.79ms	remaining: 38.5ms
15:	learn: 0.2962187	total: 7.21ms	remaining: 37.9ms
16:	learn: 0.2914422	total: 7.67ms	remaining: 37.5ms
17:	learn: 0.2823861	total: 8.07ms	remaining: 36.8ms
18:	learn: 0.2748415	total: 8.51ms	

100%|██████████| 6/6 [04:35<00:00, 45.97s/it]


In [8]:
clf_score_dict['RUSBoostClassifier']

Unnamed: 0,accuracy,precision,recall,f1,Macro-Averaged AUPRC,F_measure Beta=10,Modified mcc,MCC metirc,Gmean
Balance Scale_DecisionTreeClassifier_0,0.56,0.514409,0.56,0.531522,0.568625,0.555524,0.144254,0.200678,0.372573
Balance Scale_RandomForestClassifier_1,0.808,0.807944,0.808,0.804517,0.671766,0.807995,0.495388,0.670275,0.785208
Balance Scale_LogisticRegression_2,0.84,0.789772,0.84,0.804285,0.776403,0.835171,0.527102,0.733634,0.847458
Car Evaluation_DecisionTreeClassifier_0,0.679191,0.4613,0.679191,0.549431,0.661212,0.651227,0.0,0.0,0.0
Car Evaluation_RandomForestClassifier_1,0.962428,0.964409,0.962428,0.962891,0.929262,0.962608,0.894121,0.921405,0.625668
Car Evaluation_LogisticRegression_2,0.916185,0.916741,0.916185,0.916349,0.844125,0.916236,0.812081,0.823721,0.466947
Contraceptive_DecisionTreeClassifier_0,0.440678,0.194197,0.440678,0.269591,0.692132,0.395091,0.0,0.0,0.0
Contraceptive_RandomForestClassifier_1,0.549153,0.542555,0.549153,0.544036,0.558873,0.548546,0.286827,0.295009,0.179802
Contraceptive_LogisticRegression_2,0.545763,0.550661,0.545763,0.535977,0.550779,0.546204,0.284277,0.287641,0.155829
Dermatology_DecisionTreeClassifier_0,0.594595,0.460204,0.594595,0.495316,0.656855,0.579218,0.353353,0.518013,0.0


In [9]:
file_name_list = []

In [10]:
def save_result(result,file_name_list):
    for clf, score in result.items():
        score.to_csv(f'result/baselearner/{clf}.csv', index=True)
        file_name_list.append(f'{clf}.csv')

In [11]:
save_result(clf_score_dict,file_name_list)