In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.datasets import fetch_openml

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import category_encoders as ce

from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from utils import *
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Each source is described once as (display name, repository id) so the
# dataset list and its name list cannot drift out of step.

# OpenML datasets, fetched by data_id.
_openml_sources = [
    ('Hayes-Roth', 329),         # hayes-roth
    ('Vertebral column', 1523),  # vertebra-column
    ('Thyroid', 40682),          # thyroid-new
    ('Page blocks', 30),         # page-blocks
]
dataset_list_openML = [fetch_openml(data_id=openml_id) for _name, openml_id in _openml_sources]
dataset_name_list_openML = [name for name, _openml_id in _openml_sources]

# UCI datasets, fetched by id.
# The UCI ids 44 (Hayes-Roth), 212 (Vertebral column), 102 (Thyroid) and
# 78 (Page blocks) were noted as not available for uci; those four datasets
# are the ones listed in the OpenML sources above.
_uci_sources = [
    ('Wine', 109),
    ('Contraceptive', 30),       # contraceptive_method_choice
    ('Pen based', 81),
    ('Dermatology', 33),
    ('Balance scale', 12),
    ('Glass', 42),
    ('Heart', 45),
    ('Car evaluation', 19),
    ('Yeast', 110),
    ('Shuttle', 148),
]
dataset_list_uci = [fetch_ucirepo(id=uci_id) for _name, uci_id in _uci_sources]
dataset_name_list_uci = [name for name, _uci_id in _uci_sources]

# Combined view: UCI datasets first, then the OpenML ones.
dataset_list = [*dataset_list_uci, *dataset_list_openML]
dataset_name_list = [*dataset_name_list_uci, *dataset_name_list_openML]

In [4]:
# Boosting ensembles to benchmark, keyed by the label used for their results.
# Both use a fixed random_state so repeated runs give the same models.
clf_dict = dict(
    AdaBoost=AdaBoostClassifier(
        n_estimators=100,
        random_state=42,
    ),
    GradientBoosting=GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.3,
        max_features=2,
        max_depth=5,
        random_state=42,
    ),
)

In [5]:
# Sanity check: number of OpenML datasets, UCI datasets, and the combined total.
tuple(map(len, (dataset_list_openML, dataset_list_uci, dataset_list)))

(4, 10, 14)

In [14]:
# Results container: classifier name -> DataFrame of per-dataset metric scores.
clf_score_dict = dict()

In [15]:
# Ordered category levels for each feature of the Car evaluation dataset.
# A level's position in its list is the ordinal code it is mapped to.
_car_feature_levels = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'maint': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
}

# One {'col': ..., 'mapping': {level: code}} entry per feature, in the form
# passed as the `mapping` argument of ce.OrdinalEncoder.
mapping_for_car_dataset = [
    {'col': feature, 'mapping': {level: code for code, level in enumerate(levels)}}
    for feature, levels in _car_feature_levels.items()
]

In [16]:
def metric_lsit(y_test,y_preds,y_scores,num_of_classes):
    """Score one set of predictions with every metric reported in the benchmark.

    Parameters
    ----------
    y_test : true labels of the evaluation samples.
    y_preds : predicted labels for the same samples.
    y_scores : per-class scores, forwarded unchanged to ``macro_averaged_auprc``.
    num_of_classes : number of classes, forwarded to the class-aware helpers.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, macro-averaged AUPRC,
        F-measure (beta=10), modified MCC, MCC, G-mean]`` in that order.
        Precision, recall and F1 use ``average='weighted'``.

    Notes
    -----
    ``macro_averaged_auprc``, ``f_measure``, ``mmcc`` and ``g_mean_multiclass``
    are not defined in this notebook; presumably they come from the
    ``from utils import *`` star import -- TODO confirm.
    """
    weighted = {'average': 'weighted'}
    class_labels = range(num_of_classes)

    return [
        accuracy_score(y_test, y_preds),
        precision_score(y_test, y_preds, **weighted),
        recall_score(y_test, y_preds, **weighted),
        f1_score(y_test, y_preds, **weighted),
        macro_averaged_auprc(y_true=y_test, y_scores=y_scores, n_classes=num_of_classes),
        f_measure(y_test, y_preds, beta=10),
        mmcc(y_true=y_test, y_pred=y_preds, classes=class_labels),
        matthews_corrcoef(y_test, y_preds),
        # Product of element [1] of the helper's result, skipping its first entry.
        # NOTE(review): confirm what the skipped first entry represents.
        np.prod(g_mean_multiclass(y_true=y_test, y_pred=y_preds, n_classes=num_of_classes)[1][1:]),
    ]

In [17]:
# Benchmark every classifier in clf_dict on every dataset in dataset_list and
# store one DataFrame of metric scores per classifier in clf_score_dict.

# Column labels for the score table; the order must match the list returned
# by metric_lsit.
metric_columns = ['accuracy', 'precision', 'recall', 'f1','Macro-Averaged AUPRC',
                  'F_measure Beta=10','Modified mcc','MCC metirc','Gmean']

# dataset_list is the UCI datasets followed by the OpenML ones, so this is the
# boundary between the two result layouts. Deriving it from the list (instead
# of hard-coding index ranges) keeps the loop correct when datasets are added.
n_uci_datasets = len(dataset_list_uci)

for clf_name,clf in tqdm(clf_dict.items()):
    score={}
    for index,data in enumerate(dataset_list):
        dataset_name = dataset_name_list[index]

        # Data preparation: build one table with the target as the last column.
        if index < n_uci_datasets:
            # ucimlrepo result: features and targets live under data.data
            table = pd.concat([data.data.features, data.data.targets], axis=1)
        else:
            # fetch_openml result: features in data.data, target in data.target
            table = pd.concat([data.data, data.target], axis=1)
        table = table.dropna()
        x = table.iloc[:, :-1]
        if dataset_name == 'Car evaluation':
            # Features are categorical strings; encode them with the explicit
            # ordinal mapping defined for this dataset.
            encoder = ce.OrdinalEncoder(cols=x.columns.values.tolist(), mapping = mapping_for_car_dataset)
            x = encoder.fit_transform(x)
        y = np.ravel(table.iloc[:, -1:])
        num_of_classes=np.unique(y).size

        # Split data
        X_train, X_test, y_train, y_test =train_test_split(x,y,test_size=0.2,random_state=42)

        # The same estimator object is refit for each dataset; fit() replaces
        # whatever was learned from the previous dataset.
        classification = clf
        classification.fit(X_train, y_train)

        # Predict labels and class probabilities for the test set
        y_preds = classification.predict(X_test)
        y_scores = classification.predict_proba(X_test)

        # Score the predictions and file them under the dataset's name
        score[dataset_name] = metric_lsit(y_test,y_preds,y_scores,num_of_classes)

    # One row per dataset, one column per metric
    df = pd.DataFrame.from_dict(score, orient='index',columns=metric_columns)
    clf_score_dict[clf_name] = df

100%|██████████| 2/2 [00:42<00:00, 21.15s/it]


In [18]:
# Display the score table of every classifier (dict of DataFrames).
clf_score_dict

{'AdaBoost':                   accuracy  precision    recall        f1  \
 Wine              0.916667   0.931373  0.916667  0.917636   
 Contraceptive     0.589831   0.596148  0.589831  0.585984   
 Pen based         0.283765   0.243578  0.283765  0.170141   
 Dermatology       0.833333   0.747930  0.833333  0.779246   
 Balance scale     0.944000   0.965778  0.944000  0.949436   
 Glass             0.488372   0.481190  0.488372  0.425292   
 Heart             0.600000   0.598448  0.600000  0.594580   
 Car evaluation    0.878613   0.886212  0.878613  0.869804   
 Yeast             0.393939   0.358730  0.393939  0.361526   
 Shuttle           0.867845   0.814973  0.867845  0.840544   
 Hayes-Roth        0.500000   0.307692  0.500000  0.361111   
 Vertebral column  0.725806   0.765705  0.725806  0.739919   
 Thyroid           0.720930   0.618605  0.720930  0.633926   
 Page blocks       0.738813   0.915992  0.738813  0.794022   
 
                   Macro-Averaged AUPRC  F_measure Beta=

In [19]:
# Per-dataset metric scores for the AdaBoost classifier.
clf_score_dict['AdaBoost']

Unnamed: 0,accuracy,precision,recall,f1,Macro-Averaged AUPRC,F_measure Beta=10,Modified mcc,MCC metirc,Gmean
Wine,0.916667,0.931373,0.916667,0.917636,0.366349,0.917984,0.882965,0.87798,0.857143
Contraceptive,0.589831,0.596148,0.589831,0.585984,0.350871,0.590399,0.358602,0.365253,0.247887
Pen based,0.283765,0.243578,0.283765,0.170141,0.19934,0.279572,0.212055,0.256988,0.0
Dermatology,0.833333,0.74793,0.833333,0.779246,0.199987,0.824772,0.770677,0.801432,0.0
Balance scale,0.944,0.965778,0.944,0.949436,0.5,0.945939,0.0,0.910854,0.0
Glass,0.488372,0.48119,0.488372,0.425292,0.293444,0.48771,0.175065,0.326144,0.0
Heart,0.6,0.598448,0.6,0.59458,0.309687,0.599859,0.31674,0.346405,0.0
Car evaluation,0.878613,0.886212,0.878613,0.869804,0.5,0.879298,0.0,0.743804,0.0
Yeast,0.393939,0.35873,0.393939,0.361526,0.5,0.390455,0.0,0.236096,0.0
Shuttle,0.867845,0.814973,0.867845,0.840544,0.361896,0.862757,0.308742,0.598385,0.0


In [20]:
# Per-dataset metric scores for the GradientBoosting classifier.
clf_score_dict['GradientBoosting']

Unnamed: 0,accuracy,precision,recall,f1,Macro-Averaged AUPRC,F_measure Beta=10,Modified mcc,MCC metirc,Gmean
Wine,1.0,1.0,1.0,1.0,0.338596,1.0,1.0,1.0,1.0
Contraceptive,0.535593,0.539368,0.535593,0.533688,0.359789,0.535934,0.273517,0.280102,0.215818
Pen based,0.994088,0.994114,0.994088,0.994088,0.999565,0.994091,0.993442,0.993432,0.946188
Dermatology,0.986111,0.987654,0.986111,0.986262,0.216294,0.986251,0.980744,0.982688,0.928571
Balance scale,0.816,0.787066,0.816,0.798012,0.5,0.813282,0.0,0.679347,0.0
Glass,0.837209,0.856848,0.837209,0.829605,0.294141,0.838957,0.701236,0.79559,0.0
Heart,0.6,0.572863,0.6,0.580571,0.325036,0.597427,0.276366,0.297059,0.0
Car evaluation,0.985549,0.988524,0.985549,0.986262,0.5,0.985819,0.0,0.970094,0.0
Yeast,0.572391,0.564047,0.572391,0.565864,0.5,0.571622,0.0,0.44365,0.0
Shuttle,0.11181,0.896635,0.11181,0.18299,0.31131,0.121477,0.137932,0.103608,1e-06
