In [4]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Loading data


In [5]:
def _read_split(name):
    """Load one comma-separated file from the local ``data/`` directory."""
    return pd.read_csv('data/' + name, delimiter=',')


# Every dataset is stored as a pair of files: X_<name> and y_<name>.
# The y file of the first dataset is spelled with a capital "S"; path kept as-is.
X_balance_scale, y_balance_scale = _read_split('X_balance_scale'), _read_split('y_balance_Scale')
X_car_eval, y_car_eval = _read_split('X_car_eval'), _read_split('y_car_eval')
X_contraceptive, y_contraceptive = _read_split('X_contraceptive'), _read_split('y_contraceptive')
X_dermatology, y_dermatology = _read_split('X_dermatology'), _read_split('y_dermatology')
X_glass, y_glass = _read_split('X_glass'), _read_split('y_glass')
X_hayes, y_hayes = _read_split('X_hayes'), _read_split('y_hayes')
X_heart, y_heart = _read_split('X_heart'), _read_split('y_heart')
X_new_thyroid, y_new_thyroid = _read_split('X_new_thyroid'), _read_split('y_new_thyroid')
X_page, y_page = _read_split('X_page'), _read_split('y_page')
X_pen_based, y_pen_based = _read_split('X_pen_based'), _read_split('y_pen_based')
X_shuttle, y_shuttle = _read_split('X_shuttle'), _read_split('y_shuttle')
X_vertebra, y_vertebra = _read_split('X_vertebra'), _read_split('y_vertebra')
X_wine, y_wine = _read_split('X_wine'), _read_split('y_wine')
X_yeast, y_yeast = _read_split('X_yeast'), _read_split('y_yeast')
X_fars, y_fars = _read_split('X_fars'), _read_split('y_fars')

In [6]:
# One (display name, feature frame, target frame) triple per dataset; the
# three lists derived below are index-aligned views of this table.
_dataset_table = [
    ('Balance Scale', X_balance_scale, y_balance_scale),
    ('Car Evaluation', X_car_eval, y_car_eval),
    ('Contraceptive', X_contraceptive, y_contraceptive),
    ('Dermatology', X_dermatology, y_dermatology),
    ('Glass', X_glass, y_glass),
    ('Hayes', X_hayes, y_hayes),
    ('Heart', X_heart, y_heart),
    ('New Thyroid', X_new_thyroid, y_new_thyroid),
    ('Page', X_page, y_page),
    ('Pen Based', X_pen_based, y_pen_based),
    ('Shuttle', X_shuttle, y_shuttle),
    ('Vertebra', X_vertebra, y_vertebra),
    ('Wine', X_wine, y_wine),
    ('Yeast', X_yeast, y_yeast),
    ('FARS', X_fars, y_fars),
]

X_list = [entry[1] for entry in _dataset_table]

y_list = [entry[2] for entry in _dataset_table]

name_list = [entry[0] for entry in _dataset_table]

# Preprocessing

### Scaling using StandardScaler()

In [56]:
# Feature frames picked out for standardisation (the same nine frames that are
# passed to scale_df one by one further down).
X_needed_to_scale = [
    X_glass,
    X_heart,
    X_new_thyroid,
    X_page,
    X_pen_based,
    X_shuttle,
    X_vertebra,
    X_wine,
    X_fars,
]

In [58]:
from sklearn.preprocessing import StandardScaler

def scale_df(df):
    """Standardise every column of ``df`` and return the result as a new DataFrame.

    A fresh ``StandardScaler`` is fitted on ``df`` itself and then applied to it,
    so the input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all to be scaled.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same column labels and the same row
        index as ``df``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Pass the index through as well: without it the result gets a fresh
    # RangeIndex and no longer lines up with frames that share df's index.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

In [59]:
# Rebind each selected feature frame to its standardised counterpart.
(
    X_glass, X_heart, X_new_thyroid,
    X_page, X_pen_based, X_shuttle,
    X_vertebra, X_wine, X_fars,
) = [
    scale_df(frame)
    for frame in (
        X_glass, X_heart, X_new_thyroid,
        X_page, X_pen_based, X_shuttle,
        X_vertebra, X_wine, X_fars,
    )
]

In [65]:
# Same dataset order as X_list. Names that were rebound through scale_df now
# refer to the standardised frames; the remaining entries are the frames as loaded.
X_list_scaled = [
    X_balance_scale,
    X_car_eval,
    X_contraceptive,
    X_dermatology,
    X_glass,
    X_hayes,
    X_heart,
    X_new_thyroid,
    X_page,
    X_pen_based,
    X_shuttle,
    X_vertebra,
    X_wine,
    X_yeast,
    X_fars,
]

# Prediction

### Using 7 boosting classifiers to predict on the scaled data

In [68]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from adacost import AdaCost
from utils import *
from tqdm import tqdm

In [70]:
# (label, estimator) pairs in evaluation order; dict() keeps this order.
_boosters = [
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=42)),
    ('AdaBoost_SAMME', AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)),
    (
        'GradientBoosting',
        GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.3,
            max_features=2,
            max_depth=5,
            random_state=42,
        ),
    ),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('CatBoost', CatBoostClassifier(verbose=0)),
    ('RUSBoostClassifier', RUSBoostClassifier(n_estimators=200, random_state=42)),
    ('AdaCost', AdaCost(algorithm="SAMME.R", random_state=100)),
]
clf_dict = dict(_boosters)

# Starts empty; the benchmark loop stores one metrics DataFrame per classifier label.
clf_score_dict = dict()

In [75]:
# Benchmark loop: fit every classifier in clf_dict on every dataset and collect
# one table of metrics per classifier in clf_score_dict.

# The three lists are walked in lockstep; zip() would silently drop trailing
# datasets if they ever got out of sync, so fail loudly instead.
assert len(X_list_scaled) == len(y_list) == len(name_list), "dataset lists are misaligned"

# Column labels for the result tables, applied positionally to whatever
# metric_list returns for a dataset.
# NOTE(review): 'MCC metirc' looks like a typo for "metric"; left unchanged
# because the label is part of the produced tables.
metric_columns = ['accuracy', 'precision', 'recall', 'f1','Macro-Averaged AUPRC',
                  'F_measure Beta=10','Modified mcc','MCC metirc','Gmean']

for clf_name, clf in tqdm(clf_dict.items()):
    # dataset display name -> metric values for the current classifier
    score = {}

    for i, (X, y) in enumerate(zip(X_list_scaled, y_list)):

        # Fixed seed so every classifier is evaluated on the same 80/20 split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Flatten the target frame to 1-D through pandas itself, so this cell
        # does not depend on `np` being in scope (numpy is not imported by any
        # visible cell).
        clf.fit(X_train, y_train.to_numpy().ravel())

        y_preds = clf.predict(X_test)
        y_scores = clf.predict_proba(X_test)

        # Number of distinct values in the first target column of the full
        # (unsplit) dataset, passed to metric_list as its last argument.
        n_unique_targets = len(y.iloc[:, 0].unique())
        score_p = metric_list(y_test, y_preds, y_scores, n_unique_targets)
        score[name_list[i]] = score_p

    # One row per dataset, one column per metric.
    df = pd.DataFrame.from_dict(score, orient='index', columns=metric_columns)
    clf_score_dict[clf_name] = df

100%|██████████| 7/7 [04:00<00:00, 34.37s/it]


In [81]:
# Display the per-dataset metric table stored under the 'CatBoost' label.
clf_score_dict['CatBoost']

Unnamed: 0,accuracy,precision,recall,f1,Macro-Averaged AUPRC,F_measure Beta=10,Modified mcc,MCC metirc,Gmean
Balance Scale,0.88,0.846267,0.88,0.86052,0.723314,0.876823,0.608943,0.788488,0.915254
Car Evaluation,0.973988,0.980708,0.973988,0.975628,0.978974,0.974596,0.933166,0.946918,0.909091
Contraceptive,0.549153,0.547273,0.549153,0.543968,0.564997,0.548981,0.289413,0.297712,0.182649
Dermatology,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Glass,0.906977,0.914341,0.906977,0.90778,0.970088,0.907641,0.883432,0.882985,0.589286
Hayes,0.71875,0.767857,0.71875,0.716667,0.912292,0.722953,0.585374,0.561444,0.45
Heart,0.47541,0.381148,0.47541,0.423059,0.354687,0.464956,0.181516,0.205881,0.0
New Thyroid,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Page,0.975342,0.976794,0.975342,0.975681,0.887257,0.975474,0.858844,0.877837,0.623546
Pen Based,0.994543,0.994578,0.994543,0.994543,0.999725,0.994546,0.993937,0.993939,0.950758


Comment: Scaling the features did not change the scores.