In [1]:
import pandas as pd
import warnings
# NOTE: this silences every Python warning (all categories) for the rest of the
# kernel session, so deprecation notices from the libraries used below are hidden.
warnings.filterwarnings("ignore")

# Loading data


In [2]:
def _read_xy(stem, y_stem=None):
    """Read one dataset's feature and label tables from the ``data`` folder.

    Features come from ``data/X_<stem>`` and labels from ``data/y_<y_stem>``
    (``y_stem`` falls back to ``stem``). Both files are parsed as
    comma-separated text. Returns ``(features, labels)``.
    """
    if y_stem is None:
        y_stem = stem
    features = pd.read_csv(f'data/X_{stem}', delimiter=',')
    labels = pd.read_csv(f'data/y_{y_stem}', delimiter=',')
    return features, labels


# The balance-scale label file is spelled with a capital "S" in its path; it is
# reproduced exactly here.
# NOTE(review): confirm the on-disk name matches on case-sensitive filesystems.
X_balance_scale, y_balance_scale = _read_xy('balance_scale', y_stem='balance_Scale')
X_car_eval, y_car_eval = _read_xy('car_eval')
X_contraceptive, y_contraceptive = _read_xy('contraceptive')
X_dermatology, y_dermatology = _read_xy('dermatology')
X_glass, y_glass = _read_xy('glass')
X_hayes, y_hayes = _read_xy('hayes')
X_heart, y_heart = _read_xy('heart')
X_new_thyroid, y_new_thyroid = _read_xy('new_thyroid')
X_page, y_page = _read_xy('page')
X_pen_based, y_pen_based = _read_xy('pen_based')
X_shuttle, y_shuttle = _read_xy('shuttle')
X_vertebra, y_vertebra = _read_xy('vertebra')
X_wine, y_wine = _read_xy('wine')
X_yeast, y_yeast = _read_xy('yeast')
X_fars, y_fars = _read_xy('fars')

In [3]:
# One row per dataset: (display name, feature frame, label frame).
# Keeping the three pieces side by side guarantees the derived lists stay aligned.
_datasets = [
    ('Balance Scale', X_balance_scale, y_balance_scale),
    ('Car Evaluation', X_car_eval, y_car_eval),
    ('Contraceptive', X_contraceptive, y_contraceptive),
    ('Dermatology', X_dermatology, y_dermatology),
    ('Glass', X_glass, y_glass),
    ('Hayes', X_hayes, y_hayes),
    ('Heart', X_heart, y_heart),
    ('New Thyroid', X_new_thyroid, y_new_thyroid),
    ('Page', X_page, y_page),
    ('Pen Based', X_pen_based, y_pen_based),
    ('Shuttle', X_shuttle, y_shuttle),
    ('Vertebra', X_vertebra, y_vertebra),
    ('Wine', X_wine, y_wine),
    ('Yeast', X_yeast, y_yeast),
    ('FARS', X_fars, y_fars),
]

# Parallel lists consumed by the later cells; index k refers to the same dataset in all three.
X_list = [entry[1] for entry in _datasets]

y_list = [entry[2] for entry in _datasets]

name_list = [entry[0] for entry in _datasets]

# Preprocessing

### Scaling using StandardScaler()

In [4]:
from sklearn.preprocessing import StandardScaler

def scale_dataframes(dfs):
    """Standardize every column of each DataFrame in ``dfs``.

    Each frame is scaled on its own with a freshly created ``StandardScaler``,
    so no fitted statistics are carried over from one frame to the next.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to scale. Columns are presumably all numeric, since they are
        passed straight to ``StandardScaler.fit_transform``.

    Returns
    -------
    list of pandas.DataFrame
        One scaled frame per input, in input order, carrying the input's
        column labels and row index.

    Notes
    -----
    The scaler is fitted on the whole frame it receives. If the result is split
    into train/test afterwards, the test rows have contributed to the scaling
    statistics.
    """
    scaled_dfs = []

    for df in dfs:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(df)
        # fit_transform returns a bare array: restore both the column labels and
        # the row index so the scaled rows can still be matched to their labels.
        scaled_dfs.append(
            pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
        )

    return scaled_dfs

In [5]:
# Scale every feature frame in X_list; the label frames in y_list are left untouched.
X_list_scaled = scale_dataframes(X_list)

# Prediction

### Using 7 boosting classifiers to predict on the scaled data

In [6]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from adacost import AdaCost
from utils import *
from tqdm import tqdm

In [7]:
# Boosting classifiers under comparison, keyed by the name later used for the
# per-classifier score tables. Insertion order is the order they are trained in.
_SEED = 42

clf_dict = {
    'AdaBoost': AdaBoostClassifier(
        n_estimators=100,
        random_state=_SEED,
    ),
    'AdaBoost_SAMME': AdaBoostClassifier(
        n_estimators=100,
        algorithm='SAMME',
        random_state=_SEED,
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.3,
        max_features=2,
        max_depth=5,
        random_state=_SEED,
    ),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
    ),
    'CatBoost': CatBoostClassifier(verbose=0),
    'RUSBoostClassifier': RUSBoostClassifier(
        n_estimators=200,
        random_state=_SEED,
    ),
    # AdaCost is seeded with 100 rather than the seed shared by the others.
    'AdaCost': AdaCost(
        algorithm="SAMME.R",
        random_state=100,
    ),
}

# Filled by the training loop: classifier name -> DataFrame of per-dataset scores.
clf_score_dict = dict()

In [8]:
# Column labels for each per-classifier score table, matched positionally to the
# values returned by metric_list() (metric_list comes from utils).
# NOTE(review): 'MCC metirc' is misspelled but kept as-is, because it becomes a
# header in the saved CSV files and may be read back by name elsewhere.
METRIC_COLUMNS = ['accuracy', 'precision', 'recall', 'f1', 'Macro-Averaged AUPRC',
                  'F_measure Beta=10', 'Modified mcc', 'MCC metirc', 'Gmean']

# zip() silently stops at the shortest input; fail loudly if the lists drift apart.
if not (len(name_list) == len(X_list_scaled) == len(y_list)):
    raise ValueError('name_list, X_list_scaled and y_list must have the same length')

# Train every classifier on every dataset and collect one score table per classifier.
for clf_name, clf in tqdm(clf_dict.items()):
    score = {}

    for dataset_name, X, y in zip(name_list, X_list_scaled, y_list):

        # Fixed 80/20 split with a fixed seed, so every classifier sees the same split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Flatten the label frame to a 1-D array for fit(). Uses the frame's own
        # to_numpy() rather than np.ravel, since numpy is never imported explicitly
        # in this notebook.
        clf.fit(X_train, y_train.to_numpy().ravel())

        y_preds = clf.predict(X_test)
        y_scores = clf.predict_proba(X_test)

        # Number of classes is counted over the full label column (train + test).
        n_classes = len(y.iloc[:, 0].unique())
        score[dataset_name] = metric_list(y_test, y_preds, y_scores, n_classes)

    # One row per dataset, one column per metric.
    df = pd.DataFrame.from_dict(score, orient='index', columns=METRIC_COLUMNS)
    clf_score_dict[clf_name] = df

100%|██████████| 7/7 [04:02<00:00, 34.66s/it]


In [11]:
# Display the per-dataset score table stored under the 'AdaCost' key.
clf_score_dict['AdaCost']

Unnamed: 0,accuracy,precision,recall,f1,Macro-Averaged AUPRC,F_measure Beta=10,Modified mcc,MCC metirc,Gmean
Balance Scale,0.56,0.514409,0.56,0.531522,0.568625,0.555524,0.144254,0.200678,0.372573
Car Evaluation,0.679191,0.4613,0.679191,0.549431,0.661212,0.651227,0.0,0.0,0.0
Contraceptive,0.440678,0.194197,0.440678,0.269591,0.692132,0.395091,0.0,0.0,0.0
Dermatology,0.594595,0.460204,0.594595,0.495316,0.656855,0.579218,0.353353,0.518013,0.0
Glass,0.511628,0.316279,0.511628,0.372093,0.666667,0.484427,0.311927,0.437439,0.0
Hayes,0.34375,0.288306,0.34375,0.20601,0.693604,0.337844,0.164989,0.181865,0.166667
Heart,0.47541,0.226015,0.47541,0.306375,0.591959,0.432068,0.0,0.0,0.0
New Thyroid,0.790698,0.655563,0.790698,0.71127,0.808895,0.776153,0.487803,0.590638,0.0
Page,0.930594,0.897045,0.930594,0.913246,0.658874,0.92744,0.405152,0.6056,0.0
Pen Based,0.203274,0.061138,0.203274,0.088825,0.560006,0.167808,0.106059,0.179551,0.0


Comment: Scaling the features did not change the scores.

In [12]:
# Accumulates the CSV file names appended by save_result().
file_name_list = []
def save_result(result, file_name_list, out_dir='result/scaled'):
    """Write each score table in ``result`` to ``<out_dir>/<name>.csv``.

    Parameters
    ----------
    result : dict
        Maps a classifier name to an object with a ``to_csv`` method
        (a pandas DataFrame in this notebook).
    file_name_list : list
        Mutated in place: the bare file name (``'<name>.csv'``) of every file
        written is appended to it.
    out_dir : str, optional
        Destination directory. It is created (with parents) when missing.
        Defaults to ``'result/scaled'``.

    Returns
    -------
    None
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    # to_csv does not create missing directories, so make sure the target exists.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for clf, score in result.items():
        csv_name = f'{clf}.csv'
        score.to_csv(os.path.join(out_dir, csv_name), index=True)
        file_name_list.append(csv_name)
# Write every classifier's score table to CSV and record the file names in file_name_list.
save_result(clf_score_dict,file_name_list)