In [10]:
# NOTE: the original import order is kept on purpose -- the two star imports
# may export names that the later explicit imports are relied on to override.
import numpy as np
import pandas as pd
from decorate import DecorateClassifier
from data_generation_methods import *
from datasets import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import time
import warnings

# Silence every Python warning for the rest of the kernel session.
# The stdlib `warnings` module is called directly: `np.warnings` was only an
# accidental alias of it and is no longer available in recent NumPy releases.
warnings.filterwarnings('ignore')

# Dataset identifiers as strings. NOTE: `datasets` is rebound to a list of
# dataset objects in a later cell, so this list is only valid until then.
datasets = ['chess', 'abalone', 'balance_scale', 'contraceptive-method-choice', 'glass']
curr_dataset = 'chess'

# The identifier is used both as the folder name and as the file stem.
TRAIN_PATH = f'data/{curr_dataset}/{curr_dataset}.data'
# The file is read without a header row, so columns get integer labels.
data = pd.read_csv(TRAIN_PATH, header=None)



In [11]:
# Dataset wrappers to evaluate; this rebinds `datasets`, replacing any value
# the name held before this cell ran.
datasets = [
    AbaloneDataset(),
    BalanceScaleDataset(),
    ContraceptiveMethodChoiceDataset(),
    ChessDataset(),
    LetterRecognitionDataset(),
]

# Key: the wrapper's class name with the 'Dataset' suffix stripped.
# Value: whatever the wrapper exposes as `.shape`.
shapes = dict(
    (type(ds).__name__.replace('Dataset', ''), ds.shape)
    for ds in datasets
)
print(shapes)

{'Abalone': (4177, 9), 'BalanceScale': (625, 5), 'ContraceptiveMethodChoice': (1473, 10), 'Chess': (28056, 7), 'LetterRecognition': (20000, 17)}


In [None]:
# Stratified k-fold evaluation of DecorateClassifier on every dataset in
# `datasets`. For each dataset the mean accuracy, mean fit() time and mean
# predict() time over the folds are stored (rounded to 3 decimals) in the
# three dictionaries below, keyed by the dataset's short name.
N_SPLITS = 10  # number of stratified folds

acc_avgs = {}           # dataset name -> mean accuracy over the folds
fit_time_avgs = {}      # dataset name -> mean seconds spent in fit() per fold
predict_time_avgs = {}  # dataset name -> mean seconds spent in predict() per fold

skf = StratifiedKFold(n_splits=N_SPLITS)
for dataset in datasets:
    # Short display name: the class name without its 'Dataset' suffix.
    ds_name = dataset.__class__.__name__.replace('Dataset', '')

    ## per-fold measurements
    fold_accs = []
    fold_fit_secs = []
    fold_predict_secs = []

    ## data
    X, y = dataset.get_X_y()
    # X is sliced positionally with .iloc below, so y must be sliced
    # positionally as well. Plain `y[idx]` on a pandas Series is label-based
    # and would silently misalign rows if the index is not 0..n-1.
    y_is_pandas = hasattr(y, 'iloc')

    ## k-fold loop
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        if y_is_pandas:
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        else:
            y_train, y_test = y[train_index], y[test_index]

        # Build a fresh classifier for every fold so that no fitted state can
        # carry over from a previous fold (whether DecorateClassifier.fit
        # resets its own state is not visible from this notebook).
        dec = DecorateClassifier(
            n_estimators=100,
            n_iter=50,
            gen_artificial_method=TganDataGeneration(),
            dataset=dataset.name,
            label_encoder=dataset.get_le(),
        )

        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring elapsed intervals.
        fit_tic = time.perf_counter()
        dec.fit(X_train, y_train)
        fit_tac = time.perf_counter()

        predict_tic = time.perf_counter()
        y_pred = dec.predict(X_test)
        predict_tac = time.perf_counter()

        # Documented argument order is (y_true, y_pred).
        fold_accs.append(accuracy_score(y_test, y_pred))
        fold_fit_secs.append(fit_tac - fit_tic)
        fold_predict_secs.append(predict_tac - predict_tic)

    # Average over the folds that actually ran instead of a hard-coded 10;
    # float() keeps plain Python floats in the printed dictionaries.
    n_folds = len(fold_accs)
    acc_avgs[ds_name] = round(float(sum(fold_accs) / n_folds), 3)
    fit_time_avgs[ds_name] = round(float(sum(fold_fit_secs) / n_folds), 3)
    predict_time_avgs[ds_name] = round(float(sum(fold_predict_secs) / n_folds), 3)
    print(f"Done {ds_name}, accuracy = {acc_avgs[ds_name]}")

print()
print("acc_avgs:\n\t ", acc_avgs)
print()
print("fit_time_avgs:\n\t ", fit_time_avgs)
print()
print("predict_time_avgs:\n\t ", predict_time_avgs)

Done Abalone, accuracy = 0.913
Done BalanceScale, accuracy = 0.225
Done ContraceptiveMethodChoice, accuracy = 0.886
Done Chess, accuracy = 0.976
