In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Datasets, preprocessing, and metrics
from sklearn import datasets
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold

# Useful Functions
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Ensemble Classifiers
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import load_dataset
from HoTdiagram import HoTdiagram
from AMEnsemble import CvECNN, CvICNN, HardRCNN_Ensemble

In [2]:
import warnings


def warn(*args, **kwargs):
    """Do-nothing replacement for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them all and
    returns ``None``.
    """
    return None


# Globally silence warnings raised through ``warnings.warn`` by swapping the
# function for the no-op above. This affects the whole kernel session.
warnings.warn = warn

In [3]:
def EvalClassifiers(Name, X, y, n_splits=10, score = balanced_accuracy_score, random_state=None):
    """Cross-validate a random forest and three RCNN-based ensembles on one dataset.

    For every fold of a stratified, shuffled K-fold split the features are
    standardised (scaler fitted on the training part only), a 30-tree random
    forest is trained, and its fitted trees are handed to ``HardRCNN_Ensemble``
    in three configurations: ``CvICNN``, ``CvECNN`` with ``alpha = 1``, and
    ``CvECNN`` with ``alpha`` chosen by a 5-fold ``GridSearchCV``.

    Parameters
    ----------
    Name : str
        Label of the dataset; the per-fold scores are written to
        ``CSVs/<Name>.csv``.
    X : array-like supporting integer-array row indexing (``X[idx]``)
        Feature matrix.
    y : array-like supporting integer-array indexing (``y[idx]``)
        Class labels.
    n_splits : int, default 10
        Number of stratified folds.
    score : callable, default ``balanced_accuracy_score``
        Called as ``score(y_true, y_pred)`` on every test fold.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``RandomForestClassifier`` only. The default ``None``
        keeps the forest unseeded; pass an integer for repeatable forests.

    Returns
    -------
    df : pandas.DataFrame
        One row per fold and one column per classifier. Every row carries the
        index label ``0``.
    df_params : pandas.DataFrame
        One row per fold (index label ``0``) holding the ``alpha`` value
        selected by the grid search.
    """
    import os  # local import: only needed to make sure the output folder exists

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    fold_scores = []
    fold_params = []
    for train_index, test_index in skf.split(X, y):
        df_sim = pd.DataFrame()
        df_params_sim = pd.DataFrame()
        Xtr, Xte = X[train_index], X[test_index]
        ytr, yte = y[train_index], y[test_index]

        # Standardise with statistics from the training part only.
        scaler = StandardScaler()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)

        # Random Forest
        rf = RandomForestClassifier(n_estimators = 30, random_state = random_state).fit(Xtr,ytr)
        y_pred = rf.predict(Xte)
        df_sim["Random Forest"] = [score(yte,y_pred)]

        # The ensembles below reuse the trees already fitted by the forest.
        ICAM = HardRCNN_Ensemble(classifiers=rf.estimators_, RCNN = CvICNN).fit(Xtr,ytr)
        y_pred = ICAM.predict(Xte)
        df_sim["Identity RCAM"] = [score(yte,y_pred)]

        ECAM = HardRCNN_Ensemble(classifiers=rf.estimators_, RCNN = CvECNN, alpha = 1).fit(Xtr,ytr)
        y_pred = ECAM.predict(Xte)
        df_sim["Exponential RCAM"] = [score(yte,y_pred)]

        # NOTE(review): the trees in rf.estimators_ were fitted on all of Xtr,
        # so unless HardRCNN_Ensemble refits them, the inner validation folds
        # of this grid search are not unseen data for the base classifiers.
        parameters = {'classifiers':[rf.estimators_], 'RCNN':[CvECNN],
              'alpha':[0.01, 0.1, 0.5, 1, 5, 10, 20, 50]}
        ECAM_grid = GridSearchCV(HardRCNN_Ensemble(), parameters, cv = 5).fit(Xtr,ytr)
        y_pred = ECAM_grid.predict(Xte)
        df_sim["Exp. RCAM + Grid Search"] = [score(yte,y_pred)]
        # Keep the selected alpha; previously it was discarded and the
        # returned parameter frame was always empty.
        df_params_sim["Exp. RCAM + Grid Search (alpha)"] = [ECAM_grid.best_params_["alpha"]]

        fold_scores.append(df_sim)
        fold_params.append(df_params_sim)

    # Concatenate once; every single-row frame keeps index label 0, which the
    # caller relies on when renaming the index to the dataset name.
    df = pd.concat(fold_scores)
    df_params = pd.concat(fold_params)

    os.makedirs("CSVs", exist_ok=True)
    df.to_csv("CSVs/%s.csv" % Name)
    return df, df_params

In [4]:
# Benchmark datasets, one tuple per dataset:
#   (display label, OpenML dataset name, OpenML dataset version)
# The display label is used for progress messages, as the result index label
# and in the per-dataset CSV file name, so the labels are kept verbatim.
AllDataSets = [
    ("Breast Cancer Wisconsin", "wdbc", 1),
    ("Diabetes", "diabetes", 1),
    ("Banknote", "banknote-authentication", 1),
    ("Spambase", "spambase", 1),
    ("Ionosphere", "ionosphere", 1),
    ("Colic", "colic", 2),
    ("Sonar", "sonar", 1),
    ("Tic-Tac-Toe", "tic-tac-toe", 1),
    ("Monks-2", "monks-problems-2", 1),
    ("Australian", "Australian", 4),
    ("Banana", "banana", 1),
    ("Cylinder Bands", "cylinder-bands", 2),
    ("Chess", "kr-vs-kp", 1),
    ("Haberman", "haberman", 1),
    ("Mushroom", "mushroom", 1),
    ("Phoneme", "phoneme", 1),
    ("Titanic", "Titanic", 2),
    ("Pishing Websites", "PhishingWebsites", 1),
    ("Internet Advertisements", "Internet-Advertisements", 2),
    ("Thoracic Surgery", "thoracic_surgery", 1),
    ("Credit Approval", "credit-approval", 1),
    ("Hill-Valley", "hill-valley", 1),
    ("Egg-Eye-State", "eeg-eye-state", 1),
    ("MOFN-3-7-10", "mofn-3-7-10", 1),
    ("Credit-g", "credit-g", 1),
    ("Accute Inflammations", "acute-inflammations", 1),
    ("ilpd", "ilpd", 1),
    ("Arsene", "arcene", 1),
    ("Blood Transfusion", "blood-transfusion-service-center", 1),
    ("Steel Plates Fault", "steel-plates-fault", 1),
    ("Sick", "sick", 1),
]

In [5]:
# Accumulators over all datasets: per-fold scores and per-fold parameters.
data = pd.DataFrame()
data_params = pd.DataFrame()

for name, dataset, version in AllDataSets:
    start_time = time.time()
    print("\nProcessing dataset: ", name)

    # Download features and target from OpenML.
    X, y = datasets.fetch_openml(dataset, version=version, return_X_y=True)

    # Impute missing feature values (SimpleImputer with its default settings).
    X = SimpleImputer().fit_transform(X)

    # Encode the class labels as integers.
    y = LabelEncoder().fit_transform(y)

    df, df_params = EvalClassifiers(name, X, y)

    # Rows labelled 0 are relabelled with the dataset name before stacking.
    data = pd.concat([data, df.rename(index={0: name})])
    data_params = pd.concat([data_params, df_params.rename(index={0: name})])

    # Rewrite the combined score table after every dataset, so partial
    # results are on disk if a later dataset fails.
    data.to_csv("CSVs/BinaryDataSets.csv")

    print("\nTime to process the dataset: %2.2f seconds." % (time.time() - start_time))


Processing dataset:  Breast Cancer Wisconsin

Time to process the dataset: 11.66 seconds.

Processing dataset:  Diabetes

Time to process the dataset: 9.66 seconds.

Processing dataset:  Banknote

Time to process the dataset: 10.31 seconds.

Processing dataset:  Spambase

Time to process the dataset: 24.13 seconds.

Processing dataset:  Ionosphere

Time to process the dataset: 9.12 seconds.

Processing dataset:  Colic

Time to process the dataset: 9.18 seconds.

Processing dataset:  Sonar

Time to process the dataset: 7.37 seconds.

Processing dataset:  Tic-Tac-Toe

Time to process the dataset: 9.84 seconds.

Processing dataset:  Monks-2

Time to process the dataset: 9.36 seconds.

Processing dataset:  Australian

Time to process the dataset: 7.78 seconds.

Processing dataset:  Banana

Time to process the dataset: 18.26 seconds.

Processing dataset:  Cylinder Bands

Time to process the dataset: 10.23 seconds.

Processing dataset:  Chess

Time to process the dataset: 15.96 seconds.

Pr

In [6]:
# Display the per-fold scores accumulated for the processed datasets
# (one row per fold, indexed by dataset name).
data

Unnamed: 0,Random Forest,Identity RCAM,Exponential RCAM,Exp. RCAM + Grid Search
Breast Cancer Wisconsin,0.977273,1.000000,1.000000,1.000000
Breast Cancer Wisconsin,0.963384,0.963384,0.963384,0.963384
Breast Cancer Wisconsin,0.976190,0.976190,0.976190,0.976190
Breast Cancer Wisconsin,0.976190,0.976190,0.976190,0.976190
Breast Cancer Wisconsin,0.896825,0.896825,0.896825,0.896825
Breast Cancer Wisconsin,0.976190,0.976190,0.976190,0.976190
Breast Cancer Wisconsin,0.972222,0.972222,0.972222,0.972222
Breast Cancer Wisconsin,0.914286,0.914286,0.914286,0.914286
Breast Cancer Wisconsin,0.961905,0.961905,0.961905,0.961905
Breast Cancer Wisconsin,0.928571,0.928571,0.928571,0.928571
