In [3]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# !pip install openml
import openml
# !pip install cardinal
# !pip install scikit-learn==0.20.4
# !pip install sklearn.cluster._k_means_fast
from cardinal.uncertainty import MarginSampler
from cardinal.random import RandomSampler
from cardinal.zhdanov2019 import TwoStepKMeansSampler
from cardinal.plotting import plot_confidence_interval
import tqdm


import os
import tensorflow as tf
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model

In [4]:
# Seed NumPy's global RNG once, up front, so every later draw from the
# np.random module-level functions is repeatable across fresh kernel runs.
np.random.seed(seed=123)

# Manual determination of the OpenML categorical features

# Selection of the best model

In [13]:
# OpenML dataset ids to benchmark, in the order of the recorded run shown in
# this notebook's output. They are integers because getData compares the id
# against integer lists for its dataset-specific preprocessing.
# Earlier candidate list, kept for reference:
#   ['1461', '1471', '1502', '1590', '40922', '41138', '42395', '43439',
#    '43551', '42803', '41162', 'cifar10', 'cifar10_simclr', 'mnist']
dataset_ids = [
    1471, 1502, 40922, 41162, 43551, 1461,
    1590, 41138, 42395, 42803, 43439,
]
assert len(dataset_ids) > 0, "dataset_ids must list at least one OpenML dataset id"

### Preprocessing

In [102]:
def getData(dataset_id, sample_fraction=0.1):
    """
    Returns X, y corresponding to a specific OpenML dataset id with some additional preprocessing

    Steps:
      1. Download the dataset and split features from its default target.
      2. Drop hand-picked columns for a few datasets, keeping the categorical
         indicator aligned with the remaining columns.
      3. One-hot encode the categorical columns and standardise the others.
      4. Replace NaN/inf values, shuffle the rows and keep a leading fraction.

    Parameters
    ----------
    dataset_id : int or str
        OpenML dataset id. Numeric strings are treated like the matching
        integer when selecting the dataset-specific preprocessing.
    sample_fraction : float, default 0.1
        Fraction of the shuffled rows to return (reduces execution time).

    Returns
    -------
    X : numpy.ndarray
        Dense, preprocessed feature matrix.
    y : array-like
        Targets, row-aligned with X.

    Raises
    ------
    ValueError
        If sample_fraction is not in (0, 1].
    """
    if not 0 < sample_fraction <= 1:
        raise ValueError("sample_fraction must be in the interval (0, 1]")

    # The special cases below are keyed by integer ids; normalise so that a
    # numeric string such as '1461' selects the same preprocessing.
    try:
        numeric_id = int(dataset_id)
    except (TypeError, ValueError):
        numeric_id = None

    dataset = openml.datasets.get_dataset(dataset_id)

    if numeric_id in [1471, 1502, 40922, 1461, 1590, 41138]:
        X, y, cat_indicator, _ = dataset.get_data(dataset_format='array', target=dataset.default_target_attribute)
    else:
        X, y, cat_indicator, _ = dataset.get_data(dataset_format='dataframe', target=dataset.default_target_attribute)
        y = np.asarray(y)   # plain array so the labels can be indexed with k-fold index arrays

    cat_indicator = np.asarray(cat_indicator, dtype=bool)

    # Columns removed by hand for specific datasets. All of these ids are
    # loaded through the dataframe branch above, so X.columns is available.
    columns_to_drop = {
        42395: ['ID_code'],
        42088: ['brewery_name', 'review_profilename', 'beer_name'],
        42256: ['full_name'],
        42803: ['Accident_Index', 'Date', 'Time', 'Local_Authority_(Highway)', 'LSOA_of_Accident_Location'],
        43439: ['Gender', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood'],
    }
    to_drop = columns_to_drop.get(numeric_id)
    if to_drop:
        # Build the mask from column names: the dropped columns are not
        # necessarily the leading ones, so slicing the indicator by count
        # would misalign it with the remaining columns.
        keep_mask = ~X.columns.isin(to_drop)
        X = X.drop(to_drop, axis=1)
        cat_indicator = cat_indicator[keep_mask]

    ct = ColumnTransformer([
        ('encoder', OneHotEncoder(), np.where(cat_indicator)[0]),
        ('normalizer', StandardScaler(), np.where(~cat_indicator)[0])
    ], remainder='passthrough')

    X = ct.fit_transform(X)

    # The transformer may return a scipy sparse matrix; densify it before
    # nan_to_num, which only operates on array data.
    if hasattr(X, 'toarray'):
        X = X.toarray()
    X = np.asarray(np.nan_to_num(X))

    # Shuffle rows (driven by NumPy's global RNG state)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    # Reduce execution time by keeping only the leading fraction of the rows
    X = X[:int(sample_fraction * X.shape[0])]
    y = y[:int(sample_fraction * y.shape[0])]

    return X, y

### Run

In [104]:
# (name, estimator) pairs to evaluate; the name is also used in result file names.
models = [
    ('GBC', GradientBoostingClassifier()),
]

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
num_folds = 10

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)
skf = StratifiedKFold(n_splits=num_folds)

# Names of the models whose fit() should receive the EarlyStopping callback.
# Empty for now: GBC needs none, see
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html
keras_model_names = []

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./results', exist_ok=True)

for dataset_id in tqdm.tqdm(dataset_ids, desc = 'DATASETS'):
    print(dataset_id)

    # Loaded lazily: downloading and preprocessing are skipped when every
    # model already has a result file for this dataset.
    X = y = None

    for model_name, base_model in tqdm.tqdm(models, desc = f'models dataset id={dataset_id}'):

        # Skip models that were already evaluated on this dataset
        filePath = f'./results/{dataset_id}-{model_name}.csv'
        if os.path.isfile(filePath):
            continue

        if X is None:
            X, y = getData(dataset_id)

        all_accuracies = []

        # Train/test split: the "test set" will be used later in the benchmark
        # to initially train the model, so it is not used here.
        X_train, X_test, y_train, y_test = \
                train_test_split(X, y, test_size=int(.2 * X.shape[0]))

        for train, validation in kfold.split(X_train, y_train):

            # The same estimator object is re-fitted on every fold (no copy is made).
            # TODO: confirm that re-fitting starts from scratch for every model
            # type added to `models` (clone the estimator otherwise).
            model = base_model
            if model_name in keras_model_names:
                model.fit(X_train[train], y_train[train], callbacks=[callback])
            else:
                model.fit(X_train[train], y_train[train])

            # Record the score on the held-out fold
            all_accuracies.append(model.score(X_train[validation], y_train[validation]))

        # Save one row per fold; plain records keep the accuracy numeric
        results = [[dataset_id, model_name, acc] for acc in all_accuracies]
        df = pd.DataFrame(results, columns = ['datasetId', 'modelName', 'accuracy'])
        df.to_csv(filePath, index=False)


models dataset id=1471: 100%|██████████| 1/1 [00:00<00:00, 3184.74it/s]
models dataset id=1502: 100%|██████████| 1/1 [00:00<00:00, 7037.42it/s]
models dataset id=40922: 100%|██████████| 1/1 [00:00<00:00, 2308.37it/s]
models dataset id=41162: 100%|██████████| 1/1 [00:00<00:00, 3125.41it/s]
models dataset id=43551: 100%|██████████| 1/1 [00:00<00:00, 9279.43it/s]
models dataset id=1461: 100%|██████████| 1/1 [00:00<00:00, 4462.03it/s]
models dataset id=1590: 100%|██████████| 1/1 [00:00<00:00, 3953.16it/s]
models dataset id=41138: 100%|██████████| 1/1 [00:00<00:00, 3334.10it/s]
models dataset id=42395: 100%|██████████| 1/1 [00:00<00:00, 4100.00it/s]
models dataset id=42803: 100%|██████████| 1/1 [00:00<00:00, 7681.88it/s]
models dataset id=43439: 100%|██████████| 1/1 [00:00<00:00, 5405.03it/s]
DATASETS: 100%|██████████| 11/11 [00:20<00:00,  1.89s/it]


### Summarizing results

In [None]:
# Gather the per-fold result files written by the run step into two tables:
#   all_results - every fold of every (dataset, model) pair
#   avg_results - one row per pair holding the mean accuracy over its folds
result_columns = ['datasetId', 'modelName', 'accuracy']

fold_frames = []    # one DataFrame per (dataset, model) result file
mean_records = []   # one [datasetId, modelName, mean accuracy] row per file

for dataset_id in dataset_ids:
    for model_name, _estimator in models:

        # Load model results (raises if the result file has not been produced yet)
        load_path = f'./results/{dataset_id}-{model_name}.csv'
        df = pd.read_csv(load_path)

        # Collect now and build the global frames once after the loop
        fold_frames.append(df)
        mean_records.append([dataset_id, model_name, df['accuracy'].mean()])

if fold_frames:
    all_results = pd.concat(fold_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; fall back to an empty, typed-by-name frame
    all_results = pd.DataFrame(columns=result_columns)

# Averaged accuracy of the K-fold results of each model
avg_results = pd.DataFrame(mean_records, columns=result_columns)

# Saving grouped results
save_path = './results/ALL-RESULTS.csv'
all_results.to_csv(save_path, index=False)

save_path = './results/RESULTS.csv'
avg_results.to_csv(save_path, index=False)