In [50]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics 
import seaborn as sns
import matplotlib.pyplot as plt
# Turn interactive plotting off, show plot only when plt.show() is called
plt.ioff()

import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt


In [51]:
SEED = 42

import os
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

import random 
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)

INPUT = "datasets"
OUTPUT = "part3_results"
if not os.path.exists(OUTPUT):
    os.makedirs(OUTPUT)

# Import datasets

In [52]:
# preprocessing method
def cat_2_num(df:pd.DataFrame):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].astype('category')
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

## UCI datasets

In [53]:
#German dataset
def German():
    df = pd.read_table(os.path.join(INPUT,"german.data-numeric"),delim_whitespace = True, header = None)
    df[24] = df[24]-1    # change label from 1,2 to 0,1
    return train_test_split(df.drop(columns = [24]), df[24].astype(bool), test_size=0.3, random_state=SEED, stratify=df[24]) # 70% training and 30% test

In [54]:
def Australian():
    df = pd.read_table(os.path.join(INPUT,"australian.dat"),delim_whitespace = True, header = None)
    return train_test_split(df.drop(columns = [14]), df[14].astype(bool), test_size=0.3, random_state=SEED, stratify=df[14]) # 70% training and 30% test

In [55]:
def Crx():
    df = pd.read_csv(os.path.join(INPUT,"crx.data"), header = None)
    # drop entries with ?
    df = df.replace("?", np.nan).dropna()
    # convert category data to numerical data
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = [15]), df[15].astype(bool), test_size=0.3, random_state=SEED, stratify=df[15]) # 70% training and 30% test

In [56]:
def Hepatitis():
    df = pd.read_csv(os.path.join(INPUT,"hepatitis.data"), header = None)
    df = cat_2_num(df)
    df[19] = df[19]-1 # change to 0 or 1
    return train_test_split(df.drop(columns = [19]), df[19].astype(bool), test_size=0.3, random_state=SEED, stratify=df[19]) # 70% training and 30% test

In [57]:
def Ionosphere():
    df = pd.read_csv(os.path.join(INPUT, "ionosphere.data"), header=None)
    df = cat_2_num(df)
    
    return train_test_split(df.drop(columns = [34]), df[34].astype(bool), test_size=0.3, random_state=SEED, stratify=df[34]) # 70% training and 30% test

## Additional Kaggle datasets

In [58]:
def Pumpkin():
    df = pd.read_excel(os.path.join("datasets",'Pumpkin_Seeds_Dataset.xlsx'), sheet_name='Pumpkin_Seeds_Dataset',engine='openpyxl')
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = ['Class']), df['Class'].astype(bool), test_size=0.3, random_state=SEED, stratify=df['Class'])

In [59]:
# 5644 samples, relatively large dataset
def Mushroom():
    df = pd.read_csv(os.path.join(INPUT,'mushrooms.csv'))
    df = df.replace("?", np.nan).dropna()
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = ['class']), df['class'].astype(bool), test_size=0.3, random_state=SEED, stratify=df['class'])

In [60]:
def Diabetes():
    df = pd.read_csv(os.path.join(INPUT,'diabetes_data.csv'), sep=';')
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = ['class']), df['class'].astype(bool), test_size=0.3, random_state=SEED, stratify=df['class'])

# Models

In [61]:
def print_dataset_stats(X_train, X_test, y_train, y_test, name:str):
    print(f"Current dataset: {name}")
    print(f"X_train: {X_train.shape}; y_train: {y_train.shape}")#\ny_train value counts:\n{y_train.value_counts()}")
    print(f"X_test: {X_test.shape}; y_test: {y_test.shape}")#\ny_test value counts:\n{y_test.value_counts()}")
    print("")
    return {
        'X_train shape':X_train.shape, 
        'y_train shape':y_train.shape, 
        'X_test shape':X_test.shape, 
        'y_test shape':y_test.shape
        }

In [62]:
counter = 1
def eval_result(y_test, y_pred, model_name):
    global counter
    image_name=f"{counter}_{model_name}.png"
    f = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    ctx = metrics.confusion_matrix(y_test, y_pred)
    plt.figure(counter)
    sns.heatmap(ctx, cmap='Oranges', annot=True, fmt='g')
    plt.savefig(os.path.join(OUTPUT, image_name))
    plt.close(counter)
    print(f"Accuracy for {model_name}:", acc)
    print(f"F-score for {model_name}:", f)
    print()
    counter += 1
    return f, acc, image_name

## Decision Tree

In [63]:
def DT(X_train, X_test, y_train, y_test):
    
    print(f"---------------- Decision Tree ---------------------")
    param_grid = { 
        'criterion': ['gini','entropy'],
        'splitter': ['best','random'],
        'max_features': ['sqrt','log2'],
        'max_depth':list(range(1,10))
    }

    # grid search for best parameters
    
    grid_search = GridSearchCV(DecisionTreeClassifier(random_state=SEED), param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # Create Decision Tree classifer object
    clf = DecisionTreeClassifier(
        criterion = best_params['criterion'],
        splitter = best_params['splitter'], 
        max_features = best_params['max_features'], 
        max_depth = best_params['max_depth'], 
        random_state=SEED)
    clf = clf.fit(X_train,y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    return eval_result(y_test, y_pred, "decision tree"), best_params

# Model Accuracy, how often is the classifier correct?


## Ramdom Forest

In [64]:
def RF(X_train, X_test, y_train, y_test):

    print(f"---------------- Random Forest ---------------------")

    print("Randomized searching best parameters...")
    param_grid = {'n_estimators': list(range(800, 1600, 200)),
               'max_depth': list(range(10,110,10)),
               'min_samples_split': [2,5,10],
               'min_samples_leaf': [1,2,4],
               'bootstrap': [True, False]}
    grid_search = RandomizedSearchCV(RandomForestClassifier(random_state=SEED), param_grid, cv= 3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f"Best parameters found {best_params}")

    # train model using the best parameters
    rf = RandomForestClassifier(
        n_estimators = best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split = best_params['min_samples_split'], 
        min_samples_leaf=best_params['min_samples_leaf'],
        bootstrap=best_params['bootstrap'],
        random_state = SEED)
    # Train the model on training data
    rf = rf.fit(X_train, y_train)
    # Use the forest's predict method on the test data
    y_pred = rf.predict(X_test)
    return eval_result(y_test, y_pred, "random forest"), best_params

## SVM

In [65]:
def SVM(X_train, X_test, y_train, y_test):

    print(f"-------------------- SVM -------------------------")
    print("Randomized searching best parameters...")

    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'kernel' : ['rbf', 'linear', 'poly', 'sigmoid']
        }
    
    grid_search = RandomizedSearchCV(svm.SVC(random_state=SEED), param_grid, cv = 3)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print(f"Best parameters found {best_params}")

    # Create a svm Classifier
    clf = svm.SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        random_state = SEED) # Linear Kernel
    #Train the model using the training sets
    clf.fit(X_train, y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    return eval_result(y_test, y_pred, "svm"), best_params

## Neural Network

In [66]:
def model(hp):
    dr = 0.2 #dropout rate
    no_neurons_1 = hp.Float('no_neurons_1', 32, 256, step=32)

    rate = hp.Float('learning_rate', 0.001, 0.5, sampling="log")

    model = Sequential([
        Dense(no_neurons_1, input_shape = (X_train.shape[1],), activation = "relu"),    # not sure if it is best practice
        Dropout(rate = dr),
        Dense(1, activation = "sigmoid")
    ])
    model.compile(optimizer = keras.optimizers.SGD(learning_rate = rate),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"]
    )
    
    return model

In [67]:
def best_model(n,l, X_train, X_test, y_train, y_test):
    dr = 0.2 #dropout rate
    early_stopping = EarlyStopping(monitor = "val_loss", patience = 3)
    max_epochs = 100 #number of maximum epochs
    batch = 64 #batch size


    model = Sequential([
        Dense(n, input_shape = (X_train.shape[1],), activation = "relu"),
        Dropout(rate = dr),
        Dense(1, activation = "sigmoid")
    ])
    model.compile(optimizer = keras.optimizers.SGD(learning_rate = l),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"]
    )
    model.fit(X_train, y_train,
             validation_data=(X_test, y_test),
             batch_size = batch,
             epochs = max_epochs,
             verbose = 0, 
             callbacks = [early_stopping])
    
    y_pred = np.round(model.predict(X_test))
    
    return eval_result(y_test, y_pred, "neural network"), None

In [68]:
def NN(X_train, X_test, y_train, y_test):
    tuner = kt.RandomSearch(
        model,
        objective="val_accuracy",
        max_trials=10,
        overwrite=True)

    early_stopping = EarlyStopping(monitor = "val_loss", patience = 3)
    max_epochs = 100 #number of maximum epochs
    batch = 64 #batch size

    tuner.search(X_train, y_train,
                 validation_data=(X_test, y_test),
                 batch_size = batch,
                 epochs = max_epochs,
                 verbose = 0, 
                 callbacks = [early_stopping])

    n = tuner.get_best_hyperparameters()[0].get("no_neurons_1")
    l = tuner.get_best_hyperparameters()[0].get("learning_rate")

    return best_model(n,l, X_train, X_test, y_train, y_test)


In [69]:
models = [DT, RF, SVM, NN]
def run_all_models(X_train, X_test, y_train, y_test):
    stats = {'model':[],'accuracy':[],'f1_score':[],'cfmtx_img':[],'model_parameters':[]}
    for model in models:
        (acc, f, ctx), best_params = model(X_train, X_test, y_train, y_test)
        stats['model'].append(model.__name__)
        stats['accuracy'].append(acc)
        stats['f1_score'].append(f)
        stats['cfmtx_img'].append(ctx)
        stats['model_parameters'].append(best_params)
    return stats

## Running all models

In [70]:
# For testing purposes only, need to delete
# X_train, X_test, y_train, y_test = Pumpkin()
# stats = print_dataset_stats(X_train, X_test, y_train, y_test, Mushroom.__name__)
# model_score = run_all_models(X_train, X_test, y_train, y_test)

In [71]:
#German dataset
dataset_funcs = [German, Australian, Crx, Hepatitis, Ionosphere, Pumpkin, Mushroom, Diabetes]
stats_list = []
for dataset_getter in dataset_funcs:
    X_train, X_test, y_train, y_test = dataset_getter()
    stats = print_dataset_stats(X_train, X_test, y_train, y_test, dataset_getter.__name__)
    model_score = run_all_models(X_train, X_test, y_train, y_test)
    model_score['name'] = [dataset_getter.__name__]*4
    model_score['stats'] = [stats]*4
    stats_list.append(pd.DataFrame(model_score))
    print("===============================================\n")
pd.concat(stats_list).to_csv(os.path.join(OUTPUT, "model_scores.tsv"), sep='\t')

Current dataset: German
X_train: (700, 24); y_train: (700,)
X_test: (300, 24); y_test: (300,)

---------------- Decision Tree ---------------------
Accuracy for decision tree: 0.74
F-score for decision tree: 0.5760869565217391

---------------- Random Forest ---------------------
Randomized searching best parameters...
Best parameters found {'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 30, 'bootstrap': False}
Accuracy for random forest: 0.7733333333333333
F-score for random forest: 0.5277777777777777

-------------------- SVM -------------------------
Randomized searching best parameters...
Best parameters found {'kernel': 'linear', 'C': 0.1}
Accuracy for svm: 0.7633333333333333
F-score for svm: 0.5477707006369428

INFO:tensorflow:Oracle triggered exit
Accuracy for neural network: 0.7666666666666667
F-score for neural network: 0.4776119402985074


Current dataset: Australian
X_train: (483, 14); y_train: (483,)
X_test: (207, 14); y_test: (207,)

---