In [1]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics 

import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt

In [2]:
SEED = 42

import os
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

import random 
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)


# import datasets

In [3]:
# preprocessing method
def cat_2_num(df:pd.DataFrame):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].astype('category')
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

In [4]:
#German dataset
def German():
    df = pd.read_table(os.path.join("datasets","german.data-numeric.txt"),delim_whitespace = True, header = None)
    return train_test_split(df.drop(columns = [24]), df[24], test_size=0.3, random_state=SEED, stratify=df[24]) # 70% training and 30% test

In [5]:
def Australian():
    df = pd.read_table(os.path.join("datasets","australian.dat"),delim_whitespace = True, header = None)
    return train_test_split(df.drop(columns = [14]), df[14], test_size=0.3, random_state=SEED, stratify=df[14]) # 70% training and 30% test

In [6]:
def Crx():
    df = pd.read_csv(os.path.join("datasets","crx.data"), header = None)
    # drop entries with ?
    df = df.replace("?", np.nan).dropna()
    # convert category data to numerical data
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = [15]), df[15], test_size=0.3, random_state=SEED, stratify=df[15]) # 70% training and 30% test

In [7]:
def Horse():
    df = pd.read_table(os.path.join("datasets","horse-colic.data"), delim_whitespace=True, header = None)
    # instead of dropna, treat '?' as a separate class. reason: drop would leave only 68 entries
    df = cat_2_num(df)

    df_test = pd.read_table(os.path.join("datasets","horse-colic.test"), delim_whitespace=True, header = None)
    df_test = cat_2_num(df_test)
    return df.drop(columns = [24]), df_test.drop(columns = [24]), df[24], df_test[24]

In [8]:
def Vehicle():
    vehicles_dfs = [pd.read_table(os.path.join("datasets",f), delim_whitespace=True, header = None) for f in os.listdir('datasets') if f.startswith('xa')]
    df = pd.concat(vehicles_dfs)
    df.replace("?", np.nan).notna()
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = [18]), df[18], test_size=0.3, random_state=SEED, stratify=df[18])

#### Additional 3 datasets

In [9]:
def Pumpkin():
    df = pd.read_excel(os.path.join("datasets",'Pumpkin_Seeds_Dataset.xlsx'), sheet_name='Pumpkin_Seeds_Dataset',engine='openpyxl')
    df = cat_2_num(df)
    return train_test_split(df.drop(columns = ['Class']), df['Class'], test_size=0.3, random_state=SEED, stratify=df['Class'])

In [10]:
def Water():
    df = pd.read_csv(os.path.join("datasets",'water_potability.csv'))
    df = df.dropna()
    return train_test_split(df.drop(columns = ['Potability']), df['Potability'], test_size=0.3, random_state=SEED, stratify=df['Potability'])

In [11]:
def Banking():
    df = pd.read_csv(os.path.join("datasets",'banking.csv'))
    df = df.dropna()
    df = cat_2_num(df)
    # random sample by class ( balance out data )
    g = df.groupby('y')
    df = g.apply(lambda x: x.sample(1500).reset_index(drop=True))
    return train_test_split(df.drop(columns = ['y']), df['y'], test_size=0.3, random_state=SEED, stratify=df['y'])

# decision tree

In [12]:
def DT(X_train, X_test, y_train, y_test):
    
    print("Grid searching best parameters for decision tree")
    param_grid = { 
        'criterion': ['gini','entropy'],
        'splitter': ['best','random'],
        'max_features': ['sqrt','log2'],
        'max_depth':list(range(1,10))
    }

    # grid search for best parameters
    
    grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv= 3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # Create Decision Tree classifer object
    clf = DecisionTreeClassifier(criterion = best_params['criterion'],splitter = best_params['splitter'], 
    max_features = best_params['max_features'], max_depth = best_params['max_depth'], random_state=SEED)
    clf = clf.fit(X_train,y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    f = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy for decision tree:", acc)
    print("F-score for decision tree:", f)
    print("==============================================================")
    return acc, f, best_params

# Model Accuracy, how often is the classifier correct?


# ramdom forest

In [13]:
def RF(X_train, X_test, y_train, y_test):
    print("Random searching best parameters for random forest")
    param_grid = {'n_estimators': list(range(800, 1600, 200)),
               'max_depth': list(range(10,110,10)),
               'min_samples_split': [2,5,10],
               'min_samples_leaf': [1,2,4],
               'bootstrap': [True, False]}
    # Instantiate model with 1000 decision trees
    grid_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv= 3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    rf = RandomForestClassifier(
        n_estimators = best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split = best_params['min_samples_split'], 
        min_samples_leaf=best_params['min_samples_leaf'],
        bootstrap=best_params['bootstrap'],
        random_state = SEED)
    # Train the model on training data
    rf = rf.fit(X_train, y_train)
    # Use the forest's predict method on the test data
    y_pred = rf.predict(X_test)
    f = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy for random forest:", acc)
    print("Best F-score for random forest:",f)
    print("==============================================================")
    return acc, f, best_params

# svm

In [14]:
def SVM(X_train, X_test, y_train, y_test):

    print("Random searching best parameters for SVM")

    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'kernel' : ['rbf', 'linear', 'poly','sigmoid']
        }
    
    grid_search = RandomizedSearchCV(svm.SVC(), param_grid, cv = 3)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

    #Create a svm Classifier
    clf = svm.SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        random_state = SEED) # Linear Kernel
    #Train the model using the training sets
    clf.fit(X_train, y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f = metrics.f1_score(y_test, y_pred)
#         print("kernel", i)
#         print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#         print("F-score:",f)

    print("Accuracy for svm:",acc)
    print("Best F-score for svm:",f)
    print("==========================================")
    return acc, f, best_params

# neural network

In [15]:
def model(hp):
    dr = 0.2 #dropout rate
    no_neurons_1 = hp.Float('no_neurons_1', 32, 256, step=32)

    rate = hp.Float('learning_rate', 0.001, 0.5, sampling="log")

    model = Sequential([
        Dense(no_neurons_1, input_shape = (X_train.shape[1],), activation = "relu"),    # not sure if it is best practice
        Dropout(rate = dr),
        Dense(1, activation = "sigmoid")
    ])
    model.compile(optimizer = keras.optimizers.SGD(learning_rate = rate),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"]
    )
    
    return model

In [16]:
def best_model(n,l, X_train, X_test, y_train, y_test):
    dr = 0.2 #dropout rate
    early_stopping = EarlyStopping(monitor = "val_loss", patience = 3)
    max_epochs = 100 #number of maximum epochs
    batch = 64 #batch size


    model = Sequential([
        Dense(n, input_shape = (X_train.shape[1],), activation = "relu"),
        Dropout(rate = dr),
        Dense(1, activation = "sigmoid")
    ])
    model.compile(optimizer = keras.optimizers.SGD(learning_rate = l),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"]
    )
    model.fit(X_train, y_train,
             validation_data=(X_test, y_test),
             batch_size = batch,
             epochs = max_epochs,
             verbose = 0, 
             callbacks = [early_stopping])
    
    y_pred = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f = metrics.f1_score(y_test, y_pred)
    print("Accuracy for neural network:",acc)
    print("Best F-score for neural network:",f)
    print("==============================================================")
    
    return acc, f, None

In [17]:
def NN(X_train, X_test, y_train, y_test):
    tuner = kt.RandomSearch(
        model,
        objective="val_accuracy",
        max_trials=10,
        overwrite=True)

    early_stopping = EarlyStopping(monitor = "val_loss", patience = 3)
    max_epochs = 100 #number of maximum epochs
    batch = 64 #batch size

    tuner.search(X_train, y_train,
                 validation_data=(X_test, y_test),
                 batch_size = batch,
                 epochs = max_epochs,
                 verbose = 0, 
                 callbacks = [early_stopping])

    n = tuner.get_best_hyperparameters()[0].get("no_neurons_1")
    l = tuner.get_best_hyperparameters()[0].get("learning_rate")

    return best_model(n,l, X_train, X_test, y_train, y_test)


In [18]:
def run_all_models(X_train, X_test, y_train, y_test):
    acc_dt, f_dt, best_params_dt = DT(X_train, X_test, y_train, y_test)
    acc_rf, f_rf, best_params_rf = RF(X_train, X_test, y_train, y_test)
    acc_svm, f_svm, best_params_svm = SVM(X_train, X_test, y_train, y_test)
    acc_nn, f_nn, best_params_nn = NN(X_train, X_test, y_train, y_test)
    return {
        'model':['decision tree', 'random forest', 'svm', 'neural network'],
        'accuracy':[acc_dt, acc_rf, acc_svm, acc_nn],
        'f1_score':[f_dt, f_rf, f_svm, f_nn],
        'model_parameters':[best_params_dt, best_params_rf, best_params_svm, best_params_nn]
        }

# evaluate all models

In [32]:
X_train, X_test, y_train, y_test = Australian()

In [33]:
y_train.value_counts()
NN(X_train, X_test, y_train, y_test)

0    268
1    215
Name: 14, dtype: int64

In [38]:
# for testing only
X_train, X_test, y_train, y_test = German()
NN(X_train, X_test, y_train, y_test)
# all prediction value is 1

INFO:tensorflow:Oracle triggered exit


ValueError: Data must be 1-dimensional

In [19]:
#German dataset
dataset_funcs = [German, Australian, Crx, Horse, Vehicle, Pumpkin, Water, Banking]
stats_list = []
for dataset_getter in dataset_funcs:
    print(f"Current dataset: {dataset_getter.__name__}")
    X_train, X_test, y_train, y_test = dataset_getter()
    print("=========== Training Set ===========")
    print(f"X_train: {X_train.shape}\ny_train: {y_train.shape}\nvalue counts:\n{y_train.value_counts()}")
    print("=========== Testing Set ===========")
    print(f"X_test: {X_test.shape}\ny_test: {y_test.shape}\nvalue counts:\n{y_test.value_counts()}")
    print("=========================")
    # model_score = run_all_models(X_train, X_test, y_train, y_test)
    # model_score['dataset'] = [dataset_getter.__name__]*4
    # stats_list.append(pd.DataFrame(model_score))
pd.concat(stats_list).to_csv("model_scores.tsv", sep='\t')

Current dataset: German
X_train: (700, 24)
y_train: (700,)
value counts:
1    490
2    210
Name: 24, dtype: int64
X_test: (300, 24)
y_test: (300,)
value counts:
1    210
2     90
Name: 24, dtype: int64
Current dataset: Australian
X_train: (483, 14)
y_train: (483,)
value counts:
0    268
1    215
Name: 14, dtype: int64
X_test: (207, 14)
y_test: (207,)
value counts:
0    115
1     92
Name: 14, dtype: int64
Current dataset: Crx
X_train: (457, 15)
y_train: (457,)
value counts:
1    250
0    207
Name: 15, dtype: int64
X_test: (196, 15)
y_test: (196,)
value counts:
1    107
0     89
Name: 15, dtype: int64
Current dataset: Horse
X_train: (300, 27)
y_train: (300,)
value counts:
0        56
3111     33
3205     29
2208     20
2205     13
         ..
5205      1
2305      1
5000      1
7400      1
11300     1
Name: 24, Length: 61, dtype: int64
X_test: (68, 27)
y_test: (68,)
value counts:
0        11
3111      8
3205      6
2209      4
2205      4
2208      3
7111      3
4124      2
4206      2
3

ValueError: No objects to concatenate