In [1]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics 

import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt


In [2]:
# Single seed shared by every stochastic component in the notebook
# (train/test splits, scikit-learn estimators, NumPy, TensorFlow).
SEED = 42

import os
# NOTE(review): presumably asks TensorFlow for deterministic cuDNN kernels — confirm
# for the installed TF version. tensorflow.keras is already imported in the first
# cell, so this variable is set after TensorFlow's initial import.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

import random 
random.seed(SEED)  # Python's built-in RNG

import numpy as np
np.random.seed(SEED)  # NumPy's global RNG

import tensorflow as tf
tf.random.set_seed(SEED)  # TensorFlow's global seed


# import datasets

In [87]:
#German dataset
def German():
    """Load the German credit data and split it 70/30 into train and test sets.

    Reads ``datasets/german.data-numeric.txt`` as a header-less,
    whitespace-separated table. Column 24 is used as the target and every
    other column as a feature.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
        with ``test_size=0.3`` and ``random_state=SEED``.
    """
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True flag and parses the file the same way.
    df = pd.read_table(os.path.join("datasets", "german.data-numeric.txt"), sep=r"\s+", header=None)
    X = df.drop(columns=[24])
    y = df[24]
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)
    return X_train, X_test, y_train, y_test

In [None]:
def Australian():
    """Load a credit dataset and split it 70/30 into train and test sets.

    FIXME(review): despite its name, this function reads
    ``datasets/german.data-numeric.txt`` and uses column 24 as the target,
    exactly like ``German()``. The Australian file name and its target column
    are not visible in this notebook, so they are left unchanged here and must
    be filled in before the results are reported as "Australian".

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
        with ``test_size=0.3`` and ``random_state=SEED``.
    """
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True flag and parses the file the same way.
    df = pd.read_table(os.path.join("datasets", "german.data-numeric.txt"), sep=r"\s+", header=None)
    X = df.drop(columns=[24])
    y = df[24]
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)
    return X_train, X_test, y_train, y_test

# decision tree

In [88]:
def DT(X_train, X_test, y_train, y_test):
    """Tune a decision tree with a 3-fold grid search and score it on the test split.

    Args:
        X_train, y_train: training features and labels used for the grid search
            and for fitting the final tree.
        X_test, y_test: held-out features and labels used only for scoring.

    Returns:
        tuple: ``(accuracy, f1, best_params)`` where ``best_params`` is the
        dict of grid-search winners.
    """
    print("Grid searching best parameters for decision tree")
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 'log2'],
        'max_depth': list(range(1, 10))
    }

    # Seed the estimator used *inside* the search as well: random splits and
    # feature sub-sampling make tree building stochastic, so an unseeded search
    # can pick different "best" parameters on every run.
    grid_search = GridSearchCV(DecisionTreeClassifier(random_state=SEED), param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # With the default refit=True the search has already retrained the winning
    # configuration on the full training split, so no manual refit is needed.
    clf = grid_search.best_estimator_

    # Predict the response for the test dataset
    y_pred = clf.predict(X_test)
    f = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy for decision tree:", acc)
    print("F-score for decision tree:", f)
    print("\n")
    return acc, f, best_params

# Model Accuracy, how often is the classifier correct?


# random forest

In [98]:
def RF(X_train, X_test, y_train, y_test):
    """Tune a random forest with a 3-fold randomized search and score it on the test split.

    Args:
        X_train, y_train: training features and labels used for the search and
            for fitting the final forest.
        X_test, y_test: held-out features and labels used only for scoring.

    Returns:
        tuple: ``(accuracy, f1, best_params)`` where ``best_params`` is the
        dict of winning hyper-parameters.
    """
    print("Grid searching best parameters for random forest")
    param_grid = {'n_estimators': list(range(800, 1600, 200)),
                  'max_depth': list(range(10, 110, 10)),
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'bootstrap': [True, False]}

    # Seed both the sampler (which candidates are drawn from param_grid) and the
    # forest itself; otherwise candidates and CV scores change between runs.
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=SEED),
        param_grid,
        cv=3,
        random_state=SEED)
    search.fit(X_train, y_train)
    best_params = search.best_params_

    # With the default refit=True the winning configuration has already been
    # retrained on the full training split.
    rf = search.best_estimator_

    # Use the forest's predict method on the test data
    y_pred = rf.predict(X_test)
    f = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy for random forest:", acc)
    print("Best F-score for random forest:", f)
    print("\n")
    return acc, f, best_params

# svm

In [1]:
def SVM(X_train, X_test, y_train, y_test):
    """Tune an SVC with a 3-fold randomized search and score it on the test split.

    Args:
        X_train, y_train: training features and labels used for the search and
            for fitting the final classifier.
        X_test, y_test: held-out features and labels used only for scoring.

    Returns:
        tuple: ``(accuracy, f1, best_params)`` where ``best_params`` is the
        dict of winning hyper-parameters.
    """
    print("Grid searching best parameters for SVM")

    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.05, 0.1, 0.15, 0.20, 0.25],
        'degree': list(range(1, 7)),
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
    }

    # NOTE(review): features are passed to SVC unscaled; kernel SVMs are usually
    # sensitive to feature scale, which may affect both quality and fit time — confirm.
    # Seed the sampler so the same candidate configurations are drawn on every run.
    search = RandomizedSearchCV(
        svm.SVC(random_state=SEED),
        param_grid,
        cv=3,
        random_state=SEED)
    search.fit(X_train, y_train)
    best_params = search.best_params_

    # With the default refit=True the winning configuration has already been
    # retrained on the full training split.
    clf = search.best_estimator_

    # Predict the response for the test dataset
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f = metrics.f1_score(y_test, y_pred)

    print("Accuracy for svm:", acc)
    print("Best F-score for svm:", f)
    print("\n")
    return acc, f, best_params

# neural network

In [91]:
def model(hp):
    """Build and compile a one-hidden-layer binary classifier for KerasTuner.

    Args:
        hp: KerasTuner hyper-parameter container. Two entries are registered:
            ``no_neurons_1`` (hidden layer width, 32 to 256 in steps of 32) and
            ``learning_rate`` (0.001 to 0.5, sampled on a log scale).

    Returns:
        A compiled ``Sequential`` network (SGD optimizer, binary cross-entropy
        loss, accuracy metric).
    """
    dr = 0.2  # dropout rate
    # A layer width must be an integer, so sample it with hp.Int; hp.Float
    # would hand Dense a float such as 64.0.
    no_neurons_1 = hp.Int('no_neurons_1', min_value=32, max_value=256, step=32)

    rate = hp.Float('learning_rate', 0.001, 0.5, sampling="log")

    # No input_shape is given: Keras infers it from the first batch it sees.
    # This keeps the builder independent of whichever X_train happens to be
    # defined at notebook level when the tuner calls it.
    net = Sequential([
        Dense(no_neurons_1, activation="relu"),
        Dropout(rate=dr),
        Dense(1, activation="sigmoid")
    ])
    net.compile(optimizer=keras.optimizers.SGD(learning_rate=rate),
                loss="binary_crossentropy",
                metrics=["accuracy"])

    return net

In [92]:
def best_model(n,l, X_train, X_test, y_train, y_test):
    """Train the tuned one-hidden-layer network and score it on the test split.

    Args:
        n: number of units in the hidden layer (cast to ``int`` before use).
        l: learning rate for the SGD optimizer.
        X_train, y_train: training features and labels.
        X_test, y_test: features and labels used both as Keras validation data
            (for early stopping) and for the final scores.

    Returns:
        tuple: ``(accuracy, f1, None)``; the third slot mirrors the
        ``best_params`` position returned by the other model functions.
    """
    dr = 0.2  # dropout rate
    early_stopping = EarlyStopping(monitor="val_loss", patience=3)
    max_epochs = 100  # number of maximum epochs
    batch = 64  # batch size

    # Binary cross-entropy with a sigmoid output needs 0/1 targets. Map the
    # smallest label to 0 and the largest to 1 so that label encodings other
    # than 0/1 (for example 1/2) still train against valid targets; for labels
    # that are already 0/1 this mapping is the identity.
    classes = np.unique(y_train)
    negative, positive = classes[0], classes[-1]
    y_train_bin = (np.asarray(y_train) == positive).astype("float32")
    y_test_bin = (np.asarray(y_test) == positive).astype("float32")

    net = Sequential([
        Dense(int(n), input_shape=(X_train.shape[1],), activation="relu"),
        Dropout(rate=dr),
        Dense(1, activation="sigmoid")
    ])
    net.compile(optimizer=keras.optimizers.SGD(learning_rate=l),
                loss="binary_crossentropy",
                metrics=["accuracy"])
    # NOTE(review): early stopping monitors the test split, so the reported
    # scores are not from fully unseen data — consider a separate validation split.
    net.fit(X_train, y_train_bin,
            validation_data=(X_test, y_test_bin),
            batch_size=batch,
            epochs=max_epochs,
            verbose=0,
            callbacks=[early_stopping])

    # predict() returns sigmoid probabilities; the classification metrics need
    # hard class labels, so threshold at 0.5 and map back to the original labels.
    proba = np.asarray(net.predict(X_test)).ravel()
    y_pred = np.where(proba > 0.5, positive, negative)
    acc = metrics.accuracy_score(y_test, y_pred)
    f = metrics.f1_score(y_test, y_pred)
    print("Accuracy for neural network:", acc)
    print("Best F-score for neural network:", f)
    print("\n")

    return acc, f, None

In [93]:
def NN(X_train, X_test, y_train, y_test):
    """Tune the neural network with KerasTuner random search, then train and score it.

    Runs up to 10 random-search trials over the hyper-parameters registered by
    ``model``, picks the trial with the best ``val_accuracy`` and hands its
    hidden width and learning rate to ``best_model``.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: features and labels used as validation data during the
            search and for the final scores.

    Returns:
        tuple: whatever ``best_model`` returns, i.e. ``(accuracy, f1, None)``.
    """
    # seed=SEED makes the sequence of sampled trials reproducible.
    tuner = kt.RandomSearch(
        model,
        objective="val_accuracy",
        max_trials=10,
        seed=SEED,
        overwrite=True)

    early_stopping = EarlyStopping(monitor="val_loss", patience=3)
    max_epochs = 100  # number of maximum epochs
    batch = 64  # batch size

    # The tuned network ends in a sigmoid trained with binary cross-entropy, so
    # the search needs 0/1 targets: smallest label -> 0, largest label -> 1.
    # For labels that are already 0/1 this mapping is the identity.
    classes = np.unique(y_train)
    y_train_bin = (np.asarray(y_train) == classes[-1]).astype("float32")
    y_test_bin = (np.asarray(y_test) == classes[-1]).astype("float32")

    # NOTE(review): hyper-parameters are selected on the test split — consider
    # a separate validation split to keep the final scores unbiased.
    tuner.search(X_train, y_train_bin,
                 validation_data=(X_test, y_test_bin),
                 batch_size=batch,
                 epochs=max_epochs,
                 verbose=0,
                 callbacks=[early_stopping])

    # Look the winning trial up once and read both values from it.
    best_hp = tuner.get_best_hyperparameters()[0]
    n = best_hp.get("no_neurons_1")
    l = best_hp.get("learning_rate")

    # best_model receives the original labels, exactly as before.
    return best_model(n, l, X_train, X_test, y_train, y_test)


In [94]:
def run_all_models(X_train, X_test, y_train, y_test):
    """Fit and score every classifier on a single train/test split.

    The decision tree, random forest, SVM and neural network pipelines are run
    in that order, each receiving the same four arrays.

    Returns:
        dict: four parallel lists keyed by ``'model'``, ``'accuracy'``,
        ``'f1_score'`` and ``'model_parameters'``, one entry per classifier.
    """
    pipelines = (
        ('decision tree', DT),
        ('random forest', RF),
        ('svm', SVM),
        ('neural network', NN),
    )

    summary = {'model': [], 'accuracy': [], 'f1_score': [], 'model_parameters': []}
    for label, fit_and_score in pipelines:
        accuracy, f_score, tuned_params = fit_and_score(X_train, X_test, y_train, y_test)
        summary['model'].append(label)
        summary['accuracy'].append(accuracy)
        summary['f1_score'].append(f_score)
        summary['model_parameters'].append(tuned_params)
    return summary
    

# evaluate all models

In [99]:
#German dataset
# Split the German data, run every model on that split, and collect the scores.
X_train, X_test, y_train, y_test = German()
german_model_score = run_all_models(X_train, X_test, y_train, y_test)
# run_all_models returns one entry per model (four models), hence the repeat count of 4.
german_model_score['dataset'] = ['german']*4

Grid searching best parameters for decision tree
Accuracy for decision tree: 0.6866666666666666
F-score for decision tree: 0.7991452991452991


Grid searching best parameters for random forest
Accuracy for random forest: 0.76
Best F-score for random forest: 0.8461538461538461


Grid searching best parameters for SVM
