In [2]:
import numpy as np
import pandas as pd

# models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# data manipulation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the wine data as a 2-D NumPy array; `training` treats every column
# except the last as a feature and the last column as the class label.
wine_dataset = np.load("../data/wine_dataset.npy")

In [7]:
# Display the array dimensions as (rows, columns).
wine_dataset.shape

(1143, 12)

In [24]:
def _build_model_and_grid(model_name, random_state):
    """Return an unfitted estimator and the hyperparameter grid to search for it.

    ``"LogisticRegression"`` is tuned over ``C``; ``"XGBClassifier"`` and
    ``"RandomForestClassifier"`` are tuned over ``max_depth``. Any other name
    falls back to a ``DecisionTreeClassifier`` tuned over ``max_depth``.
    """
    if model_name == "LogisticRegression":
        # NOTE(review): features are passed unscaled; StandardScaler is imported
        # by the notebook but not applied here -- confirm this is intended.
        return LogisticRegression(), {'C': np.linspace(0.001, 0.999, 10)}

    # All remaining models share the same tree-depth grid.
    depth_grid = {'max_depth': list(range(1, 12))}
    if model_name == "XGBClassifier":
        return XGBClassifier(), depth_grid
    if model_name == "RandomForestClassifier":
        # Seeded so repeated runs select the same hyperparameters.
        return RandomForestClassifier(random_state=random_state), depth_grid
    # Fallback for every unrecognised name (kept for backward compatibility).
    return DecisionTreeClassifier(random_state=random_state), depth_grid


def training(model_name, test_sizes=(0.8, 0.5, 0.2), random_state=1):
    """Grid-search one model family on several train/test splits of ``wine_dataset``.

    For each fraction in ``test_sizes`` the module-level ``wine_dataset`` array is
    split (last column = label, all other columns = features) and a 5-fold
    cross-validated ``GridSearchCV`` scored on accuracy is fitted on the training
    portion only.

    Parameters
    ----------
    model_name : str
        ``"LogisticRegression"``, ``"XGBClassifier"`` or
        ``"RandomForestClassifier"``. Any other value selects a
        ``DecisionTreeClassifier``.
    test_sizes : iterable of float, optional
        Test-set fractions to evaluate. Pass e.g. ``[0.2]`` for a quick sanity
        check. Defaults to the three splits used throughout the notebook.
    random_state : int, optional
        Seed used for the train/test split and for the stochastic tree-based
        estimators, so that a re-run reproduces the same search results.

    Returns
    -------
    dict
        Maps each test-size fraction to its fitted ``GridSearchCV`` object.
        The held-out test portion is not returned; recreate it by calling
        ``train_test_split`` with the same ``test_size`` and ``random_state``.

    Raises
    ------
    Exception
        Any error raised while fitting a candidate is propagated, because the
        search is configured with ``error_score='raise'``.
    """
    # Slicing does not depend on the split size, so do it once.
    features = wine_dataset[:, :-1]
    labels = wine_dataset[:, -1]

    splits = {}
    for size in test_sizes:
        # Only the training portion is needed for the hyperparameter search.
        X_train, _X_test, y_train, _y_test = train_test_split(
            features, labels, test_size=size, random_state=random_state
        )

        # A fresh estimator per split keeps the searches independent.
        model, param_grid = _build_model_and_grid(model_name, random_state)

        # Cross validation to search for the best model hyperparameters.
        grid_search = GridSearchCV(
            model, param_grid, cv=5, scoring='accuracy',
            verbose=True, error_score='raise',
        )
        grid_search.fit(X_train, y_train)
        splits[size] = grid_search
    return splits

In [25]:
def draw_heatmap(errors, C_list, title):
    """Show a one-column annotated heatmap with one row per hyperparameter value.

    Parameters
    ----------
    errors : array-like
        Numeric values to display, one per hyperparameter setting. Flattened
        into a single column.
    C_list : array-like
        Hyperparameter values used as row labels, in the same order as
        ``errors``.
    title : str
        Title drawn above the heatmap.
    """
    error_column = np.asarray(errors).reshape(-1, 1)
    # Use flat labels: an (n, 1) array would render every tick as "[value]".
    row_labels = np.asarray(C_list).ravel().tolist()

    fig, ax = plt.subplots(figsize=(2, 4))
    sns.heatmap(
        error_column, annot=True, fmt='.3f',
        yticklabels=row_labels, xticklabels=[], ax=ax,
    )
    ax.collections[0].colorbar.set_label('error')
    ax.set(ylabel='Hyperparameter')
    # Pin the y-limits to the full grid (inverted axis, first row on top).
    # This stays correct on Matplotlib 3.1.1, which truncated the outer rows,
    # and adds no blank padding on other versions, unlike a relative +/-0.5 shift.
    ax.set_ylim(error_column.shape[0], 0)
    ax.set_title(title)
    plt.show()

Logistic Regression

In [28]:
# Grid-search logistic regression on each train/test split; `splits` maps every
# test-size fraction to its fitted GridSearchCV object.
splits = training("LogisticRegression")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
