In [3]:
# Imports
import importlib.util, os
import numpy as np
import pandas as pd

from sklearn.pipeline import FunctionTransformer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import pickle
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
import pickle

# Custom imports for visualisation
spec = importlib.util.spec_from_file_location("utilities", os.path.join("..", "Python-Files", "utilities.py"))
utilities = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utilities)
line_graph = utilities.line_graph
boxplot = utilities.boxplot

# Constants
K_FOLDS = 5
RANDOM_STATE = 420

# Settings  
pd.set_option('display.max_columns', None)

# Import data and split (the random state is important here to ensure that the same test set is used across models)
data = pd.read_csv('../Processed-Datasets/forestCover-Preprocessed.csv')
train, test = train_test_split(data, test_size=0.2, random_state=42)

t-test

In [4]:
with open('../Pickle-Objects/ClassifierTree-Final-GridSearch.pkl', 'rb') as f:
    tree_grid_search = pickle.load(f)

In [5]:
print(tree_grid_search.best_params_)

tree_pipeline = Pipeline([
    ("scaler", RobustScaler(unit_variance=True)), 
    ("dt", DecisionTreeClassifier(ccp_alpha=tree_grid_search.best_params_['dt__ccp_alpha'],
                                  class_weight=tree_grid_search.best_params_['dt__class_weight'],
                                  max_depth=tree_grid_search.best_params_['dt__max_depth'],
                                  max_features=tree_grid_search.best_params_['dt__max_features'],
                                  min_samples_leaf=tree_grid_search.best_params_['dt__min_samples_leaf'],
                                  min_samples_split=tree_grid_search.best_params_['dt__min_samples_split']))
])

{'dt__ccp_alpha': 0.0, 'dt__class_weight': None, 'dt__max_depth': 32, 'dt__max_features': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}


In [6]:
knn_pipeline = Pipeline([
    ("scaler", RobustScaler(unit_variance=True)), 
    ("knn", KNeighborsClassifier(weights='distance', 
                                 n_neighbors=4))
])

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
import numpy as np
RANDOM_STATE = 420
train_subset = train.sample(n=int(train.shape[0] * 0.5), random_state=RANDOM_STATE)

X = train_subset.drop(columns=['Cover_Type'])
y = train_subset['Cover_Type']

knn_scores = cross_val_score(knn_pipeline, X, y, cv=10, scoring='f1_macro')
tree_scores = cross_val_score(tree_pipeline, X, y, cv=10, scoring='f1_macro')

In [9]:
t_stat, p_value = stats.ttest_rel(knn_scores, tree_scores)

In [10]:
t_stat

np.float64(-4.510944095451361)

In [11]:
p_value

np.float64(0.0014657212985050437)

In [12]:
knn_scores

array([0.84539201, 0.86390949, 0.84582565, 0.85279219, 0.84738556,
       0.85492138, 0.84680692, 0.84802461, 0.84438997, 0.84820712])

In [13]:
tree_scores

array([0.85892957, 0.86409771, 0.85622224, 0.87096765, 0.85908199,
       0.85267082, 0.85529706, 0.85407471, 0.85653897, 0.85698209])

In [14]:
print(f'KNN min: {min(knn_scores):.4f}')
print(f'Classifier Tree min: {min(tree_scores):.4f}')

print(f'KNN max: {max(knn_scores):.4f}')
print(f'Classifier Tree max: {max(tree_scores):.4f}')

print(f'KNN mean: {np.mean(knn_scores):.4f}')
print(f'Classifier Tree mean: {np.mean(tree_scores):.4f}')

print(f'KNN std dev: {np.std(knn_scores):.4f}')
print(f'Classifier Tree std dev: {np.std(tree_scores):.4f}')

print(f'T-statistic: {t_stat:.4f}')
print(f'P-value: {p_value:.4f}')

KNN min: 0.8444
Classifier Tree min: 0.8527
KNN max: 0.8639
Classifier Tree max: 0.8710
KNN mean: 0.8498
Classifier Tree mean: 0.8585
KNN std dev: 0.0056
Classifier Tree std dev: 0.0051
T-statistic: -4.5109
P-value: 0.0015


Wilcoxon

In [16]:
will_stat, will_p = stats.wilcoxon(knn_scores, tree_scores)

In [17]:
will_stat

np.float64(2.0)

In [18]:
will_p

np.float64(0.005859375)