In [1]:
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from imblearn.under_sampling import NearMiss

In [2]:
# Load the dataset; 'Activity' is the class label, every other column is a feature.
df = pd.read_csv('dataset.csv')

# Standardise the features. DataFrame.drop already returns a new frame and
# StandardScaler copies its input by default, so no explicit .copy() is needed.
X = StandardScaler().fit_transform(df.drop(columns=['Activity']))
y = df['Activity']

# Balance the classes by under-sampling with NearMiss.
# fit_resample is the supported sampler API; fit_sample was a deprecated alias
# that is no longer available in current imbalanced-learn releases.
X, y = NearMiss(n_jobs=-1).fit_resample(X, y)

# NOTE(review): scaling and under-sampling are both fitted on the full dataset
# before cross-validation, so every CV test fold has influenced the scaler's
# statistics and the NearMiss selection. The cross-validated scores below are
# therefore likely optimistic; consider moving both steps into an
# imblearn.pipeline.Pipeline that is passed to cross_validate.

# balanced_accuracy = mean of the per-class recall
#                     (reduces to (recall + specificity) / 2 for two classes)
# roc_auc_ovr = ROC AUC OneVsRest (sensitive to class imbalance)
# roc_auc_ovo = ROC AUC OneVsOne (insensitive to class imbalance with macro
#               averaging; the prevalence-weighted variants are used here)
metrics = ['f1_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'accuracy']

## KNN

In [3]:
# One cross_validate result dict per candidate k, in the order the values are tried.
knn_results = []

# (label printed in the report, key of the matching test scores in the CV result)
knn_report_rows = [
    ("f1", 'test_f1_weighted'),
    ("ROC AUC OneVsRest", 'test_roc_auc_ovr_weighted'),
    ("ROC AUC OneVsOne", 'test_roc_auc_ovo_weighted'),
    ("balanced accuracy", 'test_balanced_accuracy'),
    ("accuracy", 'test_accuracy'),
]

# Search for the best K among the odd values 1, 3, ..., 19,
# scoring each classifier with 10-fold cross-validation.
for k in range(1, 20, 2):
    knn_model = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    cv_scores = cross_validate(knn_model, X, y, cv=10, scoring=metrics,
                               return_train_score=True, n_jobs=-1)
    knn_results.append(cv_scores)

    print("k = " + str(k) + ":")
    for label, score_key in knn_report_rows:
        print("\t" + label + ":")
        fold_scores = cv_scores.get(score_key)
        # The mean is shown as a percentage; the std is left as a raw fraction.
        print("\t\tmean = {mean:.2f}%;\n\t\tstd = {std:.2f};"
              .format(mean=fold_scores.mean()*100,
                      std=fold_scores.std()))

# Author's note on the outcome: the best weighted F1 was obtained with k = 5.

k = 1:
	f1:
		mean = 78.85%;
		std = 0.04;
	ROC AUC OneVsRest:
		mean = 89.12%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 89.23%;
		std = 0.02;
	balanced accuracy:
		mean = 80.42%;
		std = 0.04;
	accuracy:
		mean = 80.05%;
		std = 0.04;
k = 3:
	f1:
		mean = 76.93%;
		std = 0.06;
	ROC AUC OneVsRest:
		mean = 94.29%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 94.40%;
		std = 0.02;
	balanced accuracy:
		mean = 79.10%;
		std = 0.06;
	accuracy:
		mean = 78.32%;
		std = 0.06;
k = 5:
	f1:
		mean = 79.29%;
		std = 0.05;
	ROC AUC OneVsRest:
		mean = 95.43%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 95.52%;
		std = 0.02;
	balanced accuracy:
		mean = 81.32%;
		std = 0.05;
	accuracy:
		mean = 80.81%;
		std = 0.04;
k = 7:
	f1:
		mean = 75.52%;
		std = 0.06;
	ROC AUC OneVsRest:
		mean = 96.66%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 96.69%;
		std = 0.02;
	balanced accuracy:
		mean = 78.06%;
		std = 0.05;
	accuracy:
		mean = 77.78%;
		std = 0.05;
k = 9:
	f1:
		mean = 75.14%;
		std = 0.06;
	ROC AUC 

## KNN report

In [4]:
# NOTE: this whole cell is disabled (commented out) and produces no output.
# It sketches two ways of printing a per-class classification_report for a KNN
# model: a single stratified 70/30 hold-out split, and a 10-fold stratified loop.
# Before re-enabling the second variant, StratifiedKFold must be imported from
# sklearn.model_selection; the import cell at the top does not import it.
# The two variants also use different n_neighbors values (15 and 7).

# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.7, random_state=42)
# knn = KNeighborsClassifier(n_neighbors=15, n_jobs=-1)
# knn.fit(X_train, y_train)
# y_pred = knn.predict(X_test)
# report = classification_report(y_test, y_pred)
# print(report)

# skf = StratifiedKFold(n_splits=10)
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     knn = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)
#     knn.fit(X_train, y_train)
#     y_pred = knn.predict(X_test)
#     report = classification_report(y_test, y_pred)
#     print(report)

## Decision Tree

In [5]:
# One cross_validate result dict per candidate depth, in the order the depths are tried.
tree_results = []

# (label printed in the report, key of the matching test scores in the CV result)
tree_report_rows = [
    ("f1", 'test_f1_weighted'),
    ("ROC AUC OneVsRest", 'test_roc_auc_ovr_weighted'),
    ("ROC AUC OneVsOne", 'test_roc_auc_ovo_weighted'),
    ("balanced accuracy", 'test_balanced_accuracy'),
    ("accuracy", 'test_accuracy'),
]

# Search for the best tree depth among 1..16,
# scoring each tree with 10-fold cross-validation.
for depth in range(1, 17):
    tree_model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    cv_scores = cross_validate(tree_model, X, y, cv=10, scoring=metrics,
                               return_train_score=True, n_jobs=-1)
    tree_results.append(cv_scores)

    print("Depth = " + str(depth) + ":")
    for label, score_key in tree_report_rows:
        print("\t" + label + ":")
        fold_scores = cv_scores.get(score_key)
        # The mean is shown as a percentage; the std is left as a raw fraction.
        print("\t\tmean = {mean:.2f}%;\n\t\tstd = {std:.2f};"
              .format(mean=fold_scores.mean()*100,
                      std=fold_scores.std()))

# Author's note on the outcome: the best depth was 7.

Depth = 1:
	f1:
		mean = 5.31%;
		std = 0.01;
	ROC AUC OneVsRest:
		mean = 69.26%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 69.19%;
		std = 0.02;
	balanced accuracy:
		mean = 16.39%;
		std = 0.01;
	accuracy:
		mean = 15.40%;
		std = 0.01;
Depth = 2:
	f1:
		mean = 19.86%;
		std = 0.02;
	ROC AUC OneVsRest:
		mean = 79.02%;
		std = 0.02;
	ROC AUC OneVsOne:
		mean = 79.12%;
		std = 0.03;
	balanced accuracy:
		mean = 31.74%;
		std = 0.02;
	accuracy:
		mean = 30.56%;
		std = 0.02;
Depth = 3:
	f1:
		mean = 32.21%;
		std = 0.05;
	ROC AUC OneVsRest:
		mean = 86.18%;
		std = 0.04;
	ROC AUC OneVsOne:
		mean = 86.24%;
		std = 0.04;
	balanced accuracy:
		mean = 41.81%;
		std = 0.05;
	accuracy:
		mean = 40.67%;
		std = 0.05;
Depth = 4:
	f1:
		mean = 46.30%;
		std = 0.05;
	ROC AUC OneVsRest:
		mean = 88.01%;
		std = 0.04;
	ROC AUC OneVsOne:
		mean = 87.99%;
		std = 0.04;
	balanced accuracy:
		mean = 53.06%;
		std = 0.07;
	accuracy:
		mean = 52.54%;
		std = 0.06;
Depth = 5:
	f1:
		mean = 62.69%;
		st

## Decision Tree Report

In [6]:
# NOTE: this whole cell is disabled (commented out) and produces no output.
# It sketches two ways of printing a per-class classification_report for a
# decision tree with max_depth=10: a single stratified 70/30 hold-out split,
# and a 10-fold stratified loop.
# Before re-enabling the second variant, StratifiedKFold must be imported from
# sklearn.model_selection; the import cell at the top does not import it.

# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.7, random_state=42)
# dtree = DecisionTreeClassifier(random_state=42, max_depth=10)
# dtree.fit(X_train, y_train)
# y_pred = dtree.predict(X_test)
# report = classification_report(y_test, y_pred)
# print(report)

# skf = StratifiedKFold(n_splits=10)
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     dtree = DecisionTreeClassifier(random_state=42, max_depth=10)
#     dtree.fit(X_train, y_train)
#     y_pred = dtree.predict(X_test)
#     report = classification_report(y_test, y_pred)
#     print(report)