In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, model_selection
from math import log, pi, sqrt, exp
import warnings
warnings.filterwarnings('ignore')

In [2]:
def NormZscore(data):
    mean = np.mean(data, axis = 0)
    data_int = data - mean
    sdt = np.std(data, axis = 0)
    data_norm = data_int / sdt

    return data_norm, mean, sdt

def DesnZscore(data,m,s):
    return data * s + m

def kfolds(dataset, k):
    shuf = np.random.permutation(dataset)
    n = shuf.shape[0]
    fold_size = n // k

    folds = []

    for i in range(k-1):
        folds.append(shuf[i*fold_size:(i+1)*fold_size,:])

    folds.append(shuf[(k-1)*fold_size:,:])

    return folds

def distance_euclidian(x1, x2):
    return sqrt(np.sum([abs(i - j) for i, j in zip(x1,x2)]))

def distance_mahalanobis(x1, x2, covar):
    inv_covar = np.linalg.inv(covar)

    return sqrt(np.sum((x1 - x2).T @ inv_covar @ (x1 - x2)))

def covar_matrix(x):
    mean = np.mean(x, axis=0)
    x_mean = x - mean

    covar = (np.transpose(x_mean) @ x_mean) / x.shape[0]

    return covar

def Train_Test(fold):
    traintest = []

    for i in range(len(fold)):
        test = fold[i]
        train = []
        for f in range(len(fold)):
            if i != f:
                train.extend(fold[f])

        x_train = np.array(train)[:, 0:21]
        y_train = np.array(train)[:, -1]

        x_test = np.array(test)[:, 0:21]
        y_test = np.array(test)[:, -1]

        traintest.append((x_train, y_train, x_test, y_test))

    return traintest

def metrics(y, y_pred):

    y = y.astype('bool')
    y_pred = y_pred.astype('bool')

    TP = sum(y & y_pred) 
    TN = sum(~y & ~y_pred)
    FP = sum(~y & y_pred)
    FN = sum(y & ~y_pred)

    precision = TP / (TP + FP)
    accuracy = (TP + TN)/(TP + FP + TN + FN)
    recall = TP /(TP+FN)
    f1 = 2*(precision * recall)/(precision + recall)
    cm = np.array([[ TP, TN ], [ FP, FN ]])

    if np.isnan(precision):
        precision = 0
    if np.isnan(accuracy):
        accuracy = 0
    if np.isnan(recall):
        recall = 0
    if np.isnan(f1):
        f1 = 0

    return precision, accuracy, recall, f1

In [3]:
kc_or = np.genfromtxt('kc2.csv', delimiter=',')
k = [1, 5]
X = kc_or[:,0:21]
y = kc_or[:,21]

x, x_mean, x_sdt = NormZscore(X)

kc = np.c_[x, y]

covar = covar_matrix(x)
folds = kfolds(kc, 10)
traintest = Train_Test(folds)

# KNN {K=1 ; K=5}

In [4]:
def near_neighbors_euc(train, test, k):
    dist = np.array([])

    for r_train in train:
        dist = np.append(dist, distance_euclidian(r_train, test))
        
    idx_sorted = dist.argsort()[:k]
     
    return idx_sorted

def near_neighbors_mah(train, test, k, covar):
    dist = np.array([])

    for r_train in train:
        dist = np.append(dist, distance_mahalanobis(r_train, test, covar))
        
    idx_sorted = dist.argsort()[:k]
     
    return idx_sorted

def pred_knn_euc(train, test, k):
    neighb = near_neighbors_euc(train, test, k)
    top_neigh = [x[-1] for x in neighb]
    pred_out = max(set(top_neigh), key=top_neigh.count)

    return pred_out

def pred_knn_mah(train, test, k, covar):
    neighb = near_neighbors_mah(train, test, k, covar)
    top_neigh = [x[-1] for x in neighb]
    pred_out = max(set(top_neigh), key=top_neigh.count)

    return pred_out

In [5]:
metric_all_pred_euc = []
metric_all_pred_mah = []

for t in range(len(traintest)):
    metric_n_pred_euc = []
    metric_n_pred_mah = []
    
    for i in k:
        y_pred_euc = []
        y_pred_mah = []
    
        for r_test in traintest[t][2]:
            #Eucl
            neighb_euc = near_neighbors_euc(traintest[t][0], r_test, i)
            top_n_y_euc = list(y[neighb_euc])
            pred_out_euc = max(set(top_n_y_euc), key=top_n_y_euc.count)
            y_pred_euc.append(pred_out_euc)

            #=========================================================================

            #Mahal
            neighb_mah = near_neighbors_mah(traintest[t][0], r_test, i, covar)
            top_n_y_mah = list(y[neighb_mah])
            pred_out_mah = max(set(top_n_y_mah), key=top_n_y_mah.count)
            y_pred_mah.append(pred_out_mah)

        metrics_euc = metrics(traintest[t][3].astype('int'), np.array(y_pred_euc).astype('int'))
        metrics_mah = metrics(traintest[t][3].astype('int'), np.array(y_pred_mah).astype('int'))

        metric_n_pred_euc.append(metrics_euc)
        metric_n_pred_mah.append(metrics_mah)
    
    metric_all_pred_euc.append(metric_n_pred_euc)
    metric_all_pred_mah.append(metric_n_pred_mah)

metric_all_pred_euc = np.array(metric_all_pred_euc)
metric_all_pred_mah = np.array(metric_all_pred_mah)

meanknn_euc = np.mean(metric_all_pred_euc, axis = 0)
stdknn_euc = np.std(metric_all_pred_euc, axis = 0)

meanknn_mah = np.mean(metric_all_pred_mah, axis = 0)
stdknn_mah = np.std(metric_all_pred_mah, axis = 0)

print("\033[1m" , "\nEuclidian Distance: ", "\033[0m")

for i in range(len(k)):
    print("=" * 50)
    print("KNN k = ", k[i])
    print("\nPrecision: \t", meanknn_euc[i,0], "\nAccuracy: \t", meanknn_euc[i,1], "\nRecall: \t", meanknn_euc[i,2], \
        "\nF1Score: \t", meanknn_euc[i,3],"\nStd: \t\t", stdknn_euc[i], "\n")
    print("=" * 50, "\n")

print("\033[1m" , "\nMahalanobis Distance: ", "\033[0m")

for i in range(len(k)):
    print("=" * 50)
    print("KNN k = ", k[i])
    print("\nPrecision: \t", meanknn_mah[i,0], "\nAccuracy: \t", meanknn_mah[i,1], "\nRecall: \t", meanknn_mah[i,2], \
        "\nF1Score: \t", meanknn_mah[i,3],"\nStd: \t\t", stdknn_mah[i], "\n")
    #print("Confusion Matrix: ","K = ",i, "\n", mainfunctions.confusion_matrix(cm))
    print("=" * 50)

[1m 
Euclidian Distance:  [0m
KNN k =  1

Precision: 	 0.17797619047619045 
Accuracy: 	 0.72008547008547 
Recall: 	 0.09986633249791144 
F1Score: 	 0.12202332450158535 
Std: 		 [0.13506659 0.08088108 0.0786339  0.09428156] 


KNN k =  5

Precision: 	 0.1 
Accuracy: 	 0.7889601139601139 
Recall: 	 0.007142857142857143 
F1Score: 	 0.013333333333333332 
Std: 		 [0.3        0.08646034 0.02142857 0.04      ] 


[1m 
Mahalanobis Distance:  [0m
KNN k =  1

Precision: 	 0.23363636363636364 
Accuracy: 	 0.7297720797720798 
Recall: 	 0.15399331662489557 
F1Score: 	 0.17690927188753275 
Std: 		 [0.18441025 0.08982394 0.13841762 0.15693603] 

KNN k =  5

Precision: 	 0.05 
Accuracy: 	 0.7735754985754986 
Recall: 	 0.014285714285714285 
F1Score: 	 0.022222222222222223 
Std: 		 [0.15       0.09384721 0.04285714 0.06666667] 



# Decision Tree {Gini, Entropy}

In [7]:
criteria = ['entropy', 'gini']

for c in range(len(criteria)):
    metrics_criteria = []

    for t in range(len(folds)):
        clf = DecisionTreeClassifier(criterion = criteria[c])
        clf = clf.fit(traintest[t][0],traintest[t][1])

        y_pred = clf.predict(traintest[t][2])
        
        #metrics = metrics(traintest[t][3].astype('int'), np.array(y_pred).astype('int'))
        metrics_criteria.append(metrics(traintest[t][3].astype('int'), np.array(y_pred).astype('int')))

    mean = np.mean(metrics_criteria, axis = 0)
    std = np.std(metrics_criteria, axis = 0)

    print("\033[1m" , "\nCriteria: ", criteria[c],"\033[0m")
    print("=" * 50)
    print("\nPrecision: \t", mean[0], "\nAccuracy: \t", mean[1], "\nRecall: \t", mean[2], \
        "\nF1Score: \t", mean[3],"\nStd: \t\t", std, "\n")
    #print("Confusion Matrix: ","K = ",i, "\n", mainfunctions.confusion_matrix(cm))
    print("=" * 50, "\n")

[1m 
Criteria:  entropy [0m

Precision: 	 0.45672438672438675 
Accuracy: 	 0.7833333333333333 
Recall: 	 0.43237259816207185 
F1Score: 	 0.43369290755351386 
Std: 		 [0.17212243 0.06926637 0.11925921 0.1347292 ] 


[1m 
Criteria:  gini [0m

Precision: 	 0.5058730158730158 
Accuracy: 	 0.7948717948717949 
Recall: 	 0.5236675020885548 
F1Score: 	 0.4955913921967622 
Std: 		 [0.16344176 0.06969219 0.14034861 0.11011208] 


