In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot  as plt

from ProbKNN_functions import *

In [6]:
# Load the bankruptcy dataset: column 0 is used as the label, every other
# column as a feature.
data = pd.read_csv('Bankrupt/data.csv')
X_all = data.iloc[:, 1:]
y_all = data.iloc[:, 0]

# 80/20 train/test split with a fixed seed so the metrics below are reproducible.
# (An earlier manual variant simply took the first 80% of the rows for training.)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X_all, y_all, test_size=0.20, random_state=42
)

In [8]:
# probKnn baseline on the full feature set: 4th argument 5, 5th argument equal
# to the number of feature columns.
# NOTE(review): presumably these are k and the number of features to use --
# confirm against ProbKNN_functions.
n_feature_cols = Xtrain.shape[1]
ypred = probKnn(Xtrain, Ytrain, Xtest, 5, n_feature_cols)
getPerformanceMetrics(Ytest, ypred)

Accuracy: 0.9633431085043989
Specificity: (TNR): 0.9893373952779894
Sensitivity: (TPR): 0.29411764705882354
Precision: (PPV): 0.5172413793103449


In [7]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes baseline, trained and evaluated on the same split.
mnb = MultinomialNB()
mnb.fit(Xtrain, Ytrain)
y_pred = mnb.predict(Xtest)
getPerformanceMetrics(Ytest, y_pred)

Accuracy: 0.7543988269794721
Specificity: (TNR): 0.7677075399847677
Sensitivity: (TPR): 0.4117647058823529
Precision: (PPV): 0.06441717791411043


In [9]:
from sklearn.neighbors import KNeighborsClassifier

# scikit-learn's 5-nearest-neighbour classifier as a reference point.
neigh = KNeighborsClassifier(n_neighbors=5).fit(Xtrain, Ytrain)
pred = neigh.predict(Xtest)
getPerformanceMetrics(Ytest, pred)

Accuracy: 0.9618768328445748
Specificity: (TNR): 0.9984767707539984
Sensitivity: (TPR): 0.0196078431372549
Precision: (PPV): 0.3333333333333333


In [None]:
# Sweep the neighbourhood size k passed to probKNN_bylabel and record the
# test-set true-positive and true-negative rates for each k.
k_values = list(range(1, 50, 5))  # k = 1, 6, ..., 46
tpr_krange = np.zeros((len(k_values), 1))
tnr_krange = np.zeros((len(k_values), 1))
for i, k in enumerate(k_values):
    ypred = probKNN_bylabel(Xtrain, Ytrain, Xtest, k)
    tn, fp, fn, tp = confusion_matrix(Ytest, ypred).ravel()
    tpr_krange[i] = tp / (tp + fn)
    tnr_krange[i] = tn / (tn + fp)

# The x-axis must show the k values that were actually evaluated, not the
# positional index 1..10 of each measurement.
kmax = k_values[-1]
plt.figure()
plt.title("KNN Classifier")
plt.plot(k_values, tpr_krange[:, 0], linestyle="-", color='green', linewidth=4, marker=None, label="tpr")
plt.plot(k_values, tnr_krange[:, 0], linestyle="-", color='skyblue', linewidth=4, marker=None, label="tnr")
plt.xlabel("k")
plt.legend()
plt.xlim((0, kmax + 1))
plt.show()

In [None]:
# Sweep the last argument d of probKNN_bylabel (4th argument fixed at 5) and
# record the test-set true-positive and true-negative rates for each d.
# NOTE(review): d is presumably the number of features used -- confirm
# against ProbKNN_functions.
d_values = range(1, 93, 4)
tpr_drange = np.zeros((len(d_values), 1))
tnr_drange = np.zeros((len(d_values), 1))
for i, d in enumerate(d_values):
    ypred = probKNN_bylabel(Xtrain, Ytrain, Xtest, 5, d)
    tn, fp, fn, tp = confusion_matrix(Ytest, ypred).ravel()
    tpr_drange[i] = tp / (tp + fn)
    tnr_drange[i] = tn / (tn + fp)


In [None]:
# Plot the TPR/TNR curves recorded in tpr_drange / tnr_drange.
dmax = 93
d_axis = range(1, 93, 4)  # must match the d values used to fill the arrays

fig, ax = plt.subplots()
ax.set_title("KNN Classifier")
for rates, colour, name in ((tpr_drange, 'green', "tpr"), (tnr_drange, 'skyblue', "tnr")):
    ax.plot(d_axis, rates[:, 0], linestyle="-", color=colour, linewidth=4, marker=None, label=name)
ax.set_xlabel("# of Features")
ax.legend()
ax.set_xlim((0, dmax))
plt.show()

In [None]:
# Brute-force KNN on all features, for comparison with the models above.
# (Author's note: restricting to d = 2 features showed even worse results.)

from numpy import linalg as LA

k = 5
kmax = 50

X_train = Xtrain.to_numpy()
X_test = Xtest.to_numpy()

# Squared Euclidean distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
# D has shape (ntrain, ntest); broadcasting the two norm vectors avoids
# materialising the repeated copies.
colnormtrain = (LA.norm(X_train, axis=1) ** 2).reshape((-1, 1))
colnormtest = (LA.norm(X_test, axis=1) ** 2).reshape((-1, 1))
D = colnormtest.T - 2 * np.matmul(X_train, X_test.T) + colnormtrain

# Row r of `ind` holds, per test sample (column), the index of its r-th
# closest training sample.
ind = np.argsort(D, axis=0)

# Training labels ordered nearest-first for every test sample.
Y_allnn = Ytrain.to_numpy()[ind]

# Majority vote over exactly the k nearest neighbours. The mode array is
# flattened explicitly because, depending on the SciPy version, stats.mode
# either keeps or drops the reduced axis.
votes, _ = stats.mode(Y_allnn[:k, :], axis=0)
ypred = np.asarray(votes).ravel()
getPerformanceMetrics(Ytest, ypred)

# True-positive rate as a function of the number of neighbours (1..kmax).
tpr_krange = np.zeros((kmax, 1))
for n_neighbors in range(1, kmax + 1):
    votes, _ = stats.mode(Y_allnn[:n_neighbors, :], axis=0)
    ypred = np.asarray(votes).ravel()
    tn, fp, fn, tp = confusion_matrix(Ytest, ypred).ravel()
    tpr_krange[n_neighbors - 1] = tp / (tp + fn)

plt.figure()
plt.title("KNN Classifier")
plt.plot(range(1, kmax + 1), tpr_krange, linestyle="-", marker=None)
plt.xlabel("k")
plt.xlim((0, kmax + 1))
plt.ylabel("TPR")
plt.show()

In [None]:
# Brute-force KNN with k = 5 while varying d, the number of leading feature
# columns used to compute distances.

from numpy import linalg as LA

k = 5
d_values = range(1, 93, 4)
tpr_drange = np.zeros((len(d_values), 1))
tnr_drange = np.zeros((len(d_values), 1))

# Loop-invariant conversions, done once instead of on every iteration.
X_train = Xtrain.to_numpy()
X_test = Xtest.to_numpy()
y_train = Ytrain.to_numpy()

for i, d in enumerate(d_values):
    train_d = X_train[:, :d]
    test_d = X_test[:, :d]

    # Squared Euclidean distances, shape (ntrain, ntest):
    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2
    colnormtrain = (LA.norm(train_d, axis=1) ** 2).reshape((-1, 1))
    colnormtest = (LA.norm(test_d, axis=1) ** 2).reshape((-1, 1))
    D = colnormtest.T - 2 * np.matmul(train_d, test_d.T) + colnormtrain

    # Training labels ordered nearest-first for every test sample (column).
    ind = np.argsort(D, axis=0)
    Y_allnn = y_train[ind]

    # Majority vote over exactly the k nearest neighbours; flatten because the
    # shape of the mode array differs between SciPy versions.
    votes, _ = stats.mode(Y_allnn[:k, :], axis=0)
    ypred = np.asarray(votes).ravel()

    tn, fp, fn, tp = confusion_matrix(Ytest, ypred).ravel()
    tpr_drange[i] = tp / (tp + fn)
    tnr_drange[i] = tn / (tn + fp)


In [None]:
# Plot the TPR/TNR curves of the brute-force KNN feature sweep.
kmax = 93  # upper limit of the x-axis
d_axis = range(1, 93, 4)  # must match the d values used to fill the arrays

fig, ax = plt.subplots()
ax.set_title("KNN Classifier")
for rates, colour, name in ((tpr_drange, 'green', "tpr"), (tnr_drange, 'skyblue', "tnr")):
    ax.plot(d_axis, rates, linestyle="-", color=colour, linewidth=4, marker=None, label=name)
ax.set_xlabel("# of Features")
ax.legend()
ax.set_xlim((0, kmax))
plt.show()