In [1]:
# import necessary libraries 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools
import random
import math
from operator import itemgetter

In [2]:
test = np.genfromtxt('DS2_test.csv', delimiter = ',')
valid = np.genfromtxt('DS2_valid.csv', delimiter = ',')
train = np.genfromtxt('DS2_train.csv', delimiter = ',')
print(np.shape(test))
print(np.shape(valid))
print(np.shape(train))

(800, 21)
(800, 21)
(2400, 21)


In [3]:

def countsPropMu(train, classes):
    c = 0;
    mu = 0;
    
    for row in train:
        if row[-1] == classes:
            c += 1
            mu += row[:-1]
    mu /= c
    p = c/len(train)
    
    return c, p, mu




#Solve for formula \Sigma = \sum_{i=1}^2 s_i *p_i
#where p_i = \frac{N_i}{N}
#and s_i = N_i^{-1}\sum\{(x -\mu_i)(x-\mu_i)^T\}
def get_sigma(train, label, c, μ):
    x = 0
    sigma = 0
    for row in train:
        if row[-1] == label:
            x = np.reshape((np.array(row[:-1])-μ),(20,1))    
           
            sigma += x@x.T
    sigma /= c

    return sigma

def get_Cov(p0, p1, s0, s1):
    Sigma = p0*s0 + p1*s1
    return Sigma



#Now plug in the values obtained from our data

c0, p0, mu0 = countsPropMu(train, 0)
c1, p1, mu1 = countsPropMu(train, 1)

Sigma = get_Cov(p0, p1, get_sigma(train, 0, c0, mu0), get_sigma(train, 1, c1, mu1))

#We will get our respective coefficient vector and bias \beta and \beta_0 
#for the linear predictor \beta^t x  + \beta_0
#\beta = Sigma^{-1} (mu0-mu1)
#\beta_0 = \frac{-1}{2}mu0^T Sigma^{-1} mu0 + \frac{1}{2} mu_1^T Sigma^{-1}mu1 + ln(\frac{P(C0)}{P(C1)})
#Where P(C0) and P(C1) are modeled by the proportions we found
def get_Betas(Sigma, mu0, mu1):
    SigmaInv = np.linalg.inv(Sigma)
    Beta = SigmaInv@(mu1-mu0)
    Beta0 = -0.5*(mu1.T @ SigmaInv @ mu1) + 0.5*(mu0.T @ SigmaInv @ mu0)+ np.log(p1/p0)
    return Beta, Beta0

beta, beta0 = get_Betas(Sigma, mu0, mu1)
print(beta)
print(beta0)

[ 0.03536196 -0.01068875 -0.00038642 -0.00916522 -0.06555391 -0.01634108
  0.04971012  0.02155161 -0.04725721 -0.01966905 -0.02844503 -0.06206904
 -0.00700645 -0.04527776  0.07074852 -0.00875106  0.05179928 -0.04166657
  0.05683159 -0.00696062]
0.0932302347357511


In [4]:
#Define a sigmoid map

sigmoid = lambda x: 1/(1+np.exp(-x)) 

#We use sigmoid(0) = 1/2 as our decision boundary

def predClass(row, beta, beta0):
    x = np.array(row[:-1])
    xb = beta.T@x + beta0
    
    obProb = np.array(sigmoid(xb))
    
    predClass = 0
    
    #if the classifier identified it correctly as 1 sigmoid (1) > 1/2
    if obProb >= 0.5:
        predClass = 1;
    return predClass
results = []
tp = 0
tn = 0 
fp = 0
fn = 0
print(np.shape(test))
for i in range(800):
    tClass = test[i][-1]
    row = test[i][:21]
    pClass = predClass(row, beta, beta0)
    if pClass == 1 and tClass == 1 :
        tp += 1
    elif pClass == 1 and tClass == 0 :
        fp += 1
    elif pClass == 0 and tClass == 0 :
        tn += 1
    else :
        fn += 1
    
    

(800, 21)


In [5]:
#Define L2 Norm
def L2Norm(vec1, vec2):
   
    return math.sqrt((vec1-vec2)@(vec1-vec2))

L2Norm(np.array([1,2]), np.array([0,0]))

2.23606797749979

In [14]:
#Getting nearest K neighbors : iteratively defined
def class_map(x,k): #Implementation of K-NN
    nn = []
    for i,row in enumerate(train):
        point = np.array(row[:-1])
        dif = x-point
        dist = L2Norm(dif, 0) # l2
        
        if len(nn)<k:
            nn.append([dist,i]) #save the L2norm and the index of the point
        else:
            nn.sort(key=lambda x: x[0]) #sort our nearest neighbours based on dist
            if dist<nn[k-1][0]: #if we are closer than the farthest neighbour we replace it with the current val
                del nn[-1]
                nn.append([dist,i])
       
    mean = 0
    for n in nn:
        i = n[1] # row in train data
        val = train[i][-1] #grab 0,1 val of row
        mean += val
    mean /= k
    

    if mean<.5:
        return 0
    else:
        return 1
    
    

In [8]:
def evaluate(tp, tn, fp, fn):
   
    accuracy = float(tp+tn)/float(tp+fp+fn+tn)
    precision = float(tp)/float(tp+fp)
    recall = float(tp)/float(tp+fn)
    f1_measure = (2*precision*recall)/(precision+recall)
    return accuracy, precision, recall, f1_measure
#Acc, precision, recall, f1_measure = evaluate(tp, tn, fp, fn)

In [9]:
# calculate evaluation measures for GDA
accuracy, precision, recall, f1_measure = evaluate(tp, tn, fp, fn)
print("The model on test set had following measures:")
print("Accuracy would be ", accuracy)
print("Precision would be ", precision)
print("Recall would be", recall)
print("F-measure would be", f1_measure)

The model on training set had following measures:
Accuracy would be  0.51125
Precision would be  0.5115089514066496
Recall would be 0.5
F-measure would be 0.5056890012642226


In [15]:
# test accuracy, precision, recall, and F-measure to get the ideal k
tp = 0
tn = 0
fp = 0
fn = 0
total = 0
k=2

for k in range(1,10):
    for row in valid:
        total +=1
        x = np.array(row[:-1])
        true_val = row[-1]

        decision = class_map(x,k)

        if decision == 0:
            if true_val == 0: #specificity
                tn+=1
            else: #Error Type II
                fn+=1
        else:
            if true_val == 0: #Error Type I
                fp+=1
            else: #sensitivity
                tp+=1



    acc = (tn+tp)/total
    prec = tp/(tp+fp) 
    rec = tp/(tp+fn)
    f = 2*prec*rec/(prec+rec)

    print(f"k: {k}")
    print(f"Accuracy would be: {acc}")
    print(f"Precision obtained: {prec}")
    print(f"Recall is: {rec}")
    print(f"F Measure observed: {f}")
    print('\n')
 

k: 1
Accuracy would be: 0.4775
Precision obtained: 0.4763157894736842
Recall is: 0.4525
F Measure observed: 0.4641025641025641


k: 2
Accuracy would be: 0.4825
Precision obtained: 0.48541666666666666
Recall is: 0.5825
F Measure observed: 0.5295454545454545


k: 3
Accuracy would be: 0.4975
Precision obtained: 0.49774774774774777
Recall is: 0.5525
F Measure observed: 0.523696682464455


k: 4
Accuracy would be: 0.5034375
Precision obtained: 0.5029302077783697
Recall is: 0.59
F Measure observed: 0.5429968363531781


k: 5
Accuracy would be: 0.50675
Precision obtained: 0.5059708093763822
Recall is: 0.572
F Measure observed: 0.5369631541891575


k: 6
Accuracy would be: 0.508125
Precision obtained: 0.5070524412296564
Recall is: 0.5841666666666666
F Measure observed: 0.5428848015488866


k: 7
Accuracy would be: 0.5076785714285714
Precision obtained: 0.5068319034000636
Recall is: 0.5696428571428571
F Measure observed: 0.5364049100386749


k: 8
Accuracy would be: 0.50859375
Precision obtained: 0.

In [17]:
# test accuracy, precision, recall, and F-measure to get the ideal k
#The valid set suggests using 2 might be a good choice
tp = 0
tn = 0
fp = 0
fn = 0
total = 0
k=2


for row in test:
    total +=1
    x = np.array(row[:-1])
    true_val = row[-1]

    decision = class_map(x,k)

    if decision == 0:
        if true_val == 0: #specificity
            tn+=1
        else: #Error Type II
            fn+=1
    else:
        if true_val == 0: #Error Type I
            fp+=1
        else: #sensitivity
            tp+=1



acc = (tn+tp)/total
prec = tp/(tp+fp) 
rec = tp/(tp+fn)
f = 2*prec*rec/(prec+rec)

print(f"k: {k}")
print(f"Accuracy would be: {acc}")
print(f"Precision obtained: {prec}")
print(f"Recall is: {rec}")
print(f"F Measure observed: {f}")
print('\n')

k: 2
Accuracy would be: 0.525
Precision obtained: 0.5165562913907285
Recall is: 0.78
F Measure observed: 0.6215139442231076


