In [1]:
# import necessary libraries 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools
import random
import math

We consider classification problems in which a discriminant function (or any other map) $y$ assigns each input $\mathbf{x}$ in the input space $C(\mathbf{X})$ to a class $y(\mathbf{x}) \in \{C_1, \dots, C_K\}$.

Evaluation metrics for a classification problem, computed with respect to a fixed class $C_k$:

 $P_{\text{true}}:=|\{\mathbf{x} \in C(\mathbf{X}): y(\mathbf{x}) = C_k \text{ and } \mathbf{x} \in C_k \}| $
  
 $P_{\text{false}}:=|\{\mathbf{x} \in C(\mathbf{X}): y(\mathbf{x}) = C_k \text{ and } \mathbf{x} \notin C_k \}| $
 
 $N_{\text{true}}:=|\{\mathbf{x} \in C(\mathbf{X}): y(\mathbf{x}) \neq C_k \text{ and } \mathbf{x} \notin C_k \}| $
  
 $N_{\text{false}}:=|\{\mathbf{x} \in C(\mathbf{X}): y(\mathbf{x}) \neq C_k \text{ and } \mathbf{x} \in C_k \}| $
 
 
 and we have total counts $\text{TOTAL} : = P_{\text{true}} +P_{\text{false}} + N_{\text{true}} + N_{\text{false}} $
 
 Accuracy $Acc := \frac{P_{\text{true}} + N_{\text{true}} }{\text{TOTAL}}$
 
 Precision $Pcs := \frac{P_{\text{true}}}{P_{\text{true}}+P_{\text{false}} }$
 
 Recall $Rec := \frac{P_{\text{true}}}{P_{\text{true}}+N_{\text{false}} }$
 
 $F_1$ measure $F_1 := \frac{2 \cdot Pcs \cdot Rec}{Pcs+Rec}$


In [2]:
# Read the three DS1 splits from comma-delimited text files.
test = np.genfromtxt("DS1_test.csv", delimiter=",")
valid = np.genfromtxt("DS1_valid.csv", delimiter=",")
train = np.genfromtxt("DS1_train.csv", delimiter=",")

# Show the (rows, columns) shape of each split, one per line.
print(test.shape, valid.shape, train.shape, sep="\n")

(800, 21)
(800, 21)
(2400, 21)


In [17]:


def countsPropMu(train, classes):
    """Return the count, proportion and feature mean of one class.

    Parameters
    ----------
    train : array-like, 2-D
        Data matrix whose last column holds the class label; every other
        column is treated as a feature.
    classes : scalar
        Label value whose rows are selected (compared with ``==``).

    Returns
    -------
    c : int
        Number of rows whose label equals ``classes``.
    p : float
        ``c`` divided by the total number of rows.
    mu : numpy.ndarray
        Column-wise mean of the feature columns over the selected rows.

    Raises
    ------
    ValueError
        If ``train`` is not a non-empty 2-D array, or if no row carries the
        requested label (the mean would otherwise be a division by zero).
    """
    data = np.asarray(train, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("train must be a non-empty 2-D array of rows")

    # Boolean mask of the rows that belong to the requested class.
    in_class = data[:, -1] == classes
    c = int(np.count_nonzero(in_class))
    if c == 0:
        raise ValueError(f"no rows in train are labelled {classes!r}")

    mu = data[in_class, :-1].mean(axis=0)
    p = c / len(data)

    return c, p, mu




#Solve for formula \Sigma = \sum_i p_i * s_i  (the weighted sum itself is done in get_Cov)
#where p_i = \frac{N_i}{N}
#and s_i = N_i^{-1}\sum\{(x -\mu_i)(x-\mu_i)^T\}
def get_sigma(train, label, c, μ):
    """Return the scatter matrix s_i of one class, divided by ``c``.

    Parameters
    ----------
    train : array-like, 2-D
        Data matrix whose last column holds the class label.
    label : scalar
        Label value whose rows contribute to the sum.
    c : number
        Divisor applied to the summed outer products; the notebook passes the
        number of rows carrying ``label``.
    μ : array-like, 1-D
        Vector subtracted from every selected feature row (the class mean).

    Returns
    -------
    numpy.ndarray of shape (n_features, n_features)
        ``sum((x - μ)(x - μ)^T) / c`` over the rows labelled ``label``.

    Raises
    ------
    ValueError
        If ``c`` is zero.
    """
    if c == 0:
        raise ValueError("c must be non-zero to normalise the scatter matrix")

    data = np.asarray(train, dtype=float)
    mean_vec = np.asarray(μ, dtype=float).ravel()

    # Centre the selected rows; the feature count is taken from the data
    # instead of being fixed at 20.
    centered = data[data[:, -1] == label, :-1] - mean_vec

    # centered.T @ centered equals the sum of the per-row outer products.
    return centered.T @ centered / c

def get_Cov(p0, p1, s0, s1):
    """Return the weighted sum ``p0*s0 + p1*s1`` of two scatter terms.

    ``p0``/``p1`` are the weights and ``s0``/``s1`` the quantities being
    combined; the notebook passes class proportions and per-class matrices.
    """
    first_term = p0 * s0
    second_term = p1 * s1
    return first_term + second_term



# Estimate count, proportion and mean for each of the two classes.
c0, p0, mu0 = countsPropMu(train, 0)
c1, p1, mu1 = countsPropMu(train, 1)

# Combine the two per-class matrices, weighted by the class proportions.
Sigma = get_Cov(
    p0,
    p1,
    get_sigma(train, 0, c0, mu0),
    get_sigma(train, 1, c1, mu1),
)

#We will get our respective coefficient vector and bias \beta and \beta_0
#for the linear predictor \beta^T x + \beta_0 (positive values favour class 1)
#\beta = Sigma^{-1} (mu1 - mu0)
#\beta_0 = -\frac{1}{2} mu1^T Sigma^{-1} mu1 + \frac{1}{2} mu0^T Sigma^{-1} mu0 + ln(\frac{P(C1)}{P(C0)})
#Where P(C0) and P(C1) are modeled by the proportions we found
def get_Betas(Sigma, mu0, mu1, prior0=None, prior1=None):
    """Return the weight vector and bias of the linear discriminant.

    Parameters
    ----------
    Sigma : numpy.ndarray, square
        Shared covariance matrix; it is inverted with ``np.linalg.inv``.
    mu0, mu1 : numpy.ndarray
        Mean vectors of class 0 and class 1.
    prior0, prior1 : float, optional
        Prior probabilities of class 0 and class 1. When omitted, the
        notebook-level variables ``p0`` and ``p1`` are used, which is what
        this function always did before the parameters existed.

    Returns
    -------
    Beta : numpy.ndarray
        ``Sigma^{-1} (mu1 - mu0)``.
    Beta0 : float
        ``-0.5 mu1^T Sigma^{-1} mu1 + 0.5 mu0^T Sigma^{-1} mu0
        + ln(prior1 / prior0)``.
    """
    # Legacy fallback: read the priors from the enclosing notebook namespace.
    if prior0 is None:
        prior0 = p0
    if prior1 is None:
        prior1 = p1

    SigmaInv = np.linalg.inv(Sigma)
    Beta = SigmaInv @ (mu1 - mu0)
    Beta0 = (
        -0.5 * (mu1.T @ SigmaInv @ mu1)
        + 0.5 * (mu0.T @ SigmaInv @ mu0)
        + np.log(prior1 / prior0)
    )
    return Beta, Beta0

# Compute the discriminant weights and bias, then print each on its own line.
beta, beta0 = get_Betas(Sigma, mu0, mu1)
print(beta, beta0, sep="\n")


[-14.85992178   8.72562473   5.68227111   3.27898269   9.92186112
   4.555445   -17.12291339  24.46050271  29.71995544  -9.3559337
  13.30979447  12.47838008 -15.75210565 -13.07731511   5.72648095
 -13.23577877 -30.10077125   6.80408844   0.65591131   5.05253155]
-27.81483255516115


In [34]:
# Logistic sigmoid map
def sigmoid(x):
    """Return ``1 / (1 + exp(-x))``, evaluated with ``np.exp``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Decision boundary: sigmoid(0) = 1/2, so a value of at least 1/2 means class 1.

def predClass(row, beta, beta0):
    """Predict the class (0 or 1) for one data row.

    Parameters
    ----------
    row : sequence
        One data row; its last entry is dropped and the rest are the features.
    beta : numpy.ndarray
        Weight vector of the linear predictor.
    beta0 : float
        Bias of the linear predictor.

    Returns
    -------
    int
        1 when ``sigmoid(beta^T x + beta0) >= 0.5``, otherwise 0.
    """
    features = np.array(row[:-1])
    linear_score = beta.T @ features + beta0

    class1_prob = np.array(sigmoid(linear_score))

    # The threshold is applied to the sigmoid output, not to the raw score.
    return 1 if class1_prob >= 0.5 else 0
results = []
# Confusion-matrix tallies, treating label 1 as the positive class.
tp = 0
tn = 0
fp = 0
fn = 0
print(np.shape(test))
# Iterate over the rows of `test` directly instead of a fixed range(800) and
# a [:21] slice, so the tally covers the whole set whatever its size.
for row in test:
    tClass = row[-1]  # true label is stored in the last column
    pClass = predClass(row, beta, beta0)
    if pClass == 1 and tClass == 1:
        tp += 1
    elif pClass == 1 and tClass == 0:
        fp += 1
    elif pClass == 0 and tClass == 0:
        tn += 1
    else:
        # Every remaining combination is counted as a false negative.
        fn += 1
    
    

(800, 21)


In [35]:
def evaluate(tp, tn, fp, fn):
    """Compute accuracy, precision, recall and F1 from confusion counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        True-positive, true-negative, false-positive and false-negative
        counts.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1_measure)``. A metric whose
        denominator is zero (no samples, no predicted positives, no actual
        positives, or precision + recall == 0) is returned as 0.0 rather
        than raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 by convention.
        return float(numerator) / float(denominator) if denominator else 0.0

    accuracy = ratio(tp + tn, tp + fp + fn + tn)
    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1_measure = ratio(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1_measure

# Turn the confusion counts into metrics and print one line per metric.
accuracy, precision, recall, f1_measure = evaluate(tp, tn, fp, fn)
print("LDA returned me the result:")
for _caption, _score in (
    ("Accuracy would be ", accuracy),
    ("Precision would be ", precision),
    ("Recall would be", recall),
    ("F-measure would be", f1_measure),
):
    print(_caption, _score)

LDA returned me the result:
Accuracy would be  0.9575
Precision would be  0.9463414634146341
Recall would be 0.97
F-measure would be 0.9580246913580247
