In [1]:
# importing modules
import numpy as np
import numpy.linalg as LA
import math
from sklearn.naive_bayes import GaussianNB

In [2]:
# read the data file
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_train.data")
X = np.zeros((800, 100000))
row = 0
for line in f:
    for token in line.split():
        idx = int(token)
        X[row, idx-1] = 1
#         print(idx)
    row += 1
print('Done')
f.close()
X.shape

Done


(800, 100000)

In [3]:
# read the class labels associated with the training data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_train.labels")
labels = np.zeros(800)
i = 0
for line in f:
    digit = float(line.strip())
    labels[i] = digit
    i += 1
print(labels.shape)
f.close()

(800,)


In [4]:
# segregate data into +1 and -1 classes
k = 1000
Y = X[:, 0:k]
print('Y Shape', Y.shape)
bool_idx = (labels > 0)
positive_class = Y[bool_idx, :]
num_positive = positive_class.shape[0]
print('Number of positive samples', num_positive)

bool_idx = (labels < 0)
negative_class = Y[bool_idx, :]
num_negative = negative_class.shape[0]
print('Number of negative samples', num_negative)

print('Shape of positive class', positive_class.shape)
print('Shape of negative class', negative_class.shape)

positive_mean = positive_class.mean(0)
negative_mean = negative_class.mean(0)

print('Shape of positive mean', positive_mean.shape)
print('Shape of negative mean', negative_mean.shape)

At = positive_class - positive_mean
print('Shape of At positive', At.shape)
A = At.T
S_p = A.dot(At)
print('Shape of S_p', S_p.shape)


At = negative_class - negative_mean
print('Shape of At negative', At.shape)
A = At.T
S_n = A.dot(At)
print('Shape of S_n', S_n.shape)

S_w = S_p + S_n
print('Shape of S_w',S_w.shape)

S_w += S_w + np.identity(k)
S_w.shape

S_wi = np.linalg.inv(S_w)
print('Shape of inverse', S_wi.shape)

mean_dif = positive_mean- negative_mean
w = S_wi.dot(mean_dif)
print('Shape of w', w.shape)



Y Shape (800, 1000)
Number of positive samples 78
Number of negative samples 722
Shape of positive class (78, 1000)
Shape of negative class (722, 1000)
Shape of positive mean (1000,)
Shape of negative mean (1000,)
Shape of At positive (78, 1000)
Shape of S_p (1000, 1000)
Shape of At negative (722, 1000)
Shape of S_n (1000, 1000)
Shape of S_w (1000, 1000)
Shape of inverse (1000, 1000)
Shape of w (1000,)


In [5]:
reduced_X = w.T.dot(Y.T)
print('Reduced x shape', reduced_X.shape)

Reduced x shape (800,)


In [6]:
reduced_mean = np.mean(reduced_X)
reduced_variance = np.var(reduced_X)

In [7]:
def log_multivariate(xi, mean = reduced_mean, variance = reduced_variance):
    b1 = -1.0*(xi - mean)*(xi - mean)/(2*variance)
    a1 = -0.5*np.log(2 * np.pi*variance) 
    final = a1 + b1
    return final

In [8]:
# calculating prior probabilty values
prior_positive = num_positive/800
prior_negative = num_negative/800
log_prior_positive = np.log(prior_positive)
log_prior_negative = np.log(prior_negative)

In [9]:
def positive_discriminant(x):
    final = log_multivariate(x) + log_prior_positive
    return final

def negative_discriminant(x):
    final = log_multivariate(x) + log_prior_negative
    return final

In [10]:
# loading validating data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_valid.data")
V = np.zeros((350, 100000))
row = 0
for line in f:
    for token in line.split():
        idx = int(token)
        V[row, idx-1] = 1
#         print(idx)
    row += 1
print('Done')
f.close()
V = V[:,0:k]
V = V.T
V.shape

Done


(1000, 350)

In [11]:
# read the class labels associated with the training data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_valid.labels")
v_labels = np.zeros(350)
i = 0
for line in f:
    digit = float(line.strip())
    v_labels[i] = digit
    i += 1
print(v_labels.shape)
f.close()


(350,)


In [12]:
# reduce dimension via LDA of the test data
V_projected = w.T.dot(V)


print('Shape of V', V.shape)
print('Shape of V_projected', V_projected.shape)

Shape of V (1000, 350)
Shape of V_projected (350,)


In [13]:
# validating 
correct = 0
for i in range(350):
    x = V_projected[i]
    if positive_discriminant(x) >= negative_discriminant(x):
        if v_labels[i] > 0.0:
            correct += 1
    else:
        if v_labels[i] < 0.0:
            correct += 1
print('Correct', correct)
print('Percentage Correct: ', (correct/350*100))


Correct 316
Percentage Correct:  90.28571428571428


### Observation

LDA worked better than PCA, which gave a max correctness of 89%
Also the Sw was not invertible and to make it, we had to add an I matrix to it.