In [2]:
# importing modules
import numpy as np
import numpy.linalg as LA
import math
from sklearn.naive_bayes import GaussianNB


In [3]:
# read the data file
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_train.data")
X = np.zeros((800, 100000))
row = 0
for line in f:
    for token in line.split():
        idx = int(token)
        X[row, idx-1] = 1
#         print(idx)
    row += 1
print('Done')
f.close()


Done


In [4]:
# read the class labels associated with the training data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_train.labels")
labels = np.zeros(800)
i = 0
for line in f:
    digit = float(line.strip())
    labels[i] = digit
    i += 1
print(labels.shape)
f.close()

(800,)


In [73]:
print(X[1,306])

1.0


In [74]:
# calculate mean vector
m = X.mean(0)
print(m)
m.shape

[ 0.0125   0.       0.00125 ...,  0.0125   0.00625  0.035  ]


(100000,)

In [75]:
At = X - m

In [76]:
A = At.T
A.shape
 

(100000, 800)

In [77]:
K = At.dot(A)

In [78]:
eigen_values, eigen_vectors = LA.eig(K)
K.shape

(800, 800)

In [1]:
print('Eigen Values shape', eigen_values.shape)
print('Eigen vectors shape', eigen_vectors.shape)

NameError: name 'eigen_values' is not defined

In [80]:
# Contains indices for largest to smallest eigen values
idx = eigen_values.argsort()[::-1]
# idx.dtype
# eigen_values[idx]

In [81]:
# for K = 500
k = 500
ek = eigen_vectors[:, idx[0:k]]

for i in range(k):
    val = 1 / math.pow(eigen_values[idx[i]], 0.5)
    ek[:, i] = ek[:, i] * val
    
ek.shape 
# e500 = eigen_vectors[:, 300:]
# e500.shape

(800, 100)

In [82]:
# calculating final eigen vector for k = 100
final_eigen = A.dot(ek)
final_eigen.shape

# final500 = A.dot(e500)
# final500.shape

(100000, 100)

In [83]:
# project to lower dimensional space using the above obtained eigen vectors - PCA

A_projected = final_eigen.T.dot(A)
A_projected.shape


(100, 800)

In [84]:
# segregate data into +1 and -1 classes
bool_idx = (labels > 0)
positive_class = A_projected[:, bool_idx]
num_positive = positive_class.shape[1]
print('Number of positive samples', num_positive)

bool_idx = (labels < 0)
negative_class = A_projected[:, bool_idx]
num_negative = negative_class.shape[1]
print('Number of negative samples', num_negative)

print('Shape of positive class', positive_class.shape)
print('Shape of negative class', negative_class.shape)

positive_cov = np.ones(k)
negative_cov = np.ones(k)

for i in range(k):
    positive_cov[i] = np.var(positive_class[i,:])
    negative_cov[i] = np.var(negative_class[i,:])

print('Shape of positive covariance matrix', positive_cov.shape)
print('Shape of negative covariance matrix', negative_cov.shape)

positive_mean = positive_class.T.mean(0)
negative_mean = negative_class.T.mean(0)

print('Shape of positive mean', positive_mean.shape)
print('Shape of negative mean', negative_mean.shape)

Number of positive samples 78
Number of negative samples 722
Shape of positive class (100, 78)
Shape of negative class (100, 722)
Shape of positive covariance matrix (100,)
Shape of negative covariance matrix (100,)
Shape of positive mean (100,)
Shape of negative mean (100,)


In [85]:
'''
Gives the log of the multivariate gaussian, to be used in discriminant function
'''
def log_multivariate(xi, mean, variance):
    b1 = -1.0*(xi - mean)*(xi - mean)/(2*variance)
    a1 = -0.5*np.log(2 * np.pi*variance) 
    final = a1 + b1
    return final

In [86]:
# x = positive_class.T[0,:]
# f = log_multivariate(x, positive_mean, positive_cov)
# print(f)

# calculating prior probabilty values
prior_positive = num_positive/800
prior_negative = num_negative/800
log_prior_positive = np.log(prior_positive)
log_prior_negative = np.log(prior_negative)

In [87]:
def positive_discriminant(x):
    final = log_prior_positive
    for i in range(k):
        final += log_multivariate(x[i], positive_mean[i], positive_cov[i])
    return final

def negative_discriminant(x):
    final = log_prior_negative
    for i in range(k):
        final += log_multivariate(x[i], negative_mean[i], negative_cov[i])
    return final

In [88]:
# loading validating data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_valid.data")
V = np.zeros((350, 100000))
row = 0
for line in f:
    for token in line.split():
        idx = int(token)
        V[row, idx-1] = 1
#         print(idx)
    row += 1
print('Done')
f.close()
V = V.T
V.shape

Done


(100000, 350)

In [89]:
# read the class labels associated with the training data
f = open("/home/tandon/IIIT-H/3rd/SMAI/dorothea/dorothea_valid.labels")
v_labels = np.zeros(350)
i = 0
for line in f:
    digit = float(line.strip())
    v_labels[i] = digit
    i += 1
print(v_labels.shape)
f.close()


(350,)


In [90]:
# reduce dimension via PCA of the test data
V_projected = final_eigen.T.dot(V)

print('Shape of final_eigen', final_eigen.shape)
print('Shape of V', V.shape)
print('Shape of V_projected', V_projected.shape)

Shape of final_eigen (100000, 100)
Shape of V (100000, 350)
Shape of V_projected (100, 350)


In [None]:
# validating 
correct = 0
for i in range(350):
    x = V_projected.T[i,:]
    if positive_discriminant(x) >= negative_discriminant(x):
        if v_labels[i] > 0.0:
            correct += 1
    else:
        if v_labels[i] < 0.0:
            correct += 1
print('Correct', correct)
print('Percentage Correct: ', (correct/350*100))


    

Correct 314
Percentage Correct:  89.71428571428571


In [None]:
clf = GaussianNB()
clf.fit(A_projected.T, labels)
print(clf.score(V_projected.T, v_labels))


0.897142857143


### For k = 100
Percentage Accuracy : 89.71428571428571

### For k = 500
Percentage Accuracy : 84.4569

### For k = 1000
Percentage Accuracy : 10.4569896

### Observations

As we see, for K = 100, the accuracy comes out to be maximum. There can be various reasons for this
* Data getting overfitted
* Improper features getting incorporated due to increase in K
* The approximate number of directions at which data gets properly represented is 100
