In [101]:
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from numpy.linalg import inv
import math
#Question 1
#generate 2 classes with 20 features each
#same covariance matrix
#generate 2000 samples for each class
#divide into train and test (70:30)
#save DS1

def data_generation(v, pos, size, data):
    if v==0:
        c=np.genfromtxt('hwk2_datasets_corrected/DS1_Cov.txt', delimiter=',')
        if pos==False:
            m=np.genfromtxt('hwk2_datasets_corrected/DS1_m_0.txt', delimiter=',')
        else:
            m=np.genfromtxt('hwk2_datasets_corrected/DS1_m_1.txt', delimiter=',')
    else:
        c=np.genfromtxt('hwk2_datasets_corrected/DS2_Cov' + str(v) + '.txt', delimiter=',')
        if pos==False:
            m=np.genfromtxt('hwk2_datasets_corrected/DS2_c1_m'+ str(v)+ '.txt', delimiter=',')
        else:
            m=np.genfromtxt('hwk2_datasets_corrected/DS2_c2_m'+ str(v)+ '.txt', delimiter=',')
    cov=c[:, :-1]
    mean=m[:-1]
    for i in range(0, size-1):
        sample=random.multivariate_normal(mean, cov)
        if pos==True:
            data.append((sample, 'pos'))
        else:
            data.append((sample, 'neg'))
    return data

def datasets(n):
    if n==1:
        pos_data= data_generation(0, True, 2001, [])
        total_data=data_generation(0, False, 2001, pos_data)
        DS1  = open('DS1.txt', 'w')
        for element in total_data:
            DS1.write(str(element) + '\n')
    else:
        pos_data_m1=data_generation(1, True, int((2000)*0.1)+1, [])
        neg_data_m1=data_generation(1, False, int((2000)*0.1)+1, pos_data_m1)
        pos_data_m2=data_generation(2, True, int((2000)*0.42)+1, neg_data_m1)
        neg_data_m2=data_generation(2, False, int((2000)*0.42)+1, pos_data_m2)
        pos_data_m3=data_generation(3, True, int((2000)*0.48)+1, neg_data_m2)
        total_data=data_generation(3, False, int((2000)*0.48)+1, pos_data_m3)
        DS2  = open('DS2.txt', 'w')
        for element in total_data:
            DS2.write(str(element) + '\n')
    return total_data

def train_test(dataset):
    random.shuffle(dataset)
    train = dataset[:int((len(dataset)+1)*.70)] #Remaining 70% to training set
    test = dataset[int(len(dataset)*.70+1):] #Splits 30% data to test set
    return train, test

#Question 2
#estimate the parameters of the probabilistic LDA model using mle
#report accuracy, precision, recall, f-measure, coefficients learned

#LDA - calculate the probabilities of both classes and choose whichever is higher

def priors_means(data):
    positive=0.0
    negative=0.0
    dimension=len(data[0][0])
    zero=[]
    positive_mean=np.pad(zero, (0, dimension), 'constant')
    negative_mean=np.pad(zero, (0, dimension), 'constant')
    for element in data:
        if element[1]=='pos':
            positive+=1
            positive_mean=[i+j for i,j in zip(positive_mean, element[0])]
        else:
            negative+=1
            negative_mean=[i+j for i,j in zip(negative_mean,element[0])]
            
    for i in range(0, len(positive_mean)):
        positive_mean[i]=positive_mean[i]/positive
    for i in range(0, len(negative_mean)):
        negative_mean[i]=negative_mean[i]/negative
    return positive/len(data), negative/len(data), np.asarray(positive_mean), np.asarray(negative_mean)


def covariance_matrix(data, pos_mean, neg_mean):
    dimension=len(data[0][0])
    pos_sum=np.zeros((20, 20), dtype=float)
    neg_sum=np.zeros((20, 20), dtype=float)
    output=[]
    for element in data:
        if element[1]=='pos':
            m=np.subtract(element[0], pos_mean)
            matrix=m.reshape(-1,1)
            matrix_transposed=m.reshape(1,-1)
            multiplied=np.matmul(matrix, matrix_transposed)
            for i in range (0, len(multiplied)):
                for j in range (0, len(multiplied[0])):
                    multiplied[i][j]=multiplied[i][j]/len(data)
            pos_sum=np.add(pos_sum, multiplied)
        else:
            m=np.subtract(element[0], neg_mean)
            matrix=m.reshape(-1,1)
            matrix_transposed=m.reshape(1,-1)
            multiplied=np.matmul(matrix, matrix_transposed)
            for i in range (0, len(multiplied)):
                for j in range (0, len(multiplied[0])):
                    multiplied[i][j]=multiplied[i][j]/len(data)
            neg_sum=np.add(neg_sum, multiplied)
    return np.add(pos_sum, neg_sum)

def calculate_decision(prior, mean, cov_matrix, x):
    log_prior=math.log(prior)
    mean_transposed=mean.reshape(1,-1)
    mean_reshaped=mean.reshape(-1,1)
    cov_inv=inv(cov_matrix)
    term=(0.5)*np.matmul(np.matmul(mean_transposed, cov_inv), mean_reshaped)
    x_transposed=x.reshape(1,-1)
    last_term=np.matmul(np.matmul(x_transposed, cov_inv), mean_reshaped )
    return log_prior-term+last_term

def predict(pos_prior, neg_prior, pos_mean, neg_mean, cov_matrix, x):
    pos_prediction=calculate_decision(pos_prior, pos_mean, cov_matrix, x)
    neg_prediction=calculate_decision(neg_prior, neg_mean, cov_matrix, x)
    if pos_prediction > neg_prediction:
        return 'pos'
    else:
        return 'neg'
    
def LDA(train, test):
    pos_prior, neg_prior, posmean, negmean=priors_means(train)
    cov_matrix=covariance_matrix(train, posmean, negmean)
    prediction=[]
    for element in test:
        prediction.append((element[0], predict(pos_prior, neg_prior, posmean, negmean, cov_matrix, element[0])))
    print "The coefficients for the LDA calculation are as follows: \n"
    print "The positive prior is " + str(pos_prior) + "\n"
    print "The negative prior is " + str(neg_prior) + "\n"
    print "The positive mean is " + str(posmean) + "\n"
    print "The negative mean is " + str(negmean) + "\n"
    print "The covariance matrix is " + str(cov_matrix) + "\n"
    return prediction

def accuracy(predicted, actual):
    correct=0.0
    for i in range (0, len(predicted)):
        if predicted[i][1]==actual[i][1]:
            correct+=1
    return correct/len(predicted)

def precision(predicted, actual):
    true_positives=0.0
    false_positives=0.0
    for i in range (0, len(predicted)):
        if predicted[i][1]=='pos':
            if actual[i][1]=='pos':
                true_positives+=1
            else:
                false_positives+=1
    return true_positives/(true_positives+false_positives)

def recall(predicted, actual):
    true_positives=0.0
    false_negatives=0.0
    for i in range (0, len(predicted)):
        if actual[i][1]=='pos':
            if predicted[i][1]=='pos':
                true_positives+=1
            else:
                false_negatives+=1
    return true_positives/(true_positives+false_negatives)

def f1(predicted, actual):
    p=precision(predicted, actual)
    r=recall(predicted, actual)
    return 2*((p*r)/(p+r))
    
#run_tests(dataset)

#find the k nearest training samples to x
#y=majority(y1...yi)
#let the distance metric be the square root of the sum of the square differences between points

def distance(x1, x2):
    sum_of_squares=[math.pow((i-j), 2) for i,j in zip(x1, x2)]
    total_sum=0
    for element in sum_of_squares:
        total_sum+=element
    return math.sqrt(total_sum)

def kNN(train, test, k):
    store=[]
    output=[]
    for element in test:
        neighbors={}
        for neighbor in train:
            d=distance(element[0], neighbor[0])
            neighbors[d]=neighbor
        sorted_keys= sorted(neighbors.iterkeys())
        to_add=[]
        for key in sorted_keys[0:k]:
            to_add.append(neighbors[key])
        pos_count=0
        neg_count=0
        for n in to_add:
            if n[1]=='pos':
                pos_count+=1
            else:
                neg_count+=1
        if pos_count>neg_count:
            store.append((element[0], 'pos'))
        else:
            store.append((element[0], 'neg'))
    return store


def run_tests(data):
    train, test=train_test(data)
    predicted_LDA=LDA(train, test)
    print "The accuracy for LDA is " + str(accuracy(predicted_LDA, test)) + "\n"
    print "The precision for LDA is " + str(precision(predicted_LDA, test))+ "\n"
    print "The recall for LDA is " + str(recall(predicted_LDA, test))+ "\n"
    print "The F1 score for LDA is "+ str(f1(predicted_LDA, test))+ "\n"
    
    for element in [1, 5, 10, 15, 20]:
        predicted_knn=kNN(train, test, element)
        print "The accuracy for kNN with k value of " + str(element)+" is " + str(accuracy(predicted_knn, test))+ "\n"
        print "The precision for kNN with k value of " + str(element)+" is " + str(precision(predicted_knn, test))+ "\n"
        print "The recall for kNN with k value of " + str(element)+" is " + str(recall(predicted_knn, test))+ "\n"
        print "The F1 score for kNN with k value of " + str(element)+" is " + str(f1(predicted_knn, test))+ "\n"

    
def main():
    dataset1=datasets(1)
    print "Here are the results for DS1: \n"
    run_tests(dataset1)
    print "\n"
    print "Here are the results for DS2: \n"
    dataset2=datasets(2)
    run_tests(dataset2)

main()

Here are the results for DS1: 

The coefficients for the LDA calculation are as follows: 

The positive prior is 0.497857142857

The negative prior is 0.502142857143

The positive mean is [ 1.93448969  2.00295145  1.99667886  1.92585977  1.9621975   2.01721409
  1.95728451  2.05619103  2.00455781  2.03788155  1.97682959  2.05835065
  2.03150181  2.00768923  2.00110638  2.07775191  2.03905353  2.04529506
  2.02393877  1.95397961]

The negative mean is [ 1.43655335  1.414106    1.39423215  1.43490323  1.38454747  1.41369849
  1.3447257   1.40111909  1.36014397  1.41086586  1.35411001  1.41832625
  1.48031281  1.41075649  1.37840059  1.36338222  1.40519073  1.4533138
  1.38792136  1.44745185]

The covariance matrix is [[ 7.82242096  5.38053996  5.98343811  5.02949709  5.72034393  6.03307943
   4.5242569   5.33693917  4.84640717  5.0894405   3.82798856  5.11219568
   6.88491429  5.87780683  5.99236862  5.85536278  5.69904181  5.62078796
   5.46032337  5.76479469]
 [ 5.38053996  6.8231228  

The accuracy for kNN with k value of 1 is 0.517097581318

The precision for kNN with k value of 1 is 0.505546751189

The recall for kNN with k value of 1 is 0.544368600683

The F1 score for kNN with k value of 1 is 0.524239934265

The accuracy for kNN with k value of 5 is 0.534612176814

The precision for kNN with k value of 5 is 0.522292993631

The recall for kNN with k value of 5 is 0.559726962457

The F1 score for kNN with k value of 5 is 0.540362438221

The accuracy for kNN with k value of 10 is 0.521267723103

The precision for kNN with k value of 10 is 0.512195121951

The recall for kNN with k value of 10 is 0.430034129693

The F1 score for kNN with k value of 10 is 0.467532467532

The accuracy for kNN with k value of 15 is 0.508757297748

The precision for kNN with k value of 15 is 0.497622820919

The recall for kNN with k value of 15 is 0.535836177474

The F1 score for kNN with k value of 15 is 0.516023007395

The accuracy for kNN with k value of 20 is 0.516263552961

The preci