In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [45]:
def knn(test, train, labels, K, measure):
    """ find K nearest neighbors of an instance x among the instances in D """
    if measure == 0:
        # euclidean distances from the other points
        dists = np.sqrt(((train - test)**2).sum(axis=1))
    elif measure == 1:
        # Cosine similarity
        train_norm = np.array([np.linalg.norm(train[i]) for i in range(len(train))])
        test_norm = np.linalg.norm(test)
        sims = np.dot(train,test)/(train_norm * test_norm)
        dists = 1 - sims
    idx = np.argsort(dists)
    n = idx[:K]
    knn_labels = [labels.iloc[i].all() for i in n]
    p = Counter(knn_labels).most_common(1)[0][0]
    return n, p

In [48]:
def evaluation(train, train_l, test, test_l, k, measure):
    tpos = 0
    tneg = 0
    fpos = 0
    fneg = 0 
    for i in range(0, len(test)):
        n, p = knn(test[i], train, train_l, k, measure)
        if p == test_l.iloc[i].all() and p == 1:
            tpos += 1
        elif p == test_l.iloc[i].all() and p == 0:
            tneg += 1
        elif test_l.iloc[i].all() == 0 and p == 0:
            fpos += 1
        else:
            fneg += 1
            
    overall_accuracy = (tpos + tneg) / (tpos + tneg + fpos + fneg)
    w_acc = tpos / (tpos + fneg)
    h_acc = tneg / (tneg + fpos)
    
    return overall_accuracy, w_acc, h_acc

In [31]:
train = pd.read_csv(r'C:\Users\Milin Desai\Desktop\newsgroups\trainMatrixModified.txt', header=None, delimiter='\t')
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
0,2.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
train_l = pd.read_csv(r'C:\Users\Milin Desai\Desktop\newsgroups\trainClasses.txt', header=None, delimiter='\t')
train_l

Unnamed: 0,0,1
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0
...,...,...
795,795,0
796,796,1
797,797,1
798,798,1


In [33]:
test = pd.read_csv(r'C:\Users\Milin Desai\Desktop\newsgroups\testMatrixModified.txt', header=None, delimiter='\t')
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
test_l = pd.read_csv(r'C:\Users\Milin Desai\Desktop\newsgroups\testClasses.txt', header=None, delimiter='\t')
test_l

Unnamed: 0,0,1
0,0,1
1,1,0
2,2,0
3,3,1
4,4,1
...,...,...
195,195,1
196,196,1
197,197,0
198,198,1


In [35]:
train = train.T

In [36]:
test = test.T

In [37]:
train = np.array(train)
train

array([[2., 2., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
test = np.array(test)
test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [49]:
accuracy = []
for i in range(1,20):
    overall_accuracy, w_acc, h_acc = evaluation(train, train_l, test, test_l, i, 0)
    accuracy.append(overall_accuracy)

In [50]:
accuracy

[0.775,
 0.775,
 0.805,
 0.805,
 0.81,
 0.83,
 0.76,
 0.795,
 0.745,
 0.845,
 0.79,
 0.84,
 0.77,
 0.81,
 0.78,
 0.795,
 0.755,
 0.78,
 0.735]

In [53]:
accuracy_one = []
for i in range(1,20):
    overall_accuracy, w_acc, h_acc = evaluation(train, train_l, test, test_l, i, 1)
    accuracy_one.append(overall_accuracy)

In [54]:
accuracy_one

[0.98,
 0.98,
 0.965,
 0.98,
 0.965,
 0.98,
 0.975,
 0.975,
 0.97,
 0.98,
 0.975,
 0.97,
 0.975,
 0.975,
 0.98,
 0.975,
 0.97,
 0.97,
 0.97]

In [61]:
numTerms_train=train.shape[0]
NDocs_train = train.shape[1]

In [62]:
DF_train = pd.DataFrame([(train!=0).sum(1)]).T
DF_train

Unnamed: 0,0
0,98
1,61
2,43
3,35
4,26
...,...
795,24
796,33
797,74
798,2


In [66]:
NMatrix_train=np.ones(np.shape(train), dtype=float)*NDocs_train
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(NMatrix_train)

[[5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 ...
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]]


In [67]:
IDF_train = np.log2(np.divide(NMatrix_train, np.array(DF_train)))

In [68]:
IDF_train

array([[ 5.81,  5.81,  5.81, ...,  5.81,  5.81,  5.81],
       [ 6.49,  6.49,  6.49, ...,  6.49,  6.49,  6.49],
       [ 7.  ,  7.  ,  7.  , ...,  7.  ,  7.  ,  7.  ],
       ...,
       [ 6.22,  6.22,  6.22, ...,  6.22,  6.22,  6.22],
       [11.43, 11.43, 11.43, ..., 11.43, 11.43, 11.43],
       [ 6.09,  6.09,  6.09, ...,  6.09,  6.09,  6.09]])

In [70]:
TD_tfidf_train = train * IDF_train

In [71]:
pd.set_option("display.precision", 2)

TD_tfidf_train

array([[11.62, 11.62, 11.62, ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ]])

In [72]:
numTerms_test=test.shape[0]
NDocs_test = test.shape[1]

In [73]:
DF_test = pd.DataFrame([(test!=0).sum(1)]).T
DF_test

Unnamed: 0,0
0,117
1,47
2,50
3,243
4,54
...,...
195,33
196,171
197,9
198,59


In [74]:
NMatrix_test=np.ones(np.shape(test), dtype=float)*NDocs_test
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(NMatrix_test)

[[5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 ...
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]
 [5500. 5500. 5500. ... 5500. 5500. 5500.]]


In [75]:
IDF_test = np.log2(np.divide(NMatrix_test, np.array(DF_test)))

In [76]:
IDF_test

array([[5.55, 5.55, 5.55, ..., 5.55, 5.55, 5.55],
       [6.87, 6.87, 6.87, ..., 6.87, 6.87, 6.87],
       [6.78, 6.78, 6.78, ..., 6.78, 6.78, 6.78],
       ...,
       [9.26, 9.26, 9.26, ..., 9.26, 9.26, 9.26],
       [6.54, 6.54, 6.54, ..., 6.54, 6.54, 6.54],
       [7.97, 7.97, 7.97, ..., 7.97, 7.97, 7.97]])

In [77]:
TD_tfidf_test = test * IDF_test
pd.set_option("display.precision", 2)

TD_tfidf_test

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [6.78, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [78]:
accuracy_one = []
for i in range(1,20):
    overall_accuracy, w_acc, h_acc = evaluation(TD_tfidf_train, train_l, TD_tfidf_test, test_l, i, 1)
    accuracy_one.append(overall_accuracy)

In [79]:
accuracy_one

[0.98,
 0.98,
 0.965,
 0.985,
 0.965,
 0.98,
 0.975,
 0.975,
 0.97,
 0.98,
 0.975,
 0.97,
 0.975,
 0.975,
 0.98,
 0.975,
 0.97,
 0.97,
 0.97]