In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In our data set, we have 15 documents. We assume that the documents have already been preprocessed, converted into word vectors (bags of words), and inserted into an index. After preprocessing and removing "stop words" we are left with 10 index terms (used as dimensions for the document vectors).

In [None]:
# header=None: the first line of the file is read as data, not as column
# names, so pandas labels the columns 0, 1, 2, ...
DF = pd.read_csv('../data/term-doc-mat.csv', header=None)
DF

Let's remove the column containing the terms, keeping only the counts.

In [None]:

# Keep every column from position 1 onward; column 0 holds the term labels.
TD = DF.iloc[:,1:]
TD


In [None]:
# Column 0 of the raw frame holds the term labels, one per row.
terms = DF.iloc[:,0]
terms

Transposing the TD matrix.

In [None]:
DT = TD.T

Now we have a document-term matrix:

In [None]:
DT

In [None]:
DT.shape

In [None]:
len(DT.iloc[1])

In [None]:
# DT is the transposed term-document matrix (rows = documents, columns =
# terms), so its shape unpacks directly into (document count, term count).
NDocs, numTerms = DT.shape

In [None]:
print (numTerms)
print (NDocs)

#### Next, let's compute term frequencies to get an idea of their distributions across the corpus.

In [None]:
# axis=1 sums across the columns of TD, giving one total per row (i.e. per term).
termFreqs = TD.sum(axis=1)
print (termFreqs)

In [None]:
# Map each term label to its total frequency across the corpus.
dictTF = {terms[i]: termFreqs[i] for i in range(numTerms)}
print(sorted(dictTF.items()))

# The frequencies on their own, largest first.
sortedTF = sorted(dictTF.values(), reverse=True)
print(sortedTF)

In [None]:
# Rank-frequency curve: sortedTF is in descending order, so the position on
# the x-axis is the term's frequency rank (0 = most frequent).
plt.plot(sortedTF)
plt.xlabel("Term rank (most frequent first)")
plt.ylabel("Total frequency across documents")
plt.title("Term frequency distribution")
plt.show()

#### We convert the dataframe into a Numpy array which will be used as input for our search function.

In [None]:
# np.array copies the values into a plain NumPy array, dropping the row and
# column labels. NOTE: DT is rebound from a DataFrame to an ndarray here, so
# every cell below this one sees the array, not the frame.
DT = np.array(DT)
DT

#### The search function takes a query object (in this case a vector of word frequencies) and searches for the K most similar (least distant) items in the data (our index of documents). The "measure" parameter selects the distance metric: 0 for Euclidean distance, 1 for cosine distance (1 minus the cosine similarity). The function returns the indices of the K nearest neighbors and a list of their distances to the query object.

In [None]:
def knn_search(x, D, K, measure):
    """Find the K nearest neighbours of query point x among the rows of D.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        Query vector.
    D : array-like, shape (n_samples, n_features)
        Data matrix; each row is one candidate point.
    K : int
        Number of neighbours to return. If K exceeds the number of rows,
        every row is returned.
    measure : int
        0 for Euclidean distance, 1 for cosine distance
        (1 - cosine similarity).

    Returns
    -------
    idx : numpy.ndarray
        Row indices of the K nearest neighbours, closest first.
    dists : list
        Distances of those neighbours to x, in the same order as idx.

    Raises
    ------
    ValueError
        If measure is neither 0 nor 1.
    """
    D = np.asarray(D, dtype=float)
    x = np.asarray(x, dtype=float)

    if measure == 0:
        # Euclidean distance from x to every row of D.
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        denom = D_norm * x_norm
        # A zero-length vector has no direction, so its cosine similarity is
        # undefined. Treat it as similarity 0 (distance 1) rather than letting
        # 0/0 produce NaN, which would corrupt the ordering below.
        sims = np.zeros(len(D))
        valid = denom > 0
        sims[valid] = np.dot(D, x)[valid] / denom[valid]
        dists = 1 - sims
    else:
        raise ValueError(
            "measure must be 0 (Euclidean) or 1 (cosine), got %r" % (measure,)
        )

    # Sort once and read the distances through the same index order, so the
    # two return values are guaranteed to line up element by element.
    idx = np.argsort(dists)[:K]
    return idx, list(dists[idx])

#### Let's now try this on a new query object

In [None]:
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x

In [None]:
# Finding the k=5 nearest neighbors using cosine distance (1 - cosine similarity, measure=1)
neigh_idx, distances = knn_search(x, DT, 5, 1)

In [None]:
neigh_idx

In [None]:
distances

In [None]:
# NOTE(review): this looks like output pasted into a code cell. No bare
# `array` name is imported in the visible cells (only `np`), so the call is
# qualified to keep Restart & Run All from failing with a NameError.
np.array([11, 10, 12, 14, 13])

In [None]:
DT[neigh_idx]

In [None]:
# Finding the k=5 nearest neighbors using the Euclidean distance metric (measure=0).
# Note: this rebinds neigh_idx and distances, replacing the results of the
# earlier cosine search.
neigh_idx, distances = knn_search(x, DT, 5, 0)

In [None]:
neigh_idx

In [None]:
distances

In [None]:
DT[neigh_idx]