In [102]:
import re
from numpy import zeros, dot
from numpy.linalg import norm

In [103]:
# Processing text from a file

with open('sentences.txt', 'r') as file_obj:
    lines = sum(1 for _ in file_obj)
    file_obj.seek(0)
    
    words = {}
    lines_count, words_count = 0, 0
    for line in file_obj:
        p = re.compile(r"[^a-z]+")
        tokens = p.split(line.lower())
        
        for token in tokens:
            if token not in words:
                words[token] = {
                    "index": words_count,
                    "occurrences": [0] * lines
                }
                words_count += 1
            elif words[token]["occurrences"][lines_count] != 0:
                        continue

            words[token]["occurrences"][lines_count] = tokens.count(token)    
        lines_count += 1

In [104]:
# Array of words and repetitions

arr = zeros((lines, len(words)))
for word in words:
    i, j = 0, words[word]["index"]
    for occ in words[word]["occurrences"]:
        arr[i, j] = occ
        i += 1

In [105]:
# Find the cosine distance from the sentence in the very first line to 
#all the others using the scipy.spatial.distance.cosine function

def cosine_distance(u, v):
    return 1.0 - (dot(u, v) / (norm(u) * norm(v)))

In [107]:
dist = [] 
u = arr[0,] 
for i in range(1, lines):
    v = arr[i,]
    dist.append({"index": i, "distance": cosine_distance(u, v)})    

dist.sort(key=lambda x: x["distance"])
print("The 1 closest sentence is a sentence %d with a cosine distance of %.2f.\n"\
"The 2 closest sentence is a sentence %d with a cosine distance of %.2f." % (
    dist[0]["index"],
    dist[0]["distance"],
    dist[1]["index"],
    dist[1]["distance"]
))

The 1 closest sentence is a sentence 6 with a cosine distance of 0.70.
The 2 closest sentence is a sentence 4 with a cosine distance of 0.74.
