# Find the closest document using Term Freq-Inverse Document Freq

In [3]:
from __future__ import division
import nltk
import numpy as np
from math import log10, sqrt
from string import punctuation
from sklearn.metrics import jaccard_similarity_score

In [43]:
#list of existing documents
list_of_original_files = ["data/1.txt","data/2.txt", "data/3.txt"]
#the input document to search 
inputFile = "data/input.txt"

for file in list_of_original_files:
    f = open(file)
    print("File", file, ":")
    print(f.read())
    f.close()

f = open(inputFile)
print("\nSearch file", inputFile, ":")
print(f.read())
f.close()

File data/1.txt :
hello guys, please do your task seriously

File data/2.txt :
number one is so difficult. I couldn' t hold it anymore

File data/3.txt :
math assignment really makes me sleepy. please wake me up now


Search file data/input.txt :
history assignment makes me so sleepy. please wake me up later 



### Get a list of unique words (terms) from exisiting documents 

In [23]:
def get_unique_words_from_doc(file):
    # Extract unique words (unigram, bigram, trigram) from the MASTER DOCUMENT
    with open(file, 'r') as f:
        all_text = f.read().replace('\n', ' ')
    # Replace single quote (" ' ") into single white space
    allText = all_text.replace("'", " ")

    # Get a set of unique words, removing all punctuation
    return set(allText.translate(str.maketrans('', '', punctuation)).lower().split())

uniqueWords = []
for file in list_of_original_files:
        uniqueWords += get_unique_words_from_doc(file)
        
print(uniqueWords)

['your', 'seriously', 'task', 'do', 'please', 'hello', 'guys', 'is', 'one', 'i', 'couldn', 'so', 'anymore', 'hold', 'number', 't', 'it', 'difficult', 'really', 'up', 'me', 'wake', 'now', 'assignment', 'sleepy', 'please', 'math', 'makes']


### Remove all stop words from the unique words set

In [24]:
stopwords = nltk.corpus.stopwords.words('english')
def get_unique_word_without_stopwords(uniqueWords):    
    noStopWords = []
    for uniqueWord in uniqueWords:
        if (not uniqueWord in stopwords):
            noStopWords.append(uniqueWord)
    return noStopWords
uniqueWordList = get_unique_word_without_stopwords(uniqueWords)

print(uniqueWordList)

['seriously', 'task', 'please', 'hello', 'guys', 'one', 'anymore', 'hold', 'number', 'difficult', 'really', 'wake', 'assignment', 'sleepy', 'please', 'math', 'makes']


### compute IDF of unique words in the original files

In [25]:
def get_df_of_words(uniqueWordList, list_of_original_files):
    listofdocs = []                       
    for file in list_of_original_files:
        listofdocs.append(get_unique_words_from_doc(file))
    dfDict = {}
    for uniqueWord in uniqueWordList:
        counter = 0
        for doc in listofdocs:
            if (uniqueWord in doc):
                counter += 1
        dfDict[uniqueWord] = counter
    return dfDict

def get_idf_of_words(uniqueWordList, list_of_original_files):
    docsize = len(list_of_original_files)
    idfDict = get_df_of_words(uniqueWordList, list_of_original_files)
    for word in uniqueWordList:
        if idfDict[word] == 0:
            idfDict[word] = 1
        else:
            idfDict[word] = 1 + (log10(docsize / idfDict[word]))
    return idfDict

idfUniqueWordList = get_idf_of_words(uniqueWordList, list_of_original_files)

print(idfUniqueWordList)

{'seriously': 1.4771212547196624, 'task': 1.4771212547196624, 'please': 1.4066802324977494, 'hello': 1.4771212547196624, 'guys': 1.4771212547196624, 'one': 1.4771212547196624, 'anymore': 1.4771212547196624, 'hold': 1.4771212547196624, 'number': 1.4771212547196624, 'difficult': 1.4771212547196624, 'really': 1.4771212547196624, 'wake': 1.4771212547196624, 'assignment': 1.4771212547196624, 'sleepy': 1.4771212547196624, 'math': 1.4771212547196624, 'makes': 1.4771212547196624}


### compute TF of unique words in the original files

In [26]:
def get_tf_of_word(listofwords, word):
    return listofwords.count(word)/len(listofwords)

def get_tf_idf_for_file(file, uniqueWordList):
    listofwords = list(get_unique_words_from_doc(file))
    idfDict = get_idf_of_words(uniqueWordList, list_of_original_files)
    tfIdfDict = {}
    for word in uniqueWordList:
        tfIdfDict[word] = get_tf_of_word(listofwords, word) * idfDict[word]
    return tfIdfDict

tfIdfDict = {}
for file in list_of_original_files:
        tfIdfDict[file] = get_tf_idf_for_file(file, uniqueWordList)
        
inputFileTfIdf = get_tf_idf_for_file(inputFile, uniqueWordList)
print(tfIdfDict)
print(inputFileTfIdf)

{'data/1.txt': {'seriously': 0.2110173221028089, 'task': 0.2110173221028089, 'please': 0.20095431892824991, 'hello': 0.2110173221028089, 'guys': 0.2110173221028089, 'one': 0.0, 'anymore': 0.0, 'hold': 0.0, 'number': 0.0, 'difficult': 0.0, 'really': 0.0, 'wake': 0.0, 'assignment': 0.0, 'sleepy': 0.0, 'math': 0.0, 'makes': 0.0}, 'data/2.txt': {'seriously': 0.0, 'task': 0.0, 'please': 0.0, 'hello': 0.0, 'guys': 0.0, 'one': 0.1342837504290602, 'anymore': 0.1342837504290602, 'hold': 0.1342837504290602, 'number': 0.1342837504290602, 'difficult': 0.1342837504290602, 'really': 0.0, 'wake': 0.0, 'assignment': 0.0, 'sleepy': 0.0, 'math': 0.0, 'makes': 0.0}, 'data/3.txt': {'seriously': 0.0, 'task': 0.0, 'please': 0.14066802324977495, 'hello': 0.0, 'guys': 0.0, 'one': 0.0, 'anymore': 0.0, 'hold': 0.0, 'number': 0.0, 'difficult': 0.0, 'really': 0.14771212547196624, 'wake': 0.14771212547196624, 'assignment': 0.14771212547196624, 'sleepy': 0.14771212547196624, 'math': 0.14771212547196624, 'makes': 0.

### get closest document using euclid distance

In [33]:
def euclid_distance(idf1, idf2, uniqueWordList):
    sum = 0
    for word in uniqueWordList:
        sum += (idf1[word] - idf2[word])*(idf1[word] - idf2[word])
    return sqrt(sum)

distanceList = [(f,euclid_distance(inputFileTfIdf, tfIdfDict[f], uniqueWordList)) for f in list_of_original_files]
print(distanceList)

#get the minimal from the tuple list, using second element of tuple as key
matchTuple = min(distanceList, key = lambda x: x[1])
print("\nHere's the result:\nDocument ", matchTuple[0], " is closest to ", inputFile)

[('data/1.txt', 0.5221662607338883), ('data/2.txt', 0.46584450380170056), ('data/3.txt', 0.20889649116941097)]

Here's the result:
Document  data/3.txt  is closest to  data/input.txt
