# How do I compare document similarity using Python?

In [12]:
# Install the warning filter first so it is already active while the
# third-party packages below are imported; placed after the imports (as it
# was originally) it cannot affect anything emitted at import time.
import warnings
warnings.filterwarnings("ignore")

import gensim
import nltk

# Fetch the Punkt tokenizer data that nltk's word_tokenize relies on
# (reported as already up-to-date when it has been downloaded before).
# NOTE(review): newer NLTK releases may need 'punkt_tab' instead -- confirm
# against the installed version.
nltk.download('punkt')

# List gensim's top-level names as a quick check that the import worked.
print(dir(gensim))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/naveenkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['NullHandler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_matutils', 'corpora', 'interfaces', 'logger', 'logging', 'matutils', 'models', 'parsing', 'scripts', 'similarities', 'summarization', 'topic_coherence', 'utils']


# Let's create some sample documents.

In [6]:
# Five short sample sentences; each string is treated as one document.
raw_documents = [
    "I'm taking the show on the road.",
    "My socks are a force multiplier.",
    "I am the barber who cuts everyone's hair who doesn't cut their own.",
    "Legend has it that the mind is a mad monkey.",
    "I make my own fun.",
]
print("Number of documents:", len(raw_documents))

Number of documents: 5


# We will use NLTK to tokenize.

# A document will now be a list of tokens.

In [11]:


from nltk.tokenize import word_tokenize

# Tokenize each raw document with NLTK and lower-case every token, so each
# document becomes a list of lower-cased token strings.
gen_docs = []
for text in raw_documents:
    tokens = word_tokenize(text)
    gen_docs.append([w.lower() for w in tokens])
print(gen_docs)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/naveenkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[['i', "'m", 'taking', 'the', 'show', 'on', 'the', 'road', '.'], ['my', 'socks', 'are', 'a', 'force', 'multiplier', '.'], ['i', 'am', 'the', 'barber', 'who', 'cuts', 'everyone', "'s", 'hair', 'who', 'does', "n't", 'cut', 'their', 'own', '.'], ['legend', 'has', 'it', 'that', 'the', 'mind', 'is', 'a', 'mad', 'monkey', '.'], ['i', 'make', 'my', 'own', 'fun', '.']]


# We will create a dictionary from a list of documents. A dictionary maps every word to a number.

In [None]:
# Build the word <-> integer-id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(gen_docs)

# Look up in both directions: id -> word, then word -> id.
print(dictionary[5])
print(dictionary.token2id['road'])

vocab_size = len(dictionary)
print("Number of words in dictionary:", vocab_size)

# Print the full id -> word table.
for token_id in range(vocab_size):
    print(token_id, dictionary[token_id])

# Now we will create a corpus. 
# A corpus is a list of bags of words. A bag-of-words representation for a document just lists the number of times each word occurs in the document.

In [None]:
# Convert every tokenized document into its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)

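# Now we create a tf-idf model from the corpus. Note that num_nnz is the number of non-zero entries in the corpus, i.e. the count of distinct tokens in each document summed over all documents (the value computed as `s` below).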

In [None]:
# Fit a tf-idf model on the bag-of-words corpus and print its summary.
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# Total number of entries across all bag-of-words vectors, for comparison
# with the figures in the model summary printed above.
s = sum(len(bow) for bow in corpus)
print(s)

# Now we will create a similarity measure object in tf-idf space.

# tf-idf stands for term frequency-inverse document frequency. Term frequency is how often the word shows up in the document, and inverse document frequency scales the value by how rare the word is in the corpus.

In [None]:
# Build the similarity index over the tf-idf-weighted corpus.
# Passing None as the output prefix lets gensim choose a temporary file for
# the index shards, instead of the hard-coded absolute path '/usr/workdir/',
# which need not exist or be writable on the machine running the notebook.
sims = gensim.similarities.Similarity(None, tf_idf[corpus],
                                      num_features=len(dictionary))
print(sims)
print(type(sims))

# Now create a query document and convert it to tf-idf.

In [None]:
# Push the query through the same pipeline as the corpus documents:
# tokenize + lower-case, then bag-of-words, then tf-idf weighting.
query_tokens = word_tokenize("Socks are a force for good.")
query_doc = [token.lower() for token in query_tokens]
query_doc_bow = dictionary.doc2bow(query_doc)
query_doc_tf_idf = tf_idf[query_doc_bow]

# Show each intermediate representation, in pipeline order.
for stage in (query_doc, query_doc_bow, query_doc_tf_idf):
    print(stage)

# We show an array of document similarities to the query, obtained with `sims[query_doc_tf_idf]`. We see that the second document is the most similar, because it shares the words "socks" and "force" with the query.

# Exercise: Make up some sentences and guess which ones are most similar in the corpus. Confirm by computing the similarity.

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/naveenkumar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Source https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python