https://dev.to/thepylot/compare-documents-similarity-using-python-nlp-4odp

From Google Colab.

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Word-level tokenization demo: break one sentence into its word and
# punctuation tokens and show the resulting list.
data = "Mars is approximately half the diameter of Earth."
word_tokens = word_tokenize(data)
print(word_tokens)

['Mars', 'is', 'approximately', 'half', 'the', 'diameter', 'of', 'Earth', '.']


In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Sentence-level tokenization demo: split a short paragraph into sentences
# and show the resulting list.
data = "Mars is a cold desert world. It is half the size of Earth. "
sentences = sent_tokenize(data)
print(sentences)

['Mars is a cold desert world.', 'It is half the size of Earth.']


In [None]:
import os

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
import numpy as np

In [None]:
# Load the reference text and split it into sentences; each sentence is
# treated as one "document" of the corpus built in the following cells.
# The encoding is given explicitly so the text is decoded the same way on
# every platform instead of depending on the locale default.
with open('data/demofile.txt', encoding='utf-8') as f:
    file_docs = list(sent_tokenize(f.read()))

print("Number of documents:",len(file_docs))

Number of documents: 3


In [None]:
# One list of lower-cased word tokens per sentence in file_docs.
gen_docs = [
    list(map(str.lower, word_tokenize(sentence)))
    for sentence in file_docs
]
gen_docs

[['mars',
  'is',
  'the',
  'fourth',
  'planet',
  'in',
  'our',
  'solar',
  'system',
  '.'],
 ['it',
  'is',
  'second-smallest',
  'planet',
  'in',
  'the',
  'solar',
  'system',
  'after',
  'mercury',
  '.'],
 ['saturn', 'is', 'yellow', 'planet', '.']]

In [None]:
# Map every distinct token in the tokenized documents to an integer id,
# then show the token -> id table.
dictionary = gensim.corpora.Dictionary(documents=gen_docs)
print(dictionary.token2id)

{'.': 0, 'fourth': 1, 'in': 2, 'is': 3, 'mars': 4, 'our': 5, 'planet': 6, 'solar': 7, 'system': 8, 'the': 9, 'after': 10, 'it': 11, 'mercury': 12, 'second-smallest': 13, 'saturn': 14, 'yellow': 15}


In [None]:
# Convert each tokenized document into its bag-of-words form:
# a list of (token id, count) pairs.
corpus = list(map(dictionary.doc2bow, gen_docs))
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(0, 1),
  (2, 1),
  (3, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(0, 1), (3, 1), (6, 1), (14, 1), (15, 1)]]

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus, then print each document's
# terms with their weights rounded to two decimals.
tf_idf = gensim.models.TfidfModel(corpus)
for doc in tf_idf[corpus]:
    # float(...) turns the NumPy scalar returned by np.around into a plain
    # Python float, so the printed list shows 0.53 rather than
    # np.float64(0.53) on NumPy >= 2. token_id avoids shadowing the id() builtin.
    print([[dictionary[token_id], float(np.around(weight, decimals=2))]
           for token_id, weight in doc])

[['fourth', 0.53], ['in', 0.2], ['mars', 0.53], ['our', 0.53], ['solar', 0.2], ['system', 0.2], ['the', 0.2]]
[['in', 0.17], ['solar', 0.17], ['system', 0.17], ['the', 0.17], ['after', 0.47], ['it', 0.47], ['mercury', 0.47], ['second-smallest', 0.47]]
[['saturn', 0.71], ['yellow', 0.71]]


In [None]:
# Build the similarity index over the TF-IDF-weighted corpus.
# 'workdir/' is the prefix gensim uses for the index shard files it writes to
# disk, so that directory must exist before a shard is saved. Create it up
# front so the notebook also works on a fresh checkout / fresh kernel.
os.makedirs('workdir', exist_ok=True)
sims = gensim.similarities.Similarity('workdir/',tf_idf[corpus],
                                        num_features=len(dictionary))
sims

<gensim.similarities.docsim.Similarity at 0x150a229c280>

In [None]:
# Load the query text and split it into sentences.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('data/demofile2.txt', encoding='utf-8') as f:
    file2_docs = list(sent_tokenize(f.read()))

print("Number of documents:",len(file2_docs))

# Pool the lower-cased tokens of every sentence into one query document.
# Building the bag of words inside a per-sentence loop would keep only the
# last sentence and leave query_doc_bow undefined for an empty file.
query_doc = [w.lower() for line in file2_docs for w in word_tokenize(line)]
# doc2bow only looks tokens up in the existing dictionary here (allow_update
# is not set), so words that never occurred in the corpus are left out.
query_doc_bow = dictionary.doc2bow(query_doc)
query_doc_bow

Number of documents: 1


[(0, 1), (3, 1), (6, 1), (9, 2), (14, 1)]

In [None]:
# perform a similarity query against the corpus
query_doc_tf_idf = tf_idf[query_doc_bow]
query_doc_tf_idf

[(9, 0.5938758662252933), (14, 0.8045566825992793)]

In [None]:
# Look the weighted query up in the similarity index and show the scores
# (one score per indexed document).
query_similarities = sims[query_doc_tf_idf]
print('Comparing Result:', query_similarities)

Comparing Result: [0.11641413 0.10281226 0.56890744]


In [None]:
# Add up the query's similarity scores across all indexed documents,
# accumulating in float32.
sum_of_sims = sims[query_doc_tf_idf].sum(dtype=np.float32)
print(sum_of_sims)

0.78813386


In [None]:
# Average similarity of the query against the corpus: the summed scores
# divided by the number of corpus documents, reported as a fraction, as a
# percentage, and as a percentage rounded to the nearest integer.
mean_similarity_raw = sum_of_sims / len(file_docs)
# The rounded figure is derived from the un-converted quotient, scaled before
# the conversion to a Python float.
percentage_of_similarity = round(float(mean_similarity_raw * 100))
mean_similarity = float(mean_similarity_raw)
print(f'Average similarity float: {mean_similarity}')
print(f'Average similarity percentage: {mean_similarity * 100}')
print(f'Average similarity rounded percentage: {percentage_of_similarity}')

Average similarity float: 0.2627112865447998
Average similarity percentage: 26.27112865447998
Average similarity rounded percentage: 26
