In [7]:
import nltk
import numpy as np
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize

In [8]:
# Open the txt file, read contents, tokenize each sentence, store each token in an array
sent_token_array = []
with open('../Articles/article-1.txt', 'r') as f:
    sent_tokens = sent_tokenize(f.read())
    for line in sent_tokens:
        sent_token_array.append(line)

In [9]:
sent_token_array

['Tacos are some of the best forms of food.',
 'They are simple, meat wrapped in a tortilla, with some toppings on it.',
 'To me it comes down to fundamentals, because the taco is simple it forces cooks to really know what they are doing with no frills.',
 "One's ability to cook the tortilla, the meat, which toppings to choose, every little thing counts."]

In [10]:
# Convert the sentence tokens into word tokens
word_tokens = [[w.lower() for w in word_tokenize(text)] for text in sent_token_array]

In [11]:
# Create Dictionary of unique ID's for each word
dictionary = gensim.corpora.Dictionary(word_tokens)
dictionary.token2id

{'.': 0,
 'are': 1,
 'best': 2,
 'food': 3,
 'forms': 4,
 'of': 5,
 'some': 6,
 'tacos': 7,
 'the': 8,
 ',': 9,
 'a': 10,
 'in': 11,
 'it': 12,
 'meat': 13,
 'on': 14,
 'simple': 15,
 'they': 16,
 'toppings': 17,
 'tortilla': 18,
 'with': 19,
 'wrapped': 20,
 'because': 21,
 'comes': 22,
 'cooks': 23,
 'doing': 24,
 'down': 25,
 'forces': 26,
 'frills': 27,
 'fundamentals': 28,
 'is': 29,
 'know': 30,
 'me': 31,
 'no': 32,
 'really': 33,
 'taco': 34,
 'to': 35,
 'what': 36,
 "'s": 37,
 'ability': 38,
 'choose': 39,
 'cook': 40,
 'counts': 41,
 'every': 42,
 'little': 43,
 'one': 44,
 'thing': 45,
 'which': 46}

In [20]:
# Create a bag of words, an array that contatins the frequency of each word
freq_corpus = [dictionary.doc2bow(word_tokens) for word_tokens in word_tokens]
freq_corpus

[[(0, 1), (5, 1), (8, 2)],
 [(0, 1), (8, 1), (19, 1), (29, 1), (35, 1)],
 [(0, 1), (1, 1), (7, 1), (35, 1)],
 [(0, 1), (1, 1), (3, 1), (7, 1)]]

In [13]:
# Apply weights to each word based on how often the word appears in the text
tf_idf = gensim.models.TfidfModel(freq_corpus)

In [14]:
# Create Similarity measure object
sims_0 = gensim.similarities.Similarity('tacos_similarity_index', tf_idf[freq_corpus], num_features=len(dictionary))

In [15]:
# Tokenize a second document
sent_token_array_2 = []
with open('../Articles/article-2.txt', 'r') as f:
    sent_tokens = sent_tokenize(f.read())
    for line in sent_tokens:
        sent_token_array_2.append(line)

In [16]:
# Convert sentence tokens into word tokens
word_tokens = [[w.lower() for w in word_tokenize(text)] for text in sent_token_array_2]
word_tokens

[['i', 'love', 'the', 'combination', 'of', 'the', 'ingredients', '.'],
 ['the',
  'added',
  'taste',
  'factor',
  'with',
  'salsa',
  'is',
  'to',
  'die',
  'for',
  '.'],
 ['tacos', 'are', 'super', 'easy', 'to', 'make', '.'],
 ['tacos', 'are', 'top', 'tier', 'food', '.']]

In [17]:
query_doc_bow = [dictionary.doc2bow(word_tokens) for word_tokens in word_tokens]
query_doc_bow

[[(0, 1), (5, 1), (8, 2)],
 [(0, 1), (8, 1), (19, 1), (29, 1), (35, 1)],
 [(0, 1), (1, 1), (7, 1), (35, 1)],
 [(0, 1), (1, 1), (3, 1), (7, 1)]]

In [18]:
# perform a similarity query against the corpus
query_doc_tf_idf = tf_idf[query_doc_bow]
query_doc_tf_idf

<gensim.interfaces.TransformedCorpus at 0x226fe92c888>

In [19]:
sims_0[query_doc_tf_idf]

array([[0.6673406 , 0.        , 0.01818801, 0.04534583],
       [0.01200717, 0.08072653, 0.37604663, 0.13448519],
       [0.31770113, 0.01519053, 0.15945932, 0.12532389],
       [0.4950611 , 0.01208487, 0.00688851, 0.        ]], dtype=float32)