In [1]:
import nltk
import numpy as np
import re
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Open the txt file, read contents, tokenize each sentence, store each token in an array
sent_token_array = []
with open('../Articles/article-1.txt', 'r') as f:
    sent_tokens = sent_tokenize(f.read())
    for line in sent_tokens:
        new_sent = re.sub(r'\.$|[?!]','\n', line)
        new_sent = re.sub(r',','', new_sent)
        sent_token_array.append(new_sent)

In [3]:
sent_token_array

['Tacos are some of the best forms of food\n',
 'They are simple meat wrapped in a tortilla with some toppings on it\n',
 'To me it comes down to fundamentals because the taco is simple it forces cooks to really know what they are doing with no frills\n',
 "One's ability to cook the tortilla the meat which toppings to choose every little thing counts\n"]

In [6]:
# Convert the sentence tokens into lowercase word tokens
word_tokens = [[w.lower() for w in word_tokenize(text)] for text in sent_token_array]
word_tokens

[['tacos', 'are', 'some', 'of', 'the', 'best', 'forms', 'of', 'food'],
 ['they',
  'are',
  'simple',
  'meat',
  'wrapped',
  'in',
  'a',
  'tortilla',
  'with',
  'some',
  'toppings',
  'on',
  'it'],
 ['to',
  'me',
  'it',
  'comes',
  'down',
  'to',
  'fundamentals',
  'because',
  'the',
  'taco',
  'is',
  'simple',
  'it',
  'forces',
  'cooks',
  'to',
  'really',
  'know',
  'what',
  'they',
  'are',
  'doing',
  'with',
  'no',
  'frills'],
 ['one',
  "'s",
  'ability',
  'to',
  'cook',
  'the',
  'tortilla',
  'the',
  'meat',
  'which',
  'toppings',
  'to',
  'choose',
  'every',
  'little',
  'thing',
  'counts']]

In [7]:
# Create Dictionary of unique ID's for each word
dictionary = gensim.corpora.Dictionary(word_tokens)
dictionary.token2id

{'are': 0,
 'best': 1,
 'food': 2,
 'forms': 3,
 'of': 4,
 'some': 5,
 'tacos': 6,
 'the': 7,
 'a': 8,
 'in': 9,
 'it': 10,
 'meat': 11,
 'on': 12,
 'simple': 13,
 'they': 14,
 'toppings': 15,
 'tortilla': 16,
 'with': 17,
 'wrapped': 18,
 'because': 19,
 'comes': 20,
 'cooks': 21,
 'doing': 22,
 'down': 23,
 'forces': 24,
 'frills': 25,
 'fundamentals': 26,
 'is': 27,
 'know': 28,
 'me': 29,
 'no': 30,
 'really': 31,
 'taco': 32,
 'to': 33,
 'what': 34,
 "'s": 35,
 'ability': 36,
 'choose': 37,
 'cook': 38,
 'counts': 39,
 'every': 40,
 'little': 41,
 'one': 42,
 'thing': 43,
 'which': 44}

In [18]:
# Create a bag of words, an array that contatins the frequency of each word
freq_corpus = [dictionary.doc2bow(word_tokens) for word_tokens in word_tokens]
freq_corpus

[[(4, 1), (7, 2)],
 [(7, 1), (17, 1), (27, 1), (33, 1)],
 [(0, 1), (6, 1), (33, 1)],
 [(0, 1), (2, 1), (6, 1)]]

In [37]:
# Apply weights to each word based on how often the word appears in the text
tf_idf = gensim.models.TfidfModel(freq_corpus)
for doc in tf_idf[freq_corpus]:
    print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

[['of', 0.71], ['the', 0.71]]
[['the', 0.32], ['with', 0.63], ['is', 0.63], ['to', 0.32]]
[['are', 0.58], ['tacos', 0.58], ['to', 0.58]]
[['are', 0.41], ['food', 0.82], ['tacos', 0.41]]


In [12]:
# Create Similarity measure object
sims_1 = gensim.similarities.Similarity('tacos_similarity_index', tf_idf[freq_corpus], num_features=len(dictionary))

In [13]:
# Tokenize a second document
sent_token_array_2 = []
with open('../Articles/article-2.txt', 'r') as f:
    sent_tokens = sent_tokenize(f.read())
    for line in sent_tokens:
        new_sent = re.sub(r'\.$|[?!]','\n', line)
        new_sent = re.sub(r',','', new_sent)
        sent_token_array_2.append(new_sent)
sent_token_array_2

['I love the combination of the ingredients\n',
 'The added taste factor with salsa is to die for\n',
 'Tacos are super easy to make\n',
 'Tacos are top tier food\n']

In [14]:
# Convert sentence tokens into word tokens
word_tokens = [[w.lower() for w in word_tokenize(text)] for text in sent_token_array_2]
word_tokens

[['i', 'love', 'the', 'combination', 'of', 'the', 'ingredients'],
 ['the',
  'added',
  'taste',
  'factor',
  'with',
  'salsa',
  'is',
  'to',
  'die',
  'for'],
 ['tacos', 'are', 'super', 'easy', 'to', 'make'],
 ['tacos', 'are', 'top', 'tier', 'food']]

In [24]:
# Create unique ID dictionary for the second set of tokens
dictionary_2 = gensim.corpora.Dictionary(word_tokens)
dictionary_2.token2id

{'combination': 0,
 'i': 1,
 'ingredients': 2,
 'love': 3,
 'of': 4,
 'the': 5,
 'added': 6,
 'die': 7,
 'factor': 8,
 'for': 9,
 'is': 10,
 'salsa': 11,
 'taste': 12,
 'to': 13,
 'with': 14,
 'are': 15,
 'easy': 16,
 'make': 17,
 'super': 18,
 'tacos': 19,
 'food': 20,
 'tier': 21,
 'top': 22}

In [30]:
# Merge the dictionaries
dictionary.merge_with(dictionary_2)
#dictionary.token2id

<gensim.models.VocabTransform at 0x28a74791d08>

In [32]:
# Create second set of frequency id
query_doc_bow = [dictionary.doc2bow(word_tokens) for word_tokens in word_tokens]
query_doc_bow

[[(4, 1), (7, 2), (45, 1), (46, 1), (47, 1), (48, 1)],
 [(7, 1),
  (17, 1),
  (27, 1),
  (33, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1)],
 [(0, 1), (6, 1), (33, 1), (55, 1), (56, 1), (57, 1)],
 [(0, 1), (2, 1), (6, 1), (58, 1), (59, 1)]]

In [33]:
# perform a similarity query against the corpus
query_doc_tf_idf = tf_idf[query_doc_bow]
query_doc_tf_idf

<gensim.interfaces.TransformedCorpus at 0x28a7478cf88>

In [34]:
sims_1[query_doc_tf_idf]

array([[0.6673406 , 0.        , 0.01820851, 0.04607699],
       [0.01200717, 0.08186899, 0.37647063, 0.13665368],
       [0.31770113, 0.01540551, 0.15963912, 0.12734467],
       [0.4950611 , 0.0122559 , 0.00689628, 0.        ]], dtype=float32)