In [None]:
import numpy as np
import spacy   # another tokenizer, lemmatizer (has --> be)
import nltk
from nltk import word_tokenize
from nltk.corpus import inaugural
# Download the NLTK resources used below: the inaugural-address corpus and
# the 'punkt' tokenizer data.
nltk.download('inaugural')
nltk.download('punkt')

# Load the small English spaCy pipeline and switch off the parser and NER
# components, which are not used by the processing in this notebook.
# NOTE: `nlp` is assigned again in a later cell, which replaces this object.
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes('parser', 'ner')


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['parser', 'ner']

## Get Docs

In [None]:
# File IDs of all inaugural addresses in the corpus (one per address).
list_doc = inaugural.fileids()
# Show the attributes/methods available on the corpus reader object,
# then the list of file IDs.
print(dir(inaugural))
print(list_doc)

['CorpusView', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_citation', '_encoding', '_fileids', '_get_root', '_license', '_para_block_reader', '_read_para_block', '_read_sent_block', '_read_word_block', '_readme', '_root', '_sent_tokenizer', '_tagset', '_unload', '_word_tokenizer', 'abspath', 'abspaths', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'paras', 'raw', 'readme', 'root', 'sents', 'words']
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '184

In [None]:
# Raw text of every address, in the same order as the file IDs in list_doc.
list_docs = [inaugural.raw(fid) for fid in list_doc]

print(len(list_docs))

#print(name_doc)
#print(list_docs[0])

59


In [None]:
# Access the tokenized words of a single address.
# Print the first 20 words of Washington's 1789 address.
print(inaugural.words('1789-Washington.txt')[:20])
# Tip: call inaugural.paras(...) instead of inaugural.words(...) to get paragraphs.

['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':', 'Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer # extract words and count words in each text
from sklearn.feature_extraction.text import TfidfTransformer
from collections import Counter

import spacy   # another tokenizer, lemmatizer (has --> be)
# Reloading the model replaces the `nlp` object configured in the first cell,
# so the parser and NER must be disabled again here; otherwise they run on
# every document even though only lemma_/is_stop/is_alpha are used.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Step 1: text processing for one document - return lemmas
def nlp_processing(doc):
    """Turn one raw text into a list of normalized terms.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words
    and tokens that are not purely alphabetic are dropped; each remaining
    token contributes its lemma, lower-cased.

    Args:
        doc: raw text (string) of a document or query.

    Returns:
        List of lower-cased lemmas, in the order the tokens appear.
    """
    terms = []
    for tok in nlp(doc):
        # skip stop words and anything containing non-alphabetic characters
        if tok.is_stop or not tok.is_alpha:
            continue
        terms.append(tok.lemma_.lower())
    return terms

# Step 2: extract a list of (token, doc_id) from all documents.
# input a list of documents
# output: a list of sorted (token, doc_id) tuples
def extract_token_doc_id(list_doc):
  """Build the (term, doc index) pairs for a list of corpus file IDs.

  Each file ID is read with `inaugural.raw` and processed by
  `nlp_processing`; the document is identified by its position in `list_doc`.

  Args:
    list_doc: list of file IDs of the `inaugural` corpus.

  Returns:
    (pairs, len_docs) where `pairs` is the list of (term, doc_index) tuples
    sorted by term (pairs with the same term keep document order), and
    `len_docs[i]` is the number of terms kept for document i.
  """
  pairs = []
  len_docs = []
  for doc_idx, fileid in enumerate(list_doc):
    doc_terms = nlp_processing(inaugural.raw(fileid))
    len_docs.append(len(doc_terms))
    pairs += [(term, doc_idx) for term in doc_terms]

  # stable sort on the term only, so equal terms stay in document order
  pairs.sort(key = lambda pair: pair[0])

  return pairs, len_docs

# count of each term in the collection
def counter(items):
  """Count how many times each item occurs.

  Args:
    items: iterable of sortable, hashable items (e.g. term strings).

  Returns:
    Dict {item: count} ordered by decreasing count; items with the same
    count are in ascending (alphabetical) order.
  """
  tally = {}
  # walking the items in sorted order makes ties come out alphabetically
  for term in sorted(items):
    tally[term] = tally.get(term, 0) + 1

  # stable sort: most frequent first, ties keep the alphabetical order
  ranked = sorted(tally.items(), key = lambda kv: kv[1], reverse = True)
  return dict(ranked)


# Step 3: Extract terms (unique) and document frequency (count tokens)
# change this to account only once for a repeated term in the same document
# all_tokens list of tuples
def doc_freq(all_tokens):
  """Compute the document frequency of every term.

  Args:
    all_tokens: iterable of (term, doc_id) tuples; a term repeated inside
      the same document is counted only once for that document.

  Returns:
    Dict {term: number of distinct documents containing it}, with keys in
    ascending term order.
  """
  docs_per_term = {}
  for term, doc_id in all_tokens:
    # a set per term collapses repeated occurrences in one document
    docs_per_term.setdefault(term, set()).add(doc_id)

  return {term: len(docs_per_term[term]) for term in sorted(docs_per_term)}

# Step 4: Extract term frequency of each term in each document it appears in
# dict_term_freq = {term: {doc1:tf1, doc2:tf2, ...}} # includes only docs that have
# non-zero term frequency
def term_freq(all_tokens, dict_doc_freq):
  """Count the occurrences of each term in each document.

  Args:
    all_tokens: iterable of (term, doc_id) tuples, one per occurrence.
    dict_doc_freq: dict whose keys are all the unique terms; only the keys
      are read. Every term in `all_tokens` must be a key (KeyError otherwise).

  Returns:
    Dict {term: {doc_id: count}} holding only documents with a non-zero
    count; terms are in the key order of `dict_doc_freq`.
  """
  postings = {}
  for term in dict_doc_freq:
    postings[term] = {}

  for term, doc_id in all_tokens:
    per_doc = postings[term]
    per_doc[doc_id] = per_doc.get(doc_id, 0) + 1

  return postings

In [None]:
# Run the NLP processing on every address: builds the term-sorted list of
# (token, doc_index) pairs and the per-document token counts (len_docs).
list_token_doc, len_docs = extract_token_doc_id(list_doc)
print(list_token_doc[:5])
print(len_docs)

[('abandon', 2), ('abandon', 3), ('abandon', 8), ('abandon', 8), ('abandon', 11)]
[578, 56, 976, 723, 902, 478, 501, 1388, 1795, 1245, 490, 486, 1670, 3297, 2033, 487, 1416, 1218, 1411, 283, 465, 532, 1088, 1303, 749, 1904, 916, 1691, 952, 382, 2325, 703, 575, 1557, 1667, 1554, 831, 812, 547, 224, 1051, 1103, 702, 609, 581, 877, 728, 504, 986, 1123, 947, 677, 943, 681, 925, 1029, 891, 618, 1004]


In [None]:
# Collection statistics needed for P(w|collection) = c(w, C) / total length:
# the total number of tokens and the count of each term over all documents.
len_collection = sum(len_docs)
all_tokens = [pair[0] for pair in list_token_doc]
dict_collection = counter(all_tokens)  # {term: count}, most frequent first
collection_items = list(dict_collection.items())
print(collection_items[:5])    # most frequent terms
print(collection_items[-5:])   # least frequent terms


[('government', 651), ('people', 633), ('nation', 517), ('great', 441), ('country', 359)]
[('wrongfully', 1), ('yearn', 1), ('yearning', 1), ('youthful', 1), ('zone', 1)]


In [None]:
# Per-document term frequencies: {term: {doc_index: count}}.
# dict_collection is passed only for its keys (the unique terms); term_freq
# does not read the counts stored in it.
dict_term_freq = term_freq(list_token_doc, dict_collection)
print(list(dict_term_freq.items())[:5])

[('government', {0: 9, 1: 1, 2: 18, 3: 13, 4: 3, 6: 3, 7: 22, 8: 15, 9: 21, 10: 8, 11: 14, 12: 17, 13: 44, 14: 50, 15: 8, 16: 10, 17: 14, 18: 19, 19: 1, 20: 3, 21: 5, 22: 20, 23: 24, 24: 17, 25: 11, 26: 14, 27: 24, 28: 16, 29: 3, 30: 28, 31: 9, 32: 1, 33: 14, 34: 16, 35: 30, 36: 4, 37: 16, 38: 4, 40: 4, 41: 1, 43: 1, 44: 1, 45: 5, 46: 10, 47: 4, 48: 16, 49: 20, 50: 5, 51: 4, 52: 10, 53: 4, 54: 6, 55: 4, 56: 4, 57: 3}), ('people', {0: 4, 1: 1, 2: 20, 3: 2, 5: 1, 6: 3, 7: 15, 8: 11, 9: 7, 10: 4, 11: 9, 12: 20, 13: 38, 14: 16, 15: 3, 16: 6, 17: 13, 18: 20, 20: 2, 21: 7, 22: 9, 23: 21, 24: 18, 25: 29, 26: 22, 27: 25, 28: 12, 29: 6, 30: 7, 31: 2, 32: 8, 33: 12, 34: 16, 35: 18, 36: 8, 37: 11, 38: 9, 39: 2, 40: 21, 41: 18, 42: 15, 43: 2, 44: 9, 45: 15, 46: 6, 47: 7, 48: 9, 49: 17, 50: 7, 51: 12, 52: 11, 53: 1, 54: 7, 55: 8, 56: 11, 57: 10, 58: 10}), ('nation', {0: 3, 2: 20, 3: 4, 4: 6, 5: 8, 6: 5, 7: 10, 8: 10, 9: 15, 10: 2, 11: 2, 12: 7, 13: 7, 14: 8, 15: 2, 16: 12, 17: 11, 19: 4, 20: 6, 21:

In [None]:
# jm similarity method
import numpy as np

def jm_similarity(query, len_docs, dict_freq_collection, dict_term_freq,
                  lam = 0.2):
  """Rank documents for a query with Jelinek-Mercer smoothed query likelihood.

  For every document d containing at least one query term the score is
      sum over query terms w of  c(w, q) * log P(w|d)
  with
      P(w|d) = (1 - lam) * c(w, d) / len(d) + lam * c(w, C) / len(C).

  Query terms that do not occur in the collection are ignored (previously
  they raised a KeyError while the candidate documents were collected).

  Args:
    query: raw query string; processed with `nlp_processing`.
    len_docs: list where len_docs[d] is the number of terms in document d.
    dict_freq_collection: {term: count of the term in the whole collection}.
    dict_term_freq: {term: {doc_index: count of the term in that document}}.
    lam: interpolation weight given to the collection model.

  Returns:
    List of (doc_index, score) tuples sorted by decreasing score. Empty when
    no query term occurs in the collection.
  """
  # process the query -> term and its count in the query
  dict_freq_query = counter(nlp_processing(query))

  # keep only the query terms known to the collection statistics
  words_query = [w for w in dict_freq_query
                 if w in dict_freq_collection and w in dict_term_freq]

  len_all = sum(len_docs) # length of the collection

  # candidate documents: those containing at least one query term
  docs = []
  for w in words_query:
    docs.extend(dict_term_freq[w].keys())
  docs = set(docs)

  similarity = {d:0 for d in docs}

  for w in words_query:
    postings = dict_term_freq[w]
    for d in docs:
      # c(w, d) is 0 when the candidate document lacks this particular term
      c_w_d = postings.get(d, 0)
      prob_w_d = (1-lam)* c_w_d/len_docs[d] + (lam*dict_freq_collection[w])/len_all

      # similarity(q, d) += c(w, q) * log P(w|d)
      similarity[d] += dict_freq_query[w] * np.log(prob_w_d)

  # highest log-likelihood first
  return sorted(similarity.items(), key = lambda x: x[1], reverse = True)

In [None]:
def dirchlet_similarity(query, len_docs, dict_freq_collection, dict_term_freq,
                  mu = 0.2):
  """Rank documents for a query with Dirichlet-prior smoothed query likelihood.

  For every document d containing at least one query term the score is
      sum over query terms w of  c(w, q) * log P(w|d)
  with
      P(w|d) = (c(w, d) + mu * P(w|C)) / (len(d) + mu),
      P(w|C) = c(w, C) / len(C).

  The collection term must be the probability P(w|C), not the raw count
  c(w, C): with the raw count the value is not a probability (it can exceed
  1, giving positive log scores) and the ranking favours short documents.
  Query terms that do not occur in the collection are ignored.

  Args:
    query: raw query string; processed with `nlp_processing`.
    len_docs: list where len_docs[d] is the number of terms in document d.
    dict_freq_collection: {term: count of the term in the whole collection}.
    dict_term_freq: {term: {doc_index: count of the term in that document}}.
    mu: Dirichlet prior weight (pseudo-count), in the same unit as len_docs.

  Returns:
    List of (doc_index, score) tuples sorted by decreasing score. Empty when
    no query term occurs in the collection.
  """
  # process the query -> term and its count in the query
  dict_freq_query = counter(nlp_processing(query))

  # keep only the query terms known to the collection statistics
  words_query = [w for w in dict_freq_query
                 if w in dict_freq_collection and w in dict_term_freq]

  len_all = sum(len_docs) # length of the collection

  # candidate documents: those containing at least one query term
  docs = []
  for w in words_query:
    docs.extend(dict_term_freq[w].keys())
  docs = set(docs)

  similarity = {d:0 for d in docs}

  for w in words_query:
    postings = dict_term_freq[w]
    # mu * P(w|C): pseudo-count contributed by the collection model
    prior_w = mu * dict_freq_collection[w] / len_all
    for d in docs:
      # c(w, d) is 0 when the candidate document lacks this particular term
      c_w_d = postings.get(d, 0)
      prob_w_d = (c_w_d + prior_w)/(len_docs[d] + mu)

      # similarity(q, d) += c(w, q) * log P(w|d)
      similarity[d] += dict_freq_query[w] * np.log(prob_w_d)

  # highest log-likelihood first
  return sorted(similarity.items(), key = lambda x: x[1], reverse = True)

In [None]:
def getTupple(ranked_doc,list_doc, top_k = 3):
  """Map the best-ranked documents to their names.

  Args:
    ranked_doc: list of (doc_index, score) tuples, best first.
    list_doc: list of document names indexed by doc_index.
    top_k: maximum number of names to return (default 3).

  Returns:
    Tuple with the names of the first `top_k` ranked documents. It is shorter
    than `top_k` (possibly empty) when fewer documents were ranked, instead
    of raising an IndexError.
  """
  # the document index is the FIRST element of each (doc_index, score) tuple
  return tuple(list_doc[entry[0]] for entry in ranked_doc[:top_k])

In [None]:
# Jelinek-Mercer experiment: rank the addresses for each test query with two
# values of lambda and report the top 3 documents.
print("jm_similarity and smoothiung\n")

jm_queries = ["economic growth policy", "national security defense", "god bless america"]
jm_lambdas = [(0.3, ".3"), (0.7, ".7")]   # (value passed to jm_similarity, label printed)

# Manual relevance judgements: how many of the top 3 documents were judged
# relevant for each (query, lambda). These are not computed; re-assess them
# whenever the ranking changes.
jm_relevant_in_top3 = {
    ("economic growth policy", 0.3): 3,
    ("economic growth policy", 0.7): 3,
    ("national security defense", 0.3): 3,
    ("national security defense", 0.7): 3,
    ("god bless america", 0.3): 3,
    ("god bless america", 0.7): 3,
}

for query in jm_queries:
  for lam, lam_label in jm_lambdas:
    ranked_doc = jm_similarity(query, len_docs, dict_collection, dict_term_freq, lam)
    print(f"Top 3 Similarity for \"{query}\" with smoothing {lam_label} \n {ranked_doc[:3]}")
    print(f"Associated Titles{getTupple(ranked_doc,list_doc)}")
    print(f"Precision Value {jm_relevant_in_top3[(query, lam)]/3}\n")



jm_similarity and smoothiung

Top 3 Similarity for "economic growth policy" with smoothing .3 
 [(35, -18.574836780395568), (40, -19.26070813237846), (49, -19.544314985274454)]
Associated Titles('1929-Hoover.txt', '1949-Truman.txt', '1985-Reagan.txt')
Precision Value 1.0

Top 3 Similarity Similarity for "economic growth policy" with smoothing .7 
 [(35, -19.584579246824976), (49, -19.77231245487262), (40, -19.858900627391694)]
Associated Titles('1929-Hoover.txt', '1985-Reagan.txt', '1949-Truman.txt')
Precision Value 1.0

Top 3 Similarity Similarity for "national security defense" with smoothing .3 
 [(40, -17.01965322063667), (49, -17.741735654457493), (9, -17.978621694256972)]
Associated Titles('1949-Truman.txt', '1985-Reagan.txt', '1825-Adams.txt')
Precision Value 1.0

Top 3 Similarity Similarity for "national security defense" with smoothing .7 
 [(40, -18.083309771406142), (49, -18.672308092841725), (9, -18.783942311290573)]
Associated Titles('1949-Truman.txt', '1985-Reagan.txt', '

## Comment on the results you got with respect to lambda, and the best results you obtained with the vector space method

### When we ran the `jm_similarity` function with lambda values of 0.3 and 0.7, we observed a clear difference between the two: the lower lambda of 0.3 gave similarity scores (sums of log-probabilities, so always negative) that were closer to 0 than those obtained with the higher lambda of 0.7. Furthermore, both lambda values gave a precision of 1.0 for every query, meaning that all of the top 3 documents were judged relevant to the query. For example, for the query "god bless America", although the top 3 ranked documents were the same for both lambda values, the similarity scores for lambda 0.3 were -14.45, -16.09, and -16.14, while the scores for lambda 0.7 were lower (more negative): -16.24, -17.35, and -17.445. This was the case for all queries tested with lambda values 0.3 and 0.7, so we concluded that a lambda of 0.3 provides better similarity scores for the documents. The best results that we obtained from the vector space method were for the "god bless America" query, since it produced the similarity scores closest to 0.

In [None]:
import statistics

def getMu(list_doc):
  """Return the mean and sample standard deviation of the document lengths.

  The length of a document is the number of characters of its raw text
  (`len(inaugural.raw(...))`).
  NOTE(review): the similarity functions measure document length in
  processed tokens (len_docs), not characters - confirm that a
  character-based value is the intended scale when it is used as mu.

  Args:
    list_doc: list of file IDs of the `inaugural` corpus (at least two,
      as required by `statistics.stdev`).

  Returns:
    Tuple (mean length, sample standard deviation of the lengths).
  """
  char_lengths = [len(inaugural.raw(fileid)) for fileid in list_doc]
  mean = sum(char_lengths)/len(char_lengths)
  return (mean, statistics.stdev(char_lengths))

In [None]:
# Dirichlet experiment: rank the addresses for each test query with two
# values of mu and report the top 3 documents.
print("dirchlet and smoothiung\n")

# getMu returns (mean, sample standard deviation) of the document lengths;
# both are tried as the value of mu.
Mu = getMu(list_doc)

print(Mu)

dir_queries = ["economic growth policy", "national security defense", "god bless america"]
dir_mus = [(Mu[0], "mean"), (Mu[1], "std_dev")]   # (value of mu, label printed)

# Manual relevance judgements: how many of the top 3 documents were judged
# relevant for each (query, mu label). These are not computed; re-assess them
# whenever the ranking changes.
dir_relevant_in_top3 = {
    ("economic growth policy", "mean"): 2,
    ("economic growth policy", "std_dev"): 2,
    ("national security defense", "mean"): 1,
    ("national security defense", "std_dev"): 1,
    ("god bless america", "mean"): 3,
    ("god bless america", "std_dev"): 3,
}

for query in dir_queries:
  for mu_value, mu_label in dir_mus:
    ranked_doc = dirchlet_similarity(query, len_docs, dict_collection, dict_term_freq, mu_value)
    print(f"Top 3 Similarity for \"{query}\" with smoothing based on {mu_label} \n {ranked_doc[:3]}")
    print(f"Associated Titles{getTupple(ranked_doc,list_doc)}")
    print(f"Precision Value {dir_relevant_in_top3[(query, mu_label)]/3}\n")

dirchlet and smoothiung

(13682.64406779661, 8207.46889406168)
Top 3 Similarity for "economic growth policy" with smoothing based on mean 
 [(29, 11.877620590870288), (20, 11.859964719106703), (5, 11.857211311213392)]
Associated Titles('1905-Roosevelt.txt', '1869-Grant.txt', '1809-Madison.txt')
Precision Value 0.6666666666666666

Top 3 Similarity Similarity for "economic growth policy" with smoothing based on std_dev 
 [(29, 11.82375498379379), (20, 11.794898596195072), (5, 11.790408256650917)]
Associated Titles('1905-Roosevelt.txt', '1869-Grant.txt', '1809-Madison.txt')
Precision Value 0.6666666666666666

Top 3 Similarity Similarity for "national security defense" with smoothing based on mean 
 [(29, 13.173664737937806), (20, 13.15601435773037), (5, 13.15325742736077)]
Associated Titles('1905-Roosevelt.txt', '1869-Grant.txt', '1809-Madison.txt')
Precision Value 0.3333333333333333

Top 3 Similarity Similarity for "national security defense" with smoothing based on std_dev 
 [(29, 13.11

## Comment on the results with respect to the value of mu, and to the results with the JM smoothing and the best results with the vector space method.

### When testing the value of mu, we decided to base it on the mean length of the documents and on the standard deviation of the document lengths, to see which one would give better results. The precision values were the same for the respective queries: both the mean and the standard deviation produced a precision of 0.67 for the query "economic growth policy", 0.33 for "national security defense", and 1.0 for "god bless America". Additionally, we observed that the mean gave better similarity scores than the standard deviation; the values were not much greater, but it was still noticeable that the mean performed better with the Dirichlet function. When comparing the results of the `jm_similarity` function with those of the Dirichlet function, we found the `jm_similarity` function to be much more precise in terms of relevance, since the precision values for the `jm_similarity` output were all 1.0, while the precision values for the Dirichlet function ranged from 0.67 for the "economic growth policy" query, to 0.33 for the "national security defense" query, to 1.0 for the "god bless America" query. Once again, the best results found with the vector space method were for the "god bless America" query, where we got the highest values when using the mean: 13.71, 13.67, and 13.66.