# **Milestone 1:**
Text search using spaCy and scikit-learn


### **Setting up the environment**

In [1]:
# mount Google Drive at /content/drive so the data files stored there can be read and written
# (force_remount=True asks for a fresh mount even if one already exists — see the google.colab docs)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### **Importing the required modules**

In [2]:
# import libraries
from collections import Counter
from collections import defaultdict
import itertools
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

### **Getting the data**

In [3]:
# folder on the mounted drive that holds data.json and receives the generated json files
DATA_DIR = '/content/drive/MyDrive/SearchToolwNLP/01_Text_Search_spaCy_and_scikit-learn/data/'

In [4]:
# load the spaCy pipeline "en_core_web_sm"; `nlp` is used for all tokenization below
nlp = spacy.load("en_core_web_sm")

In [5]:
# load the raw summaries from the json file
# (explicit UTF-8: the texts contain non-ASCII characters such as Greek letters,
#  so the platform's default encoding must not be relied on)
with open(DATA_DIR + 'data.json', 'r', encoding='utf-8') as infile:
    summaries = json.load(infile)

### **Inspecting the dataset**

In [6]:
# report how many entries the dataset holds
print(f'The dataset comprises a list of {len(summaries)} dicts')

The dataset comprises a list of 26 dicts


In [7]:
# get the keys of the first entry (printed as a dict_keys view)
print(f'Each entry contains the following {summaries[0].keys()}')

Each entry contains the following dict_keys(['title', 'text', 'url'])


In [8]:
# print the first entry: title, text and url, each separated by a '---' line
first_entry = summaries[0]
print(
    first_entry['title'],
    '---',
    first_entry['text'],
    '---',
    first_entry['url'],
    sep='\n',
)

Pandemic
---
A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.
Throughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (SARS-CoV-2) and HIV/A

### **Cleaning the dataset**

In [9]:
# get the text content of the first entry
text = summaries[0]['text']

# create a doc object from the lowercased text
doc = nlp(text.lower())

# explore the attributes of each token returned by spaCy
print(doc[:20])
print('--------------------------------')
for token in doc[:5]:
    # token text, part-of-speech tag and dependency label, one per line
    print(token.text, token.pos_, token.dep_, '---', sep='\n')

a pandemic (from greek πᾶν, pan, "all" and δῆμος, demos, "people"
--------------------------------
a
DET
det
---
pandemic
NOUN
nsubj
---
(
PUNCT
punct
---
from
ADP
prep
---
greek
ADJ
amod
---


In [10]:
# identify unclassified tokens, i.e. tokens whose dependency label is empty
# (compare by value: `is ''` tests object identity, which only works by accident
#  of string interning and raises a SyntaxWarning on Python 3.8+)
unclassified_tokens = [(token.lemma_, token.dep_) for token in doc if token.dep_ == '']
unclassified_tokens[:10]

[('\n', '')]

In [11]:
# keep only the tokens that are neither stop words nor punctuation
token_without_sw = [
    word
    for word in doc
    if not (word.is_stop or word.is_punct)
]
token_without_sw[:10]

[pandemic,
 greek,
 πᾶν,
 pan,
 δῆμος,
 demos,
 people,
 epidemic,
 infectious,
 disease]

In [12]:
# lemmatize the remaining tokens, skipping tokens that have no dependency label
token_lemmas = [
    token.lemma_
    for token in token_without_sw
    if token.dep_ != ''
]
token_lemmas[:10]

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demos',
 'people',
 'epidemic',
 'infectious',
 'disease']

In [13]:
# build a tokenizer function
def tokenizer(document):
    """
    Turn a raw text string into a list of lemmas.

    The text is lowercased and run through the spaCy pipeline `nlp`. Stop words,
    punctuation and tokens with an empty dependency label are dropped; the lemma
    of every remaining token is returned in document order.
    """
    return [
        token.lemma_
        for token in nlp(document.lower())
        if token.dep_ and not (token.is_stop or token.is_punct)
    ]

In [14]:
# tokenize every summary and store the lemmas next to the original text
for summary in summaries:
    lemmas = tokenizer(summary['text'])
    summary['tokenized_text'] = lemmas

### **Saving the dataset**

In [15]:
# save the tokenized texts to file:
# every entry now carries a 'tokenized_text' list next to its original fields
with open(DATA_DIR + 'summaries.json', 'w') as outfile:
    json.dump(summaries, outfile)

### **Loading the dataset**

In [16]:
# read the tokenized dataset back from summaries.json
with open(DATA_DIR + 'summaries.json', 'r') as infile:
    summaries = json.load(infile)

### **Building a corpus vocabulary**

In [17]:
# collect the tokenized text of every document (a list of token lists)
tokenized_texts = [s["tokenized_text"] for s in summaries]

# flatten the list of lists and remove duplicates; the result is sorted so that
# the vocabulary order, which fixes the position of every tfidf-vector component,
# is the same on every run instead of depending on set iteration order
vocab = sorted(set(itertools.chain.from_iterable(tokenized_texts)))

# show a small sample rather than dumping the whole corpus / vocabulary
print(tokenized_texts[0][:20])
print(len(vocab))
print(vocab[:20])



In [18]:
# save the vocabulary (a flat list of unique tokens) as a json file
with open(DATA_DIR + 'vocab.json', 'w') as outfile:
    json.dump(vocab, outfile)

### **Calculating term and document frequency**

In [19]:
# count how many times each token occurs in a document
# (one Counter per document, in the same order as `summaries`)
docs_token_counter = [Counter(s['tokenized_text']) for s in summaries]

# show only the most frequent tokens of the first few documents
# instead of dumping every counter
for token_counter in docs_token_counter[:3]:
    print(token_counter.most_common(10))

# alternatively use scikit-learns CountVectorizer
# vectorizer = CountVectorizer()



In [20]:
# document frequency: for every vocabulary token, the number of documents
# whose token counter contains it
number_docs_with_token = {
    token: sum(1 for counter in docs_token_counter if token in counter)
    for token in vocab
}
print(number_docs_with_token['pandemic'])

17


In [21]:
# compute tfidf

# the inverse document frequency depends only on the token, not on the document,
# so it is computed once per vocabulary entry (same order as `vocab`)
n_docs = len(summaries)
idf_by_position = [np.log(n_docs / number_docs_with_token[token]) for token in vocab]

for summary, token_counter in zip(summaries, docs_token_counter):
    # term frequency is relative to the total number of tokens in the document
    n_tokens = len(summary['tokenized_text'])
    if n_tokens:
        tfidf_vec = [
            token_counter[token] / n_tokens * idf
            for token, idf in zip(vocab, idf_by_position)
        ]
    else:
        # a document without any tokens gets an all-zero vector
        # instead of raising ZeroDivisionError
        tfidf_vec = [0.0] * len(vocab)

    # add tfidf-vector to the dictionaries
    summary['tfidf'] = tfidf_vec

# alternatively use scikit-learns TfidfVectorizer 
# vectorizer = TfidfVectorizer()

In [22]:
# save an updated summaries version with computed tfidf-vectors
# (overwrites the summaries.json written earlier; each entry now also has a 'tfidf' list)
with open(DATA_DIR + 'summaries.json', 'w') as outfile:
    json.dump(summaries, outfile)

### **Vectorize query**

In [23]:
# build a vectorizer function for search queries
def vectorize(query, vocab=vocab):
    """
    Turn a search query into a tfidf-vector over the corpus vocabulary.

    Parameters
    ----------
    query : str
        Free-text search query; it is passed through `tokenizer`.
    vocab : list of str, optional
        Tokens defining the vector components (defaults to the corpus `vocab`).
        Every token must be a key of `number_docs_with_token`.

    Returns
    -------
    list
        One tfidf weight per vocabulary token, in `vocab` order. All weights
        are zero when no query token survives tokenization.
    """
    query_tokenized = tokenizer(query)
    n_query_tokens = len(query_tokenized)
    if n_query_tokens == 0:
        # e.g. an empty query or one made only of stop words / punctuation:
        # return a zero vector instead of dividing by zero
        return [0.0] * len(vocab)

    query_token_counter = Counter(query_tokenized)
    n_docs = len(summaries)  # loop-invariant, read from the global corpus
    query_vec = []
    for token in vocab:
        tf = query_token_counter[token] / n_query_tokens
        idf = np.log(n_docs / number_docs_with_token[token])
        query_vec.append(tf * idf)

    return query_vec

### **Search documents with scikit-learn**

In [24]:
# build a search function
def search_tfidf(query, summaries):
    """
    Rank documents against a query by cosine similarity of their tfidf-vectors.

    `query` is vectorized with `vectorize`; every entry of `summaries` must
    provide a 'title' and a 'tfidf' vector. Returns a list of dicts with the
    keys 'title' and 'result' (the similarity score), restricted to documents
    whose score is greater than zero and ordered from highest to lowest score.
    """
    # the query vector as a single row, the layout cosine_similarity is called with
    query_row = np.array(vectorize(query)).reshape(1, -1)

    hits = []
    for summary in summaries:
        doc_row = np.array(summary['tfidf']).reshape(1, -1)
        score = cosine_similarity(query_row, doc_row)[0][0]
        if not score > 0:
            # documents without any similarity to the query are left out
            continue
        hits.append({'title': summary['title'], 'result': score})

    # best match first
    hits.sort(key=lambda hit: hit['result'], reverse=True)
    return hits

In [25]:
# test the function: rank all summaries against the single-word query "ebola"
search_tfidf("ebola", summaries)

[{'result': 0.11754261855142299, 'title': 'Plague of Cyprian'},
 {'result': 0.071125289564604, 'title': 'Science diplomacy and pandemics'}]

In [26]:
# check if the article 'Plague of Cyprian' has the word "ebola" in it
# (prints the raw, un-tokenized text of every summary with that title)
print([s["text"] for s in summaries if s["title"] == 'Plague of Cyprian'])

['The Plague of Cyprian was a pandemic that afflicted the Roman Empire about from AD 249 to 262. The plague is thought to have caused widespread manpower shortages for food production and the Roman army, severely weakening the empire during the Crisis of the Third Century. Its modern name commemorates St. Cyprian, bishop of Carthage, an early Christian writer who witnessed and described the plague. The agent of the plague is highly speculative because of sparse sourcing, but suspects have included smallpox, pandemic influenza and viral hemorrhagic fever (filoviruses) like the Ebola virus.']


### **Build an inverted index**

In [27]:
# map every word of the corpus vocab to the articles it occurs in, paired with
# the word's tfidf-score for that article (articles with a score of 0 are left out)
inverted_index = {
    word: [
        (summary['title'], summary['tfidf'][position])
        for summary in summaries
        if summary['tfidf'][position] != 0
    ]
    for position, word in enumerate(vocab)
}

In [28]:
# print a sample: the (title, tfidf-score) pairs stored for the token "ebola"
inverted_index["ebola"]

[('Plague of Cyprian', 0.047499062175213644),
 ('Science diplomacy and pandemics', 0.027286695292144007)]

In [29]:
# check if "ebola" is indeed in the article
# (prints the raw text of every summary titled "Plague of Cyprian")
print([s["text"] for s in summaries if s["title"] == "Plague of Cyprian"])

['The Plague of Cyprian was a pandemic that afflicted the Roman Empire about from AD 249 to 262. The plague is thought to have caused widespread manpower shortages for food production and the Roman army, severely weakening the empire during the Crisis of the Third Century. Its modern name commemorates St. Cyprian, bishop of Carthage, an early Christian writer who witnessed and described the plague. The agent of the plague is highly speculative because of sparse sourcing, but suspects have included smallpox, pandemic influenza and viral hemorrhagic fever (filoviruses) like the Ebola virus.']


### **Search inverted index**

In [30]:
# build a search function
def search(query, index=inverted_index):
    """
    Rank articles for a query by summing tfidf scores from an inverted index.

    Parameters
    ----------
    query : str
        Free-text search query; it is passed through `tokenizer`.
    index : dict, optional
        Maps a token to a list of (title, tfidf-score) tuples. Defaults to the
        module-level `inverted_index`.

    Returns
    -------
    list of (title, score) tuples
        One tuple per article matched by at least one query token, where score
        is the sum of the tfidf scores of all matching query tokens, ordered
        from highest to lowest score.

    A query token that is missing from `index` prints a hint and is skipped;
    the remaining tokens are still evaluated.
    """
    query_tokens = tokenizer(query)

    # accumulate, per article title, the tfidf scores of all matching tokens
    scores = defaultdict(float)
    for token in query_tokens:
        try:
            # look the token up in the index that was passed in,
            # not in the global inverted_index
            postings = index[token]
        except KeyError:
            print("Please enter a query containing words from the vocab and try again...")
            continue
        for title, tfidf in postings:
            scores[title] += tfidf

    # sort search results by tfidf scores
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


In [31]:
# check a multi-word query (the tokenizer removes stop words and punctuation before the lookup)
search("is the ebola virus serious?")

[('Virus', 0.06746676589985189),
 ('Plague of Cyprian', 0.0634287152349009),
 ('Crimson Contagion', 0.0339553131009123),
 ('Viral load', 0.03386619154421699),
 ('Disease X', 0.031470777995967494),
 ('Swine influenza', 0.028050041257275376),
 ('Science diplomacy and pandemics', 0.027286695292144007),
 ('HIV/AIDS in Yunnan', 0.022837201731587032),
 ('HIV/AIDS', 0.013653988336874786),
 ('Spanish flu', 0.012903018978346673),
 ('Epidemiology of HIV/AIDS', 0.005973619897382719),
 ('COVID-19 pandemic', 0.005060007442488892)]