In [1]:
import json
import spacy
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Load file from milestone 1 with tokens
# A context manager guarantees the file handle is closed even when
# json.load raises (e.g. on a truncated or malformed file).
with open("token_data.json",'r') as token_file:
    token_data = json.load(token_file)

# spaCy pipeline used later by tokenizer() to process search queries.
nlp = spacy.load("en_core_web_sm")

In [3]:
#Create global token count and corpus vocab
# Flatten every article's token list into one corpus-wide list.
bag_of_words = [
    token
    for entry in token_data
    for token in entry['tokenized_text']
]

# Occurrences of each token across the whole corpus.
word_count = Counter(bag_of_words)
# Distinct tokens in first-seen order; this order defines the layout of
# every tf-idf vector built further down.
corpus_vocab = list(word_count)

In [4]:
#Number of articles that include a token
# Visit each article once and count every distinct token it contains.
# Converting the token list to a set means an article contributes at most 1
# per token, and avoids re-scanning every article's token list for every
# vocabulary entry.
article_frequency = Counter()
for doc in token_data:
    article_frequency.update(set(doc['tokenized_text']))

# Plain dict keyed in word_count order: token -> number of articles containing it.
token_article_count = {token: article_frequency[token] for token in word_count}

In [5]:
#Compute tf_idf vector for each document
# idf(token) = ln(number of articles / number of articles containing token)
idf ={token: np.log(len(token_data)/token_article_count[token]) for token in corpus_vocab}

for article in token_data:
    token_count = Counter(article['tokenized_text'])
    token_total = len(article['tokenized_text'])

    if token_total == 0:
        # An article without tokens has no term frequencies; give it an
        # all-zero vector instead of dividing by zero below.
        article['tf_idf'] = [0.0 for _ in corpus_vocab]
        continue

    tfidf_vec=[]
    for token in corpus_vocab:
        # tf is the token's share of this article's tokens.
        tf = token_count[token]/token_total
        tfidf = tf * idf[token]
        tfidf_vec.append(tfidf)

    # Stored on the article dict itself, so token_data is extended in place;
    # vector positions follow corpus_vocab order.
    article['tf_idf']=tfidf_vec

In [6]:
# Persist the articles, now carrying their 'tf_idf' vectors, as JSON.
with open('tfidf_data.json', 'w') as tfidf_file:
    json.dump(token_data, tfidf_file)

### Query to Vec

In [7]:
def tokenizer(text):
    """Lower-case ``text``, run it through the spaCy pipeline and return lemmas.

    Tokens flagged as stop words or punctuation are dropped, as are tokens
    whose dependency label is empty or whitespace-only.

    Args:
        text: Raw string to tokenize.

    Returns:
        List of lemma strings for the tokens that were kept, in document order.
    """
    parsed = nlp(text.lower())

    lemmas = []
    for tok in parsed:
        if tok.is_stop or tok.is_punct:
            continue
        # NOTE(review): presumably this drops whitespace tokens, which may
        # carry an empty dependency label depending on the spaCy version --
        # confirm it matches the milestone 1 tokenization.
        if not tok.dep_.strip():
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [8]:
def vectorize(query, vocab = corpus_vocab):
    """Convert a free-text query into a tf-idf vector laid out like ``vocab``.

    The query is tokenized with ``tokenizer``. For each vocabulary token the
    weight is (occurrences in query / query token count) multiplied by
    ln(number of articles / number of articles containing the token).

    Args:
        query: Raw query string.
        vocab: Iterable of tokens fixing the order of the output vector.
            Every token must be a key of ``token_article_count``. The default
            is ``corpus_vocab`` as bound when this function is defined.

    Returns:
        List with one weight per token in ``vocab``. When the query yields no
        tokens (e.g. it consists only of stop words) the list is all zeros,
        rather than raising ZeroDivisionError.
    """
    tokenized_query = tokenizer(query)
    query_length = len(tokenized_query)
    if query_length == 0:
        # Nothing to weight: return a zero vector of the right length.
        return [0.0 for _ in vocab]

    query_token_count = Counter(tokenized_query)
    article_total = len(token_data)

    query_vec=[]
    for token in vocab:
        tf = query_token_count[token] / query_length
        # Same idf definition as the per-article vectors.
        tfidf = tf * np.log(article_total / token_article_count[token])
        query_vec.append(tfidf)

    return query_vec
    
    

### Article Search

In [9]:
def art_search (query, articles):
    """Rank ``articles`` against ``query`` by cosine similarity of tf-idf vectors.

    Args:
        query: Raw query string; vectorized with ``vectorize``.
        articles: Iterable of dicts, each providing a 'tf_idf' vector and a
            'title'.

    Returns:
        List of ``{'title': ..., 'rank': ...}`` dicts, one per article whose
        similarity to the query is greater than zero, ordered from highest
        to lowest rank.
    """
    # Shape the query as a single-row matrix once; cosine_similarity expects 2-D input.
    query_row = np.array(vectorize(query)).reshape(1, -1)

    matches = []
    for article in articles:
        article_row = np.array(article['tf_idf']).reshape(1, -1)
        score = cosine_similarity(query_row, article_row)[0][0]
        # Keep only articles with a strictly positive similarity.
        if score > 0:
            matches.append({'title': article['title'], 'rank': score})

    matches.sort(key=lambda match: match['rank'], reverse=True)
    return matches

In [20]:
# Example: rank every loaded article against the single-word query 'people'.
art_search('people', token_data)

[{'title': 'Pandemic', 'rank': 0.19377592595564383},
 {'title': 'Epidemiology of HIV/AIDS', 'rank': 0.119507273981255},
 {'title': 'HIV/AIDS in Yunnan', 'rank': 0.11473600135755814},
 {'title': 'HIV/AIDS', 'rank': 0.03836578720617653},
 {'title': 'Swine influenza', 'rank': 0.03593161240096674},
 {'title': 'COVID-19 pandemic', 'rank': 0.024757165823364867},
 {'title': 'Viral load', 'rank': 0.02146143400714939},
 {'title': 'Spanish flu', 'rank': 0.021248318942639555},
 {'title': 'Basic reproduction number', 'rank': 0.020925349281131524},
 {'title': 'Cholera', 'rank': 0.016536575925365078},
 {'title': '1929–1930 psittacosis pandemic', 'rank': 0.014327420984000376}]