# TF-IDF Search Using Cosine Similarity

### 1. Imports
Load all relevant Python libraries and a spaCy language model.

In [69]:
import numpy as np
import json
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

### 2. Access tokens
Access the tokenized text in your new dataset from the previous milestone. Each document dictionary should now include a new key-value pair with the lemmatized text of the articles.

In [25]:
import spacy
import en_core_web_sm

# Load the spaCy `en_core_web_sm` pipeline once; tokenize() below calls `nlp`
# on raw text to split it into tokens.
nlp = en_core_web_sm.load()

### 3. Create a corpus vocabulary.
It should simply be a list of unique tokens in the provided set of documents. Count how many times each unique token appears in the corpus; you will need these counts for the next step.

In [41]:
# Read the document dictionaries produced by the previous milestone. The
# context manager closes the file handle as soon as parsing is done.
with open('output.json',) as f:
    data = json.load(f)

# Flatten every document's token list into one corpus-wide token list.
# extend() appends in place, avoiding a full copy of `corpus` per document.
corpus = []
for obj in data:
    tokens = obj['tokenized_text']
    corpus.extend(tokens)

In [50]:
# Tally how often each token occurs across the flattened corpus, then shrink
# `corpus` down to the unique tokens (in first-seen order). N is the number
# of documents and is read later when computing idf.
corpus_counts = Counter(corpus)
corpus = list(corpus_counts)
N = len(data)

In [52]:
# Report the dataset size and the vocabulary size.
for label, collection in (
    ('Number of documents: ', data),
    ('Number of unique tokens: ', corpus),
):
    print(label, len(collection))

Number of documents:  26
Number of unique tokens:  1999


### 4. Calculate TF-IDF vectors
Calculate TF-IDF vectors for every article in the dataset and add these vectors to the article dictionaries. You should end up with the same list of dictionaries as before, but with a new key-value pair containing the TF-IDF vectors: [link](https://towardsdatascience.com/tf-term-frequency-idf-inverse-document-frequency-from-scratch-in-python-6c2b61b78558)

In [59]:
def tokenize(text):
    """Return the surface text of every token the `nlp` pipeline finds in *text*."""
    return [tok.text for tok in nlp(text)]

def createVector(corpus, tokens, *, term_counts=None, num_docs=None):
    """Build the TF-IDF vector of one tokenized document.

    For every term ``t`` of the counts mapping (in its iteration order) the
    vector holds ``tf(t, d) * idf(t)`` where

        tf(t, d) = count of t in d / number of tokens in d
        idf(t)   = log(num_docs / (df(t) + 1))

    Args:
        corpus: Vocabulary list. Kept for interface compatibility; the vector
            layout follows the keys of the counts mapping, not this argument.
        tokens: Tokens of the document (or query) to vectorize.
        term_counts: Optional mapping ``term -> count`` used as ``df``.
            Defaults to the module-level ``corpus_counts``.
        num_docs: Optional number of documents. Defaults to the module-level
            ``N``.

    Returns:
        A list of floats with one entry per key of the counts mapping. An
        empty ``tokens`` yields an all-zero vector instead of raising
        ``ZeroDivisionError``.
    """
    counts = corpus_counts if term_counts is None else term_counts
    n_docs = N if num_docs is None else num_docs

    n_tokens = len(tokens)
    if n_tokens == 0:
        # No tokens means every term frequency is zero by definition.
        return [0.0] * len(counts)

    # NOTE(review): the value read from the counts mapping is used as df. If
    # that mapping stores total occurrences rather than the number of
    # documents containing the term, idf turns negative for frequent terms
    # -- confirm this is intended.
    doc_counts = Counter(tokens)
    return [
        (doc_counts[term] / n_tokens) * math.log(n_docs / (df + 1))
        for term, df in counts.items()
    ]

In [60]:
# Store each document's TF-IDF vector on the document itself under 'tf_idfs'.
for d in data:
    d['tf_idfs'] = createVector(corpus, d['tokenized_text'])

In [74]:
# Serialise the documents (now carrying their 'tf_idfs' vectors) to a string
# first, so a serialisation error cannot leave a truncated file behind.
json_object = json.dumps(data, indent = 4)
# Persist the enriched dataset next to the input as output2.json.
with open("output2.json", "w") as outfile:
    outfile.write(json_object)

### 5. Search using cosine_similarity
Now we can search our list of dictionaries by this TF-IDF field with existing similarity tools. We suggest you use the scikit-learn library and its cosine_similarity function.

In [66]:
# Collect the stored TF-IDF vectors, one per document, in `data` order.
doc_vectors = [doc['tf_idfs'] for doc in data]

In [72]:
def search(doc_vectors, query_vector):
    """Score every document vector against the query by cosine similarity.

    Args:
        doc_vectors: Iterable of document vectors (sequences of numbers).
        query_vector: The query's vector, the same length as each document
            vector.

    Returns:
        A list with one similarity score per entry of ``doc_vectors``, in the
        same order.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes a single
    # row. The query row does not change between documents: build it once.
    query_row = np.array(query_vector).reshape(1, -1)
    return [
        cosine_similarity(np.array(doc_vector).reshape(1, -1), query_row)[0][0]
        for doc_vector in doc_vectors
    ]

In [73]:
# Use the raw text of the first document as the search query.
query = data[0]['text']
# Tokenize and vectorize the query with the same helpers defined above.
query_tokens = tokenize(query)
query_vector = createVector(corpus, query_tokens)
# Cell result: one similarity score per document, in `data` order.
search(doc_vectors, query_vector)

[1.0000000000000004,
 0.6577252805096234,
 0.5570125628543748,
 0.524118991779826,
 0.500633081698554,
 0.6155685311968865,
 0.6300714676318564,
 0.535137372425514,
 0.5001747355065149,
 0.48312109538620746,
 0.6869963366040872,
 0.6007230404089505,
 0.42474770533984096,
 0.5377478395195895,
 0.5279890687233489,
 0.5572419093861354,
 0.42052043365896996,
 0.6180176313136561,
 0.4843090681636577,
 0.6734567381836428,
 0.6129360551760736,
 0.6360539609029221,
 0.42589405973007666,
 0.5614056627100512,
 0.5770807248138461,
 0.6712457445301976]