## Problem
Information retrieval using word embeddings.

## Step 1-1 Import the libraries
Here are the libraries:


In [1]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
# Shared helpers for preprocessing: one tokenizer instance and the English
# stopword list, both reused by every call to the stopword-removal step.
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('english')

## Step 1-2 Create/import documents
We use a few sentences taken at random from the internet as sample documents:


In [3]:
# Sample corpus: each document is a one-element list holding its raw text.
Doc1 = [
    "With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will  have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders."
]
Doc2 = [
    "Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages,in particular how to program computers to process and analyze large amounts of natural language data."
]
Doc3 = [
    "He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems."
]
Doc4 = [
    "But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni,India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg."
]

# Flatten the per-document lists into a single list of document strings.
fin = list(itertools.chain(Doc1, Doc2, Doc3, Doc4))

## Step 1-3 Download word2vec
As mentioned earlier, we are going to use the word embeddings to solve
this problem. 

Download the pre-trained word2vec model from the link below:

https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [None]:
# Load the word vectors stored in binary word2vec format and bind them to the
# module-level name `model`, which the embedding lookup below reads.
# NOTE(review): the path is absolute and points at the filesystem root —
# presumably it must be changed to wherever the downloaded file was saved; confirm.
model = gensim.models.KeyedVectors.load_word2vec_format('/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Step 1-4 Create IR system
# Now we build the information retrieval system:
#Preprocessing
def remove_stopwords(text, is_lower_case=False):
    """Strip special characters and English stopwords from ``text``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    is_lower_case : bool, optional
        If True, tokens are compared against the stopword list as-is (the
        caller states the text is already lower-cased). If False (default),
        each token is lower-cased for the comparison only; the surviving
        tokens keep their original casing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Keep only letters, digits and whitespace. The range must be `A-Z`:
    # `A-z` also spans the ASCII characters [ \ ] ^ _ ` and would keep them.
    pattern = r'[^a-zA-Z0-9\s]'
    # re.sub needs (pattern, replacement, string); drop the matched characters.
    text = re.sub(pattern, '', text)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    # Set membership avoids rescanning the whole stopword list per token.
    stopword_set = set(stopword_list)
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

# Function to get the embedding vector for a word; out-of-vocabulary words
# map to a zero vector (the GoogleNews model used here has 300 dimensions).
def get_embedding(word, vectors=None, dim=300):
    """Return the embedding of ``word``, or a zero vector if it is unknown.

    Parameters
    ----------
    word : str
        Token to look up.
    vectors : mapping-like, optional
        Object supporting ``word in vectors`` and ``vectors[word]``.
        Defaults to the module-level ``model``. If the object exposes a
        ``wv`` attribute, that attribute is used for the lookup instead.
    dim : int, optional
        Length of the zero vector returned for unknown words when the
        lookup object does not expose ``vector_size``. Defaults to 300.

    Returns
    -------
    numpy.ndarray
        The stored vector for ``word``, or ``np.zeros(...)`` if absent.
    """
    lookup = model if vectors is None else vectors
    # Use the `wv` attribute when present, otherwise the object itself.
    lookup = getattr(lookup, "wv", lookup)
    # Membership test instead of `.vocab`, which newer gensim releases removed.
    if word in lookup:
        return lookup[word]
    return np.zeros(getattr(lookup, "vector_size", dim))

In [None]:
# Getting average vector for each document:
# out_dict maps each raw document string to the mean of its token embeddings.
out_dict = {}
for sen in fin:
    doc_tokens = nltk.word_tokenize(remove_stopwords(sen))
    average_vector = np.mean(np.array([get_embedding(x) for x in doc_tokens]), axis=0)
    # Assign directly instead of building a temporary named `dict`, which
    # would shadow the builtin for the rest of the kernel session.
    out_dict[sen] = average_vector

# Function to calculate the similarity between the query vector and document vector
def get_sim(query_embedding, average_vector_doc):
    """Cosine similarity between a query vector and a document vector.

    Parameters
    ----------
    query_embedding : array-like
        Vector representing the query.
    average_vector_doc : array-like
        Vector representing the document.

    Returns
    -------
    list
        A one-element list ``[similarity]``. The similarity is ``0.0`` when
        either vector has zero or non-finite norm, because cosine similarity
        is undefined there.
    """
    query_vec = np.asarray(query_embedding, dtype=float)
    doc_vec = np.asarray(average_vector_doc, dtype=float)
    norm_product = np.linalg.norm(query_vec) * np.linalg.norm(doc_vec)
    # An all-zero vector (e.g. a query made only of out-of-vocabulary words)
    # or a NaN vector would produce NaN, which breaks sorting of the results.
    if not np.isfinite(norm_product) or norm_product == 0:
        return [0.0]
    return [float(1 - scipy.spatial.distance.cosine(query_vec, doc_vec))]

# Rank all the documents based on the similarity to get relevant docs
def Ranked_documents(query):
    """Rank the documents in ``out_dict`` by similarity to ``query``.

    The query is lower-cased and tokenized, its token embeddings are
    averaged, and every document vector in ``out_dict`` is scored against
    that average with ``get_sim``.

    Parameters
    ----------
    query : str
        Free-text query.

    Returns
    -------
    list of tuple
        ``(document, similarity)`` pairs sorted from most to least similar,
        where ``similarity`` is whatever ``get_sim`` returns. A query with
        no tokens yields every document with similarity ``[0.0]``.
    """
    query_tokens = nltk.word_tokenize(query.lower())
    if not query_tokens:
        # Averaging an empty array would give NaN and a meaningless order.
        print('Ranked Documents :')
        return [(doc, [0.0]) for doc in out_dict]

    query_embedding = np.mean(
        np.array([get_embedding(x) for x in query_tokens], dtype=float),
        axis=0,
    )
    rank = [
        (doc, get_sim(query_embedding, doc_vector))
        for doc, doc_vector in out_dict.items()
    ]
    rank.sort(key=lambda t: t[1], reverse=True)
    print('Ranked Documents :')
    return rank

## Step 1-5 Results and applications
Let’s see how the information retrieval system we built is working with a
couple of examples.


In [None]:
# Call the IR function with a query; the result is a list of
# (document, similarity) pairs ordered from most to least similar.
Ranked_documents("cricket")

In [None]:
# Second example query, ranked against the same set of documents.
Ranked_documents("driving")