# Automatic Question Answering

**Using Toolkit/Packages**
Numpy

Scikit-learn

gensim

nltk

Tensorflow=1.13

In [32]:
import pandas as pd;

#Load dataset and examine dataset, rename columns to questions and answers

df=pd.read_csv("/content/dataset250.csv");
df.columns=["questions","answers"];

df

Unnamed: 0,questions,answers
0,When was the battle of Buxar fought,"22 October, 1764"
1,When was the treaty of Surat signed,1775
2,Who devised the Doctrine of Lapse,Lord Dalhousie
3,Which was the main source of income for the co...,Agricultual taxation
4,Which treaty made the French got the Breton is...,Aix la chappelle
...,...,...
99,What disclosed that the people of Bengal were ...,The battle of Plassey
100,How british ensure the political result of the...,"By Mir Jafar, their favourite, the nawab of Be..."
101,Who vanquished Peshwa Baji Rao in a battle nea...,Jaswant Rao.
102,Who accepted the subsidiary coalition imposed ...,Baji rao.


# Preprocessing 

1. Removing all characters that are not alpha numeric
2. Removing stopwords - commonly used words such as 'a', 'to', 'in' and so on.. that do not contribute to the semantic similarity between two sentences.


In [33]:

import re
import gensim 
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
    
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    #sentence = re.sub(r'\s{2,}', ' ', sentence)
    
    if stopwords:
         sentence = remove_stopwords(sentence)
    
    #sent_stemmed='';
    #for word in sentence.split():
    #    sent_stemmed+=' '+st.stem(word) 
    #sentence=sent_stemmed
    
    return sentence

def get_cleaned_sentences(df,stopwords=False):    
    sents=df[["questions"]];
    cleaned_sentences=[]

    for index,row in df.iterrows():
        #print(index,row)
        cleaned=clean_sentence(row["questions"],stopwords);
        cleaned_sentences.append(cleaned);
    return cleaned_sentences;

cleaned_sentences=get_cleaned_sentences(df,stopwords=True)
print(cleaned_sentences);

print("\n")

cleaned_sentences_with_stopwords=get_cleaned_sentences(df,stopwords=False)
print(cleaned_sentences_with_stopwords);


['battle buxar fought', 'treaty surat signed', 'devised doctrine lapse', 'main source income company', 'treaty french got breton island louisberg 1748', 'treaty mangalore signed', 'perilous feature subsidiary coalition', 'appointed governor bengal battle buxar', 'tipu send ambassadors france turkey', 'states surmounted company doctrine lapse', 'zamindars required paid company', 'treaty surat signed', 'benefits british establishing doctrine lapse', 'acceptance principle discriminating aegis fiscal autonomy help country', 'british conquest india commence', 'battles fought nawabs bengal', 'richest fertile regions india', 'company liberty export import goods india paying taxes', 'sanctioned english trade bengal payment tax', 'caused conflict nawab siraj ud daula english', 'nawab siraj ud daula attack calcutta', 'nawab appoint administrator calcutta', 'english reconquer calcutta nawab', 'city hugli eradicated british', 'placed throne dethronement sirajuddaula', 'battle plassey fought', 'naw

## Bag of words Model

In [35]:
# Define function for Taking Question from Users
def askQuestion():
    question_orig==input("Enter Your Question: ") 
    return question_orig


In [37]:
import numpy

sentences=cleaned_sentences_with_stopwords
#sentences=cleaned_sentences

# Split it by white space 
sentence_words = [[word for word in document.split() ]
         for document in sentences]

from gensim import corpora

dictionary = corpora.Dictionary(sentence_words)
for key, value in dictionary.items():
    print(key, ' : ', value)

import pprint
bow_corpus = [dictionary.doc2bow(text) for text in sentence_words]
for sent,embedding in zip(sentences,bow_corpus):
    print(sent)
    print(embedding)


question_orig="When was the battle of Buxar fought"
question=clean_sentence(question_orig,stopwords=False);
question_embedding = dictionary.doc2bow(question.split())


print("\n\n",question,"\n",question_embedding)

0  :  battle
1  :  buxar
2  :  fought
3  :  of
4  :  the
5  :  was
6  :  when
7  :  signed
8  :  surat
9  :  treaty
10  :  devised
11  :  doctrine
12  :  lapse
13  :  who
14  :  company
15  :  for
16  :  income
17  :  main
18  :  source
19  :  which
20  :  1748
21  :  and
22  :  breton
23  :  french
24  :  got
25  :  in
26  :  island
27  :  louisberg
28  :  made
29  :  between
30  :  mangalore
31  :  whom
32  :  coalition
33  :  feature
34  :  most
35  :  perilous
36  :  subsidiary
37  :  after
38  :  appointed
39  :  bengal
40  :  governor
41  :  ambassadors
42  :  did
43  :  france
44  :  his
45  :  send
46  :  tipu
47  :  to
48  :  turkey
49  :  why
50  :  by
51  :  name
52  :  some
53  :  states
54  :  surmounted
55  :  with
56  :  amount
57  :  be
58  :  paid
59  :  required
60  :  what
61  :  zamindars
62  :  where
63  :  benefits
64  :  british
65  :  establishing
66  :  were
67  :  acceptance
68  :  aegis
69  :  autonomy
70  :  country
71  :  discriminating
72  :  fiscal
73  : 

By using BOW, we found the vector representation for every Sentence. So we can compute the distance between two vectors by taking the cosine similarity. The closest matching answer can be retrieved by finding the cosine similarity of the query vector with each of the question vectors. 

#Using SKlearn

In [38]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity;
def retrieveAndPrintFAQAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim=-1;
    index_sim=-1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        #sim=cosine_similarity(embedding.reshape(1, -1),question_embedding.reshape(1, -1))[0][0];
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim;
            index_sim=index;
       
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
    print(FAQdf.iloc[index_sim,1])        
    
retrieveAndPrintFAQAnswer(question_embedding,bow_corpus,df,sentences);

0 1.0 when was the battle of buxar fought
1 0.31622776601683794 when was the treaty of surat signed
2 0.31622776601683794 who devised the doctrine of lapse
3 0.31622776601683794 which was the main source of income for the company
4 0.4472135954999579 which treaty made the french got the breton island and louisberg in 1748
5 0.31622776601683794 between whom was the treaty of mangalore signed between
6 0.31622776601683794 which was the most perilous feature of the subsidiary coalition
7 1.0 who was appointed the governor of bengal after the battle of buxar
8 0.047565149415449405 why did tipu send his ambassadors to france and turkey
9 0.31622776601683794 name some states surmounted by the company with the doctrine of lapse 
10 0.31622776601683794 what amount of zamindars was required to be paid to the company
11 0.31622776601683794 where was the treaty of surat signed
12 0.31622776601683794 what were the benefits for the british by establishing the doctrine of lapse
13 0.5547001962252291

In [39]:
def retrieveAndPrintFAQAnswer_bow(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim=-1;
    index_sim=-1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        #sim=cosine_similarity(embedding.reshape(1, -1),question_embedding.reshape(1, -1))[0][0];
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        #print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim;
            index_sim=index;
       
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
    print(FAQdf.iloc[index_sim,1])        
def answer1(question):
    question_embedding = dictionary.doc2bow(question.split())
    retrieveAndPrintFAQAnswer_bow(question_embedding,bow_corpus,df,sentences); 

In [40]:
flag=1
while(flag):
    question=input("Enter Your Question: ")  
    if question=="END":
        flag=0
    if flag:
        if question[-1]!="?":
            question+="?"
        print(answer1(question))

Enter Your Question: When was the battle of Buxar fought


Question:  When was the battle of Buxar fought?


Retrieved:  When was the battle of Buxar fought
22 October, 1764
None
Enter Your Question: When did the company start import and export without paying taxes?


Question:  When did the company start import and export without paying taxes?


Retrieved:  When did the company get liberty to export and import goods in India without paying taxes
1717
None
Enter Your Question: What was the impact of the first world war?


Question:  What was the impact of the first world war?


Retrieved:  When was the treaty of Surat signed 
1775
None
Enter Your Question: What caused de-industrialization In India?


Question:  What caused de-industrialization In India?


Retrieved:  What had done directly or indirectly through inter-me diaries
Zamindars and Revenue farmers
None
Enter Your Question: What had done directly or indirectly through inter-me diaries


Question:  What had done directly or ind

# Word2Vec Embeddings 

Word2Vec ebeddings are popularly trained using the skipgram model. These embeddings are trained to take a word as input and reconstruct its context. As a result, they are able to take into account semantic similarity of words based  on context information. The resulting embeddings are such that words with similar meaning tend to be closer in terms of cosine similarity.


**Skipgram model** : 

The most popular word2vec model is the skipgram model. Particularly, the most commonly used pre-trained model is based on the Google News dataset that has 3 billion running words and creates upto 300 dimensional embedding for 3 Million words


# Glove Embeddings

Glove is an alternate approach to build word embeddings using matrix factorization techinques on the word-word co-occurance matrix. 

While both the techniques are popular, glove performs better on some datasets while word2vec skipgram model performs better on some. Here, we experiment with both the word2vec and the glove models. 


In [42]:
from gensim.models import Word2Vec 
import gensim.downloader as api


glove_model=None;
try:
    glove_model = gensim.models.KeyedVectors.load("./glovemodel.mod")
    print("Loaded glove model")
except:            
    glove_model = api.load('glove-twitter-25')
    glove_model.save("./glovemodel.mod")
    print("Saved glove model")
    
v2w_model=None;
try:
    v2w_model = gensim.models.KeyedVectors.load("./w2vecmodel.mod")
    print("Loaded w2v model")
except:            
    v2w_model = api.load('word2vec-google-news-300')
    v2w_model.save("./w2vecmodel.mod")
    print("Saved glove model")

w2vec_embedding_size=len(v2w_model['computer']);
glove_embedding_size=len(glove_model['computer']);


Loaded glove model
Loaded w2v model


**Finding Phrase Embeddings from Word Embeddings** 

The most simple technique to convert word embeddings to phrase embeddings, that is applicable with word2vec and glove embeddings, is to sum up the individual word embeddings in the phrase to get a phrase vector. 


In [43]:
def getWordVec(word,model):
        samp=model['computer'];
        vec=[0]*len(samp);
        try:
                vec=model[word];
        except:
                vec=[0]*len(samp);
        return (vec)


def getPhraseEmbedding(phrase,embeddingmodel):
                       
        samp=getWordVec('computer', embeddingmodel);
        vec=numpy.array([0]*len(samp));
        den=0;
        for word in phrase.split():
            #print(word)
            den=den+1;
            vec=vec+numpy.array(getWordVec(word,embeddingmodel));
        #vec=vec/den;
        #return (vec.tolist());
        return vec.reshape(1, -1)


In [44]:
#With w2Vec
question = "Which treaty made the French got the Breton island and Louisberg in 1748"
sent_embeddings=[];
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,v2w_model));

question_embedding=getPhraseEmbedding(question,v2w_model);

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df, cleaned_sentences);

0 0.20214798385864208 battle buxar fought
1 0.4195254505074264 treaty surat signed
2 0.27536268269129016 devised doctrine lapse
3 0.22753056356968718 main source income company
4 0.7257142760251133 treaty french got breton island louisberg 1748
5 0.47194726116408403 treaty mangalore signed
6 0.1984720202476671 perilous feature subsidiary coalition
7 0.20144731202547012 appointed governor bengal battle buxar
8 0.3615471294693222 tipu send ambassadors france turkey
9 0.31535586460552023 states surmounted company doctrine lapse
10 0.23426110171117345 zamindars required paid company
11 0.4195254505074264 treaty surat signed
12 0.366975899594651 benefits british establishing doctrine lapse
13 0.34184384521897687 acceptance principle discriminating aegis fiscal autonomy help country
14 0.2845191158132607 british conquest india commence
15 0.20248217047864953 battles fought nawabs bengal
16 0.2587694393431386 richest fertile regions india
17 0.30813164441167973 company liberty export import g

In [46]:
def retrieveAndPrintFAQAnswer_w2v(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim=-1;
    index_sim=-1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        #sim=cosine_similarity(embedding.reshape(1, -1),question_embedding.reshape(1, -1))[0][0];
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        #print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim;
            index_sim=index;
       
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
    print(FAQdf.iloc[index_sim,1])  

def answer(question):
    question_embedding=getPhraseEmbedding(question,v2w_model);
    retrieveAndPrintFAQAnswer_w2v(question_embedding,sent_embeddings,df, cleaned_sentences);

flag=1
while(flag):
    question=input("Enter Your Question: ")  
    if question=="END":
        flag=0
    if flag:
        if question[-1]!="?":
            question+="?"
        print(answer(question))    

Enter Your Question: Which treaty made the French got the Breton island and Louisberg ?


Question:  Which treaty made the French got the Breton island and Louisberg ?


Retrieved:  Which treaty made the French got the Breton island and Louisberg in 1748
Aix la chappelle
None
Enter Your Question: When did the company start import and export without paying taxes?


Question:  When did the company start import and export without paying taxes?


Retrieved:  When did the company get liberty to export and import goods in India without paying taxes
1717
None
Enter Your Question: What was the impact of the first world war?


Question:  What was the impact of the first world war?


Retrieved:  What is the commercial impact of first world war in India
The First World War engendered far-reaching vicissitudes in the world’s economy and circumstances coerced Britain to transmute her industrial and commercial policies in India.
None
Enter Your Question: What caused de-industrialization In India?




In [47]:
#With Glove

sent_embeddings=[];
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,glove_model));

question = "When was the Second Anglo-Maratha War fought ?"
question_embedding=getPhraseEmbedding(question,glove_model);

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df, cleaned_sentences);


0 0.8301433436537901 battle buxar fought
1 0.5582168444250387 treaty surat signed
2 0.2902345457706044 devised doctrine lapse
3 0.7015677681591809 main source income company
4 0.8434873180185724 treaty french got breton island louisberg 1748
5 0.5221629407445977 treaty mangalore signed
6 0.4843006461530167 perilous feature subsidiary coalition
7 0.687909597909087 appointed governor bengal battle buxar
8 0.7490336877334041 tipu send ambassadors france turkey
9 0.6570059151267291 states surmounted company doctrine lapse
10 0.7411516957507305 zamindars required paid company
11 0.5582168444250387 treaty surat signed
12 0.5802286098380276 benefits british establishing doctrine lapse
13 0.6533253633364936 acceptance principle discriminating aegis fiscal autonomy help country
14 0.7329515608542523 british conquest india commence
15 0.6983647426646353 battles fought nawabs bengal
16 0.5464621993497707 richest fertile regions india
17 0.6944154725547386 company liberty export import goods india

In [48]:
sent_embeddings=[];
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,glove_model));
def retrieveAndPrintFAQAnswer_glove(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim=-1;
    index_sim=-1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        #sim=cosine_similarity(embedding.reshape(1, -1),question_embedding.reshape(1, -1))[0][0];
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        #print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim;
            index_sim=index;
       
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
    print(FAQdf.iloc[index_sim,1])  

def answer25(question):
    question_embedding=getPhraseEmbedding(question,glove_model);
    retrieveAndPrintFAQAnswer_glove(question_embedding,sent_embeddings,df, cleaned_sentences);

flag=1
while(flag):
    question=input("Enter Your Question: ")  
    if question=="END":
        flag=0
    if flag:
        if question[-1]!="?":
            question+="?"
        print(answer25(question))    

Enter Your Question:  Which treaty made the French got the Breton island and Louisberg ?


Question:   Which treaty made the French got the Breton island and Louisberg ?


Retrieved:  What is the commercial impact of first world war in India
The First World War engendered far-reaching vicissitudes in the world’s economy and circumstances coerced Britain to transmute her industrial and commercial policies in India.
None
Enter Your Question: When did the company start import and export without paying taxes?


Question:  When did the company start import and export without paying taxes?


Retrieved:  What period was the greatest prosperity in cotton industry in western india 
the war years (1914-18)
None
Enter Your Question: What was the impact of the first world war?


Question:  What was the impact of the first world war?


Retrieved:  What is the commercial impact of first world war in India
The First World War engendered far-reaching vicissitudes in the world’s economy and circumstanc