## Doc2Vec approach
Please run all the below cells 

In [122]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk import sent_tokenize
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [133]:
train=pd.read_json("train-v1.1.json")

In [134]:
with open("train-v1.1.json",mode="rt",encoding="utf-8") as file:
    qa_data = json.load(file)

In [135]:
def get_SQuAD_data(qa_data):
    data = list()
    
    for instance in qa_data['data']:
        
        for paragraph in instance['paragraphs']:
            #print ("Paragraph: ",paragraph)
            context = paragraph['context']
            context_wid_list = [w.lower() for w in nltk.word_tokenize(context) ]
            
            qas = paragraph['qas']
            for qas_instance in qas:
                question = qas_instance['question']

                question_wid_list = [w.lower() for w in nltk.word_tokenize(question) ]
                
                answers = qas_instance['answers']

                for answer in answers:
                    ans = answer['text']
                    answer_wid_list = [w.lower() for w in nltk.word_tokenize(ans)]
                 
                  
                    data.append((context, question, ans))

         


    return data

In [136]:
temp=get_SQuAD_data(qa_data)
len(temp)

87599

In [138]:
contexts=[]
questions = []
answers_text = []
for i in range(len(temp)):
    contexts.append(temp[i][0])
    questions.append(temp[i][1])
    answers_text.append(temp[i][2])
    

In [139]:
print(len(contexts),len(answers_text),len(questions))

87599 87599 87599


In [140]:
df = pd.DataFrame({"context":contexts, "question": questions,"answer_text": answers_text})

In [142]:
df.head()

Unnamed: 0,context,question,answer_text
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary


In [143]:
df.to_csv("Squad.csv",index=False)

In [144]:
paras=list(dict.fromkeys(contexts))
len(paras)

18891

In [149]:
df['sentences'] = df['context'].apply(lambda x: [i for i in (sent_tokenize(x))])

In [150]:
df.head()

Unnamed: 0,context,question,answer_text,sentences
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"[Architecturally, the school has a Catholic ch..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"[Architecturally, the school has a Catholic ch..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch..."


In [151]:
sent=list(df["sentences"])

In [152]:
test=" ".join(paras)
sentences=[i for i in (sent_tokenize(test))]

In [153]:
dataset=sentences+questions
model = Doc2Vec(vector_size=150, min_count=2, epochs=10)
data = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset)]

In [154]:
len(dataset)

180258

In [155]:
model.build_vocab(data)
model.train(data, total_examples=model.corpus_count, epochs=model.epochs)
model.save("Doc2Vec_SQuAD.model")

CPU times: user 4min 32s, sys: 38.6 s, total: 5min 11s
Wall time: 3min 15s


In [156]:
sent_vectors=[]
for i in range(len(sentences)):
    sent_vectors.append(model.docvecs[i])

In [157]:
len(sent_vectors)

92659

In [158]:
ques_vectors=[]
for i in range(len(questions)):
    ques_vectors.append(model.docvecs[dataset.index(questions[i])].reshape(1,-1))

In [159]:
len(ques_vectors)

87599

In [160]:
dict_embeddings_sentences={}
for i in range(len(sentences)):
    #print(i)
    dict_embeddings_sentences[sentences[i]] = sent_vectors[i]

CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 56.8 ms


In [161]:
len(sentences)

92659

In [162]:
len(dict_embeddings_sentences)

92507

In [163]:
import pickle

In [164]:
dict_embeddings_questions={}
for i in range(len(ques_vectors)):
    dict_embeddings_questions[questions[i]] = ques_vectors[i]

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 40.3 ms


In [165]:
with open('doc2vec_sentences.pickle', 'wb') as handle:
    pickle.dump(dict_embeddings_sentences, handle)

In [166]:
with open('doc2vec_questions.pickle', 'wb') as handle:
    pickle.dump(dict_embeddings_questions, handle)

In [167]:
data=pd.read_csv("Squad1.csv")
df=pd.DataFrame(data[:75000])

In [168]:
import pickle
with open("doc2vec_sentences.pickle", "rb") as f:
    dict_emb_sent = pickle.load(f)
with open("doc2vec_questions.pickle", "rb") as f:
    dict_emb_ques = pickle.load(f)

In [169]:
df['sentences'] = df['context'].apply(lambda x: [i for i in (sent_tokenize(x))])

In [170]:
c=0
df['sent_emb'] = df['sentences'].apply(lambda x: [dict_emb_sent[item] if item in dict_emb_sent else np.zeros(150) for item in x])
df['ques_emb'] = df['question'].apply(lambda x: [dict_emb_ques[x] if x in dict_emb_ques else np.zeros(150)])

In [171]:
df["target"] = df.apply(get_target, axis = 1)
df.head()

Unnamed: 0,context,question,answer_text,sentences,sent_emb,ques_emb,target
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...","[[-0.012852693, -0.07284439, 0.033906225, 0.02...","[[[-0.045467515, -0.05104817, -0.015394564, -0...",5
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...","[[-0.012852693, -0.07284439, 0.033906225, 0.02...","[[[-0.13487099, -0.028842488, 0.012200098, 0.0...",2
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"[Architecturally, the school has a Catholic ch...","[[-0.012852693, -0.07284439, 0.033906225, 0.02...","[[[-0.012634624, 0.024878401, -0.010640636, 0....",3
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...","[[-0.012852693, -0.07284439, 0.033906225, 0.02...","[[[-0.052588664, -0.03730036, 0.04876959, 0.02...",4
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...","[[-0.012852693, -0.07284439, 0.033906225, 0.02...","[[[-0.08492059, -0.021992393, 0.006915241, 0.0...",1


In [172]:
def cosine(x):
    li=[]
    for i in x["sent_emb"]:
        s=i.reshape(1,-1)
        q=np.array(x["ques_emb"]).reshape(1,-1)
        li.append(float(cosine_similarity(s,q)))
    return li

In [173]:
df["cosine_sim"]=df.apply(cosine,axis=1)

In [174]:
def cosine_index(d):
    return np.argmax(d)
def euc_index(d):
    return np.argmin(d)

In [175]:
df["cosine_index"] = df["cosine_sim"].apply(lambda x: cosine_index(x))

In [176]:
def accuracy(target, predicted):
    acc = (target==predicted).sum()/len(target)
    return acc

In [177]:
print(accuracy(df["target"], df["cosine_index"]))

0.3119733333333333
