In [1]:
# importing libraries required
import numpy as np
import pandas as pd
import matplotlib.pyplot as py
import json

### Converting SQuAD 1.1 train, dev sets from json to dataframe

In [2]:
training = pd.read_json('train-v1.1.json')
cross_valid = pd.read_json('dev-v1.1.json')

In [3]:
# printing shape of training and cross validation sets
training.shape

(442, 2)

In [4]:
cross_valid.shape

(48, 2)

In [5]:
# printing first 5 entries of train set
training.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [6]:
# printing one entry to see all data and keys present in one train example
training.iloc[1,0]['paragraphs'][0]

{'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'qas': [{'answers': [{'answer_start': 269, 'text': 'in the late 1990s'}],
   'question': 'When did Beyonce start becoming popular?',
   'id': '56be85543aeaaa14008c9063'},
  {'answers': [{'answer_start': 207, 'text': 'singing and dancing'}],
   'question': 'What areas did Beyonce compet

In [7]:
cross_valid.shape

(48, 2)

In [8]:
cross_valid.iloc[1,0]['paragraphs'][0]

{'context': 'One of the most famous people born in Warsaw was Maria Skłodowska-Curie, who achieved international recognition for her research on radioactivity and was the first female recipient of the Nobel Prize. Famous musicians include Władysław Szpilman and Frédéric Chopin. Though Chopin was born in the village of Żelazowa Wola, about 60 km (37 mi) from Warsaw, he moved to the city with his family when he was seven months old. Casimir Pulaski, a Polish general and hero of the American Revolutionary War, was born here in 1745.',
 'qas': [{'answers': [{'answer_start': 188, 'text': 'Nobel Prize'},
    {'answer_start': 188, 'text': 'Nobel Prize'},
    {'answer_start': 188, 'text': 'Nobel Prize'}],
   'question': 'What was Maria Curie the first female recipient of?',
   'id': '5733a5f54776f41900660f45'},
  {'answers': [{'answer_start': 517, 'text': '1745'},
    {'answer_start': 517, 'text': '1745'},
    {'answer_start': 517, 'text': '1745'}],
   'question': 'What year was Casimir Pulask

In [9]:
# to convert json to dataframe
def json_to_df(X):
    contexts = []
    questions = []
    answers_text = []
    answers_start_idx = []
    for i in range(X.shape[0]):
        topic = X.iloc[i,0]['paragraphs']
        for para in topic:
            for q_a in para['qas']:
                questions.append(q_a['question'])
                answers_start_idx.append(q_a['answers'][0]['answer_start'])
                answers_text.append(q_a['answers'][0]['text'])
                contexts.append(para['context'])
    df = pd.DataFrame({"Context": contexts, "Question": questions, "Answer Start Index": answers_start_idx, "Answer Text": answers_text})
    return df


In [10]:
train = json_to_df(training)

In [11]:
train.shape

(87599, 4)

In [12]:
# first 5 entries of train dataframe
train.head()

Unnamed: 0,Context,Question,Answer Start Index,Answer Text
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary


In [13]:
# convert dataframe to csv file and save
train.to_csv("train.csv", index = None)

### Sentence Embeddings

In [14]:
# getting all paragraphs
paragraphs = list(train["Context"].drop_duplicates().reset_index(drop = True))

In [15]:
# total no. of paragraphs
len(paragraphs)

18891

In [16]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [17]:
# importing textblob - library to process text
import textblob
from textblob import TextBlob

In [18]:
# natural language tool kit
import nltk
# punkt - sent tokenizer - divides text into list of sentences
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Joining all paras (in context) - using TextBlob package
set_of_all_paras = TextBlob(" ".join(paragraphs))
sentences = [item.raw for item in set_of_all_paras.sentences]

In [20]:
len(sentences)

92659

In [21]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [22]:
# loading facebook sentence embeddings model trained with glove vectors
from models import InferSent
import torch
MODEL_PATH = 'infersent1.pkl' 

params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [23]:
# setting path of model to glove vectors downloaded
W2V_PATH = "glove.840B.300d.txt"
infersent.set_w2v_path(W2V_PATH)

In [24]:
# building vocab from our list of sentences that is combination of all paras
infersent.build_vocab(sentences, tokenize=True)

Found 89010(/109674) words with w2v vectors
Vocab size : 89010


In [25]:
# encoding each sentence using above sentence embedding model
dict_embeddings = {}
for i in range(len(sentences)):
    # print(i)
    dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)

KeyboardInterrupt: 

In [26]:
# printing the corresponding sentence embedding of each sentence
dict_embeddings

{'Architecturally, the school has a Catholic character.': array([[ 0.05519996,  0.05013141,  0.04787038, ...,  0.0082121 ,
         -0.03642813,  0.04468501]], dtype=float32),
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.": array([[ 0.07475326,  0.11794455,  0.06240866, ...,  0.01915886,
         -0.02436748,  0.10806958]], dtype=float32),
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".': array([[0.11262652, 0.11146843, 0.14750299, ..., 0.00293286, 0.03322019,
         0.06657628]], dtype=float32),
 'Next to the Main Building is the Basilica of the Sacred Heart.': array([[ 0.08010551,  0.11775322,  0.02186233, ...,  0.01656765,
         -0.01024128,  0.04706631]], dtype=float32),
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.': array([[ 0.04149358,  0.0703306 ,  0.03724371, ...,  0.01096807,
         -0.02892282,  0.04

In [27]:
len(dict_embeddings)

45802

In [28]:
# making a list of the questions column from train set
questions = list(train["Question"])

In [29]:
len(questions)

87599

In [30]:
# creating sentence embedding for each question
for i in range(len(questions)):
    # print(i)
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)

KeyboardInterrupt: 

In [31]:
dict_embeddings['Architecturally, the school has a Catholic character.'][0]

array([ 0.05519996,  0.05013141,  0.04787038, ...,  0.0082121 ,
       -0.03642813,  0.04468501], dtype=float32)

In [32]:
d1 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 0}
d2 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 1}

In [33]:
d1

{'Architecturally, the school has a Catholic character.': array([[ 0.05519996,  0.05013141,  0.04787038, ...,  0.0082121 ,
         -0.03642813,  0.04468501]], dtype=float32),
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".': array([[0.11262652, 0.11146843, 0.14750299, ..., 0.00293286, 0.03322019,
         0.06657628]], dtype=float32),
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.': array([[ 0.04149358,  0.0703306 ,  0.03724371, ...,  0.01096807,
         -0.02892282,  0.04280659]], dtype=float32),
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.': array([[ 0.04795521,  0.16508995,  0.09383532, ...,  0.05321151,
         -0.01826631,  0.10806958]], dtype=float32),
 'The nine student-run outlets include three newspapers, both a radio and television sta

In [34]:
d2

{"Atop the Main Building's gold dome is a golden statue of the Virgin Mary.": array([[ 0.07475326,  0.11794455,  0.06240866, ...,  0.01915886,
         -0.02436748,  0.10806958]], dtype=float32),
 'Next to the Main Building is the Basilica of the Sacred Heart.': array([[ 0.08010551,  0.11775322,  0.02186233, ...,  0.01656765,
         -0.01024128,  0.04706631]], dtype=float32),
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.': array([[ 0.1077682 ,  0.08058012,  0.10461737, ...,  0.01522135,
         -0.03814263,  0.14945611]], dtype=float32),
 "As at most other universities, Notre Dame's students run a number of news media outlets.": array([[0.09720326, 0.09345727, 0.05466025, ..., 0.0844364 , 0.00817086,
         0.02197513]], dtype=float32),
 'Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in t

In [35]:
conda install -c conda-forge pickle5

^C

Note: you may need to restart the kernel to use updated packages.


In [39]:
# dump embeddings to pkl files
import pickle
with open('dict_embeddings1.pickle', 'wb') as handle:
    pickle.dump(d1, handle)

In [40]:
with open('dict_embeddings2.pickle', 'wb') as handle:
    pickle.dump(d2, handle)


Building graph of deps:   0%|          | 0/4 [00:00<?, ?it/s]
Examining @/win-64::__win==0=0:   0%|          | 0/4 [00:00<?, ?it/s]
Examining python=3.8:  25%|##5       | 1/4 [00:00<?, ?it/s]          
Examining pickle5:  50%|#####     | 2/4 [00:00<00:00,  3.03it/s]
Examining pickle5:  75%|#######5  | 3/4 [00:00<00:00,  4.54it/s]
Examining @/win-64::__archspec==1=x86_64:  75%|#######5  | 3/4 [00:00<00:00,  4.54it/s]
                                                                                       

Determining conflicts:   0%|          | 0/4 [00:00<?, ?it/s]
Examining conflict for python pickle5:   0%|          | 0/4 [00:00<?, ?it/s]
                                                                            

UnsatisfiableError: The following specifications were found
to be incompatible with the existing python installation in your environment:

Specifications:

  - pickle5 -> python[version='3.6.*|3.7.*']
  - pickle5 -> python[version='>=3.6,<3.7.0a0|>=3.7,<3.8.0a0']

Your pyth

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Solving environment: ...working... 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
failed


In [38]:
del dict_embeddings