In [1]:
!nvidia-smi

Thu Oct  8 05:45:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    12W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install tqdm --upgrade >> /dev/null 2>&1
!pip install torchtext --upgrade >> /dev/null 2>&1
!pip install spacy --upgrade >> /dev/null 2>&1
!python -m spacy download en >> /dev/null 2>&1

In [3]:
import json
import tqdm
import spacy
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import display, HTML

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Dataset, Example, Field
from torchtext.data.iterator import BucketIterator
from torchtext.data.metrics import bleu_score

In [4]:
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

SEED = 546
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {DEVICE}')

Device: cuda


In [5]:
!mkdir -p ./data
!wget https://raw.githubusercontent.com/kushalj001/pytorch-question-answering/master/data/squad_train.json -O ./data/squad_train.json
!wget https://raw.githubusercontent.com/kushalj001/pytorch-question-answering/master/data/squad_dev.json -O ./data/squad_dev.json

--2020-10-08 05:45:19--  https://raw.githubusercontent.com/kushalj001/pytorch-question-answering/master/data/squad_train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [text/plain]
Saving to: ‘./data/squad_train.json’


2020-10-08 05:45:20 (55.5 MB/s) - ‘./data/squad_train.json’ saved [30288272/30288272]

--2020-10-08 05:45:20--  https://raw.githubusercontent.com/kushalj001/pytorch-question-answering/master/data/squad_dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [text/plain]
Saving to: ‘./data

In [6]:
with open('./data/squad_train.json', 'r', encoding='utf-8') as file:
    train = json.load(file)['data']

with open('./data/squad_dev.json', 'r', encoding='utf-8') as file:
    valid = json.load(file)['data']

len(train), len(valid)

(442, 48)

In [7]:
def parse(data):
    result = []
    for paragraphs in data:
        for paragraph in paragraphs['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                id = qas['id']
                question = qas['question']
                for answers in qas['answers']:
                    answer_start = answers['answer_start']
                    answer = answers['text']
                    result.append({
                        'id': id,
                        'context': context,
                        'question': question,
                        'answer_start': answer_start,
                        'answer': answer
                    })
    return result

In [8]:
train = parse(train)
valid = parse(valid)
len(train), len(valid)

(87599, 34726)

In [9]:
print(train[0]['question'])
print(train[0]['answer'])
train[0]['context']

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Saint Bernadette Soubirous


'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [10]:
question_len, answer_len, context_len = zip(*[*map(lambda x: (len(x['question']), len(x['answer']), len(x['context'])), train)])

In [22]:
TEXT = Field(init_token='<sos>', eos_token='<eos>', lower=True, tokenize='spacy', tokenizer_language='en', include_lengths=True)


train_examples = [Example.fromdict(
    data=pair,
    fields={
        'context': ('context', TEXT),
        'question': ('question', TEXT),
        'answer': ('answer', TEXT)
    }
) for pair in tqdm.tqdm(train)]

valid_examples = [Example.fromdict(
    data=pair,
    fields={
        'context': ('context', TEXT),
        'question': ('question', TEXT),
        'answer': ('answer', TEXT)
    }
) for pair in tqdm.tqdm(valid)]

print(f'Train examples lengths: {len(train_examples):,}')
print(f'Valid examples lengths: {len(valid_examples):,}')

100%|██████████| 87599/87599 [02:03<00:00, 707.30it/s]
100%|██████████| 34726/34726 [00:49<00:00, 694.72it/s]

Train examples lengths: 87,599
Valid examples lengths: 34,726





In [20]:
# Filter
# examples = [*filter(lambda example: len(example.context) < 400 and len(example.question) < 50 and len(example.answer) < 50, examples)]
# len([*filter(lambda example: len(example.context) < 400 and len(example.question) < 50 and len(example.answer) < 50, examples)])

87369

In [29]:
train_data = Dataset(train_examples, fields={'context': EN, 'question': EN, 'answer': EN})
valid_data = Dataset(valid_examples, fields={'context': EN, 'question': EN, 'answer': EN})
print(f'train size: {len(train_data.examples):,}')
print(f'valid size: {len(valid_data.examples):,}')

print(vars(train_data.examples[0]))

train size: 87,599
valid size: 34,726
{'context': ['architecturally', ',', 'the', 'school', 'has', 'a', 'catholic', 'character', '.', 'atop', 'the', 'main', 'building', "'s", 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'virgin', 'mary', '.', 'immediately', 'in', 'front', 'of', 'the', 'main', 'building', 'and', 'facing', 'it', ',', 'is', 'a', 'copper', 'statue', 'of', 'christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', '"', 'venite', 'ad', 'me', 'omnes', '"', '.', 'next', 'to', 'the', 'main', 'building', 'is', 'the', 'basilica', 'of', 'the', 'sacred', 'heart', '.', 'immediately', 'behind', 'the', 'basilica', 'is', 'the', 'grotto', ',', 'a', 'marian', 'place', 'of', 'prayer', 'and', 'reflection', '.', 'it', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'lourdes', ',', 'france', 'where', 'the', 'virgin', 'mary', 'reputedly', 'appeared', 'to', 'saint', 'bernadette', 'soubirous', 'in', '1858', '.', 'at', 'the', 'end', 'of', 'the', 'main', 'drive', '(', 'and',

In [30]:
EN.build_vocab(train_data, min_freq=2, specials=['<sos>', '<eos>', '<pad>'])
print(f'Length of EN vocabulary: {len(EN.vocab):,}')

Length of EN vocabulary: 87,843


In [35]:
train[0]['answer_start'], train[0]['context'][515:520]

(515, 'Saint')

In [31]:
Example.fromdict(
    data=train[0],
    fields={
        'context': ('context', EN),
        'question': ('question', EN),
        'answer': ('answer', EN),
        'answer_start': ('answer_start', None)
    }
)

AttributeError: ignored