## Load datasets

In [None]:
"""BERT model for Question Answering (span extraction).
    This module is composed of the BERT model with a linear layer on top of
    the sequence output that computes start_logits and end_logits

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
            into account for computing the loss.
        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
            into account for computing the loss.

    Outputs:
        if `start_positions` and `end_positions` are not `None`:
            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
        if `start_positions` or `end_positions` is `None`:
            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
            position tokens of shape [batch_size, sequence_length].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForQuestionAnswering(config)
    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
    ```
"""

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import logging
import json
import math
import os
import random
import pickle
import numpy as np
import torch

from tqdm import tqdm, trange
from bert_utils.bert_utils import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

Configuration settings

In [2]:
config = {'language':'kor', 'model_config':123}

## logger settings
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

if 'kor' in config['language']:
    train_file = './datasets/squad_v1.1/train-v1.1.json'
    dev_file = './datasets/squad_v1.1/dev-v1.1.json'
elif 'eng' in config['language']:
    train_file = './datasets/korquad_v1/KorQuAD_v1.0_train.json'
    dev_file = './datasets/korquad_v1/KorQuAD_v1.0_dev.json'    

In [3]:
def read_json(file_name):
    print("reading {}".format(file_name))
    with open(file_name, "r", encoding='utf-8') as reader:
        # >> type(input_data) -> list
        input_data = json.load(reader)["data"]
    print("success to read {}".format(file_name))
    return input_data

In [4]:
input_data = read_json(train_file)

reading ./datasets/squad_v1.1/train-v1.1.json
success to read ./datasets/squad_v1.1/train-v1.1.json


explore the datasets

In [5]:
type(input_data), type(input_data[0]), input_data[0].keys(), type(input_data[0]['paragraphs'])

(list, dict, dict_keys(['title', 'paragraphs']), list)

In [6]:
input_data[1]['paragraphs'][1]

{'context': 'Following the disbandment of Destiny\'s Child in June 2005, she released her second solo album, B\'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", and "Beautiful Liar". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for "Single Ladies (Put a Ring on It)". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production an

## parse datasets

In [7]:
class SquadExample(object):
    """A single training/test example for the Squad dataset."""

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)
        s += "\n\n, question_text: %s" % (
            self.question_text)
        s += "\n\n, orig_answer_text: %s" % (
            self.orig_answer_text)
        s += "\n\n, doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += "\n\n, start_position: %d" % (self.start_position)
        if self.start_position:
            s += "\n\n, end_position: %d" % (self.end_position)
        return s

In [8]:
#실습 자료

In [10]:
def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

def parse_json_squad(input_data, is_train):
    """Read a SQuAD json file into a list of SquadExample."""
    examples = list()
    for data_entry in input_data:
        for paragraph in data_entry['paragraphs']:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            # Q1. doc_tokens에 whitespace(c)를 가지고 context를 토큰화하는 코드를 작성하세요.
            ###################################################################################################
            for char in paragraph_text:
                if is_whitespace(char):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append( #blank# )
                    else:
                        doc_tokens[-1] += #blank#
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)  # Which word is the character in?
            ###################################################################################################
    
            for qa in paragraph["qas"]:
                """
                {'answers': [{'answer_start', 'text'}], 'question', 'id'}
                """
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                if is_train:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    
                    # Q2. Line 34의 변수를 참고하여 Line 70: SquadExample의 instance를 만들기 위한 파라미터를 채우세요.  
                    ###################################################################################################
                    qas_id = qa["#blank#"]     # fill the black -> assign None
                    question_text = qa["#blank#"] # fill the black # index of word
                    answer = qa["#blank#"][0]
                    orig_answer_text = answer["#blank#"]
                    answer_offset = answer["#blank#"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset] # index of word
                    end_position = char_to_word_offset[answer_offset + answer_length - 1] # index of word 
                    ###################################################################################################

                    # CODE FOR Handling exceptions 
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(whitespace_tokenize(
                        orig_answer_text))  # segment words from the sentense including the white space
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,  # a set of tokens(words) in the
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    print("success to convert input data into a set of {} examples".format(len(examples)))
    return examples

SyntaxError: invalid syntax (<ipython-input-10-8260afdadd9f>, line 24)

In [None]:
train_examples = parse_json_squad(input_data, True)
## len of examples 87599

In [None]:
print(train_examples[0])

## Extract Feature

In [None]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None):
        
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position

In [11]:
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000

    features = []
    
    for (example_index, example) in enumerate(examples):
        
        # Q3. Pretrained model을 학습하는데 사용한 Tokenizer를 사용하여 question과 context의 token을 sub-token으로 토큰화해주세요.  
        ###################################################################################################
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = [] 
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens: 
                tok_to_orig_index.append(#blank#)                  
                all_doc_tokens.append(#blank#)
        ## fill black with the code for appending some values to the above lists
        ###################################################################################################
                            
        # Q4. sub-tokens에 맞추어 span을 업데이트 해주세요.
        ###################################################################################################
        # fill the code for updating the span of token (tok_start_position, tok_end_position)
        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)
            
        ####################################################################################################
        
        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = #blank# - len(query_tokens) - 3

        # doc_tokens가 설정한 max length를 넘는다면, 몇개의 DocSpan으로 쪼개야 합니다.
        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of the up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)
            
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = [] # input data
            segment_ids = [] # segment data
            token_to_orig_map = {}
            token_is_max_context = {}

            # Q5. query를 pretrained BERT의 입력값(features) 형식에 따라 바꿔주세요.
              # tokens -> [CLS] question [SEP] context [SEP]
              # segment_ids -> 00000000000000000000 1111111111111
            ###################################################################################################            
            
            for token in query_tokens:
                ## fill the blank ##        
            ###################################################################################################

                
            # Q6. context를 pretrained BERT의 입력값(features) 형식에 따라 바꿔주세요.
            ###################################################################################################
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
                is_max_context = check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1) # segment ids 12 means the context
            tokens.append("[SEP]")
            segment_ids.append(1)
            ###################################################################################################
            
            # convert into the index of emmeding matrix
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            
            # Q7. pretrained BERT의 입력값 크기에 맞게 zero-padding 해주세요.
            ###################################################################################################
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            while len(input_ids) < #blank#:
                input_ids.append(#blank#)
                input_mask.append(#blank#)
                segment_ids.append(0)
            ###################################################################################################
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            
            start_position = None
            end_position = None
            if is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (example.start_position < doc_start or
                        example.end_position < doc_start or
                        example.start_position > doc_end or example.end_position > doc_end):
                    continue # -> next to the DocSpan 

                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
                ]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info(
                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info(
                        "answer: %s" % (answer_text))
                    
            features.append(
                InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    start_position=start_position,
                    end_position=end_position))
            unique_id += 1

    return features

SyntaxError: invalid syntax (<ipython-input-11-16e652569d77>, line 32)

## Wrap the train_features in a `Dataloader`

In [None]:
config='bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(config)
max_seq_length=128
doc_stride=128
max_query_length=64
cached_train_features_file = train_file+'_{0}_{1}_{2}'.format(config, str(max_seq_length), str(doc_stride))

train_batch_size=15
predict_batch_size=15
num_train_epochs=2
gradient_accumulation_steps=1
warmup_proportion=0.1
learning_rate=5e-5
num_train_steps = int(num_train_epochs * len(train_examples) / train_batch_size) #  /gradient_accumulation_steps *num_train_epochs

In [None]:
try:
    with open(cached_train_features_file, "rb") as reader:
        train_features = pickle.load(reader)
except:
    train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True)
    print('finish extracting the features from the examples')
    logger.info("  Saving train features into cached file %s", cached_train_features_file)
    with open(cached_train_features_file, "wb") as writer:
        pickle.dump(train_features, writer)

In [None]:
logger.info("***** Running training *****")
logger.info("  Num orig examples = %d", len(train_examples))
logger.info("  Num split examples = %d", len(train_features))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)


# Q8. 데이터로더에 데이터를 로드하기 위해 tensor로 data type을 torch.Tensor로 바꿔주세요.
###################################################################################################
# fill the blank
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype= #blank#) ## must be long type
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype= #blank#)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype= #blank#)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=#blank# )
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype= #blank#)
###################################################################################################

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_start_positions, all_end_positions)                           
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

## Model Settings

In [None]:
model = BertForQuestionAnswering.from_pretrained(config)
local_rank = -1
gpu_num =2
device = torch.device(f"cuda:{gpu_num}")
t_total = num_train_steps
model.to(device)  # [2] TITAN Xp         | 43'C,   0 % |  1235 / 12196 MB | gyuhyeon(1223M)

In [None]:
t_total = num_train_steps
param_optimizer = list(model.named_parameters())

# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]  # remove the first class label('pooler')

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

t_total = num_train_steps

# find BertAdam in the https://huggingface.co/transformers/migration.html
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=learning_rate,
                     warmup=warmup_proportion,
                     t_total=t_total)

In [None]:
global_step = 0
model.train()
# Q9. Pretrained BERT를 fine tuning하는 training 코드를 채워주세요.
###################################################################################################
# [2] TITAN Xp         | 61'C,  65 % |  6757 / 12196 MB | gyuhyeon(6745M)
for _ in trange(int(num_train_epochs), desc="Epoch"):
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        # loss.backward()까지의 코드를 채워주세요.
        # blank:: batch to device#
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
        loss.backward()
        
        """
        Added Explaination for the Below Code
        f your data set is highly differentiated, you can suffer from a sort of "early over-fitting". 
        If your shuffled data happens to include a cluster of related, strongly-featured observations,
        your model's initial training can skew badly toward those features -- or worse,
        toward incidental features that aren't truly related to the topic at all.

        Warm-up is a way to reduce the primacy effect of the early training examples.
        Without it, you may need to run a few extra epochs to get the convergence desired,
        as the model un-trains those early superstitions.
        """
        lr_this_step = learning_rate * warmup_linear(global_step/t_total, warmup_proportion)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_this_step
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        
###################################################################################################

## Model Save

In [None]:
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
# output_model_file = os.path.join("/Model", "pytorch_model.bin")
torch.save(model_to_save.state_dict(), "./save/models/squad_finetuned_bert_128_epochs_2.bin")

os.listdir('./save/models')

"""
torch.save({"model_type": self.model_type,
            "start_epoch": epoch + 1,
            "network": self.net.state_dict(),
            "optimizer": self.optim.state_dict(),
            "best_metric": self.best_metric,
            }, str(save_path) + "/%s.pth.tar" % (filename))

## how to load            
self.net.load_state_dict(ckpoint['network'])
self.optim.load_state_dict(ckpoint['optimizer'])
self.start_epoch = ckpoint['start_epoch']
self.best_metric = ckpoint["best_metric"]
"""

## Model load

In [None]:
gpu_num =2
device = torch.device(f"cuda:{gpu_num}")

config = 'bert-base-multilingual-cased'
model = BertForQuestionAnswering.from_pretrained(config)
tokenizer = BertTokenizer.from_pretrained(config)

# 10. fine tuned BERT 모델 파라미터를 로드하는 코드를 채워주세요.
###################################################################################################
saving_point = torch.load("./save/models/squad_finetuned_bert_128_epochs_2.bin")
model.#blank#(saving_point)
model.to(device)
###################################################################################################

In [None]:
max_seq_length=128
doc_stride=128
max_query_length=64
predict_batch_size=50

# 11. dev data를 로드하고 전처리 해주세요
###################################################################################################
input_data = read_json(dev_file)
eval_examples = parse_json_squad(
    input_data=input_data, is_train=False)
eval_features = #blank#(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=False)
###################################################################################################

logger.info("***** Running predictions *****")
logger.info("  Num orig examples = %d", len(eval_examples))
logger.info("  Num split examples = %d", len(eval_features))
logger.info("  Batch size = %d",predict_batch_size)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

# 12. inference를 위한 데이터로더를 만들어주세요.
###################################################################################################
# Run prediction for full data
eval_sampler = SequentialSampler(#blank#)
eval_dataloader = DataLoader(#blank#, sampler=eval_sampler, batch_size=predict_batch_size)
###################################################################################################

In [None]:
model.eval()
all_results = []
logger.info("Start evaluating")
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
    if len(all_results) % 1000 == 0:
        logger.info("Processing example: %d" % (len(all_results)))
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    with torch.no_grad():
        batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
    for i, example_index in enumerate(example_indices):
        start_logits = batch_start_logits[i].detach().cpu().tolist()
        end_logits = batch_end_logits[i].detach().cpu().tolist()
        eval_feature = eval_features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        all_results.append(RawResult(unique_id=unique_id,
                                     start_logits=start_logits,
                                     end_logits=end_logits))

![nn](images/image_1.jpg)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

start_logit = all_results[6].start_logits
end_logit = all_results[6].end_logits
index = np.arange(len(start_logit))

# 13. Infer 모드의 model output을 plot해주세요.
###################################################################################################
# berore softmax


# after softmax

###################################################################################################


pred_start_index = np.argmax(start_logit)
pred_end_index = np.argmax(end_logit)
print(np.argmax(start_logit), np.argmax(end_logit))

In [None]:
pred_start_index = np.argmax(start_logit)
pred_end_index = np.argmax(end_logit)
orig_doc_start = eval_features[6].token_to_orig_map[pred_start_index]
orig_doc_end = eval_features[6].token_to_orig_map[pred_end_index]
orig_tokens = eval_examples[eval_features[6].example_index].doc_tokens[orig_doc_start:(orig_doc_end + 1)]

print(eval_examples[eval_features[6].example_index])
print("\n\nAnswer: {}".format(" ".join(orig_tokens)))

In [None]:
output_dir='./save/predictions'
n_best_size=20
max_answer_length=30
do_lower_case=True
verbose_logging=False

output_prediction_file = os.path.join(output_dir, "squad_test_predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_squad_test_predictions.json")
all_predictions = write_predictions(eval_examples, eval_features, all_results,
                  n_best_size, max_answer_length,
                  do_lower_case, output_prediction_file,
                  output_nbest_file, verbose_logging)

## Get scores

In [None]:
with open('./datasets/squad_v1.1/dev-v1.1.json') as f:
    dataset_json = json.load(f)
    dataset = dataset_json['data']
with open('./save/predictions/squad_test_predictions.json') as f:
    preds = json.load(f)

In [None]:
qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact, f1 = get_raw_scores(dataset, preds)

out_eval = make_eval_dict(exact, f1)
out_eval

# KorQuAD v1