In [1]:
#torch==1.7.1/transformers==3.5.1
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [2]:
import json
import os
import pandas as pd
import codecs

from gluonnlp.data import SentencepieceTokenizer

In [3]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'], verbose = 1):
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
train_raw = squad_json_to_dataframe_train("KorQuAD_v1.0_train.json")
test_raw = squad_json_to_dataframe_train("KorQuAD_v1.0_dev.json")

Reading the json file
processing...
  js = pd.io.json.json_normalize(file , record_path )
  m = pd.io.json.json_normalize(file, record_path[:-1] )
  r = pd.io.json.json_normalize(file,record_path[:-2])
  main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
shape of the dataframe is (60407, 6)
Done
Reading the json file
processing...
shape of the dataframe is (5774, 6)
Done


In [5]:
bertmodel, vocab = get_pytorch_kobert_model()

#token_idx to token_word
#print(vocab.idx_to_token[3460])

using cached model
using cached model


In [6]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [29]:
class BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, max_len,
                 pad=True, pair=True):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.max = 0
        questions = list(dataset['question'])
        contexts = list(dataset['context'])

        labels_temp = list(dataset['answer_start'])
        # self.sentences = []
        # for q, c in zip(questions, contexts):
        #     len_ = len(bert_tokenizer(q)) + len(bert_tokenizer(c))
        #     if self.max < len_:
        #         self.max = len_
        #         print(self.max)
        #     self.sentences.append(transform([q, c]))
        self.sentences = [transform([q, c]) for q, c in zip(questions, contexts)]
        self.labels = [np.int32(label) for label in labels_temp]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [23]:
MAX_LEN = 512
PAD = True
PAIR = True

In [30]:
data_train = BERTDataset(train_raw,tok,MAX_LEN,PAD,PAIR)
data_test = BERTDataset(test_raw,tok,MAX_LEN,PAD,PAIR)

408
458
502
640
701
707
1002
1009
1150
1598
1604
1618
6712
6717
6718
6721
291
292
295
296
421
588
596
601
611
698
1272
1274
1278
1444
1449
1451
1453
1715
1723
1725
6721
1725


In [None]:
class BERT_QA_model(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)