### Import Library

In [1]:
import os
import requests
import json

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 14.2 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

### Import Dataset

In [3]:
if not os.path.exists('squad'):
    os.mkdir('squad')

In [4]:
url = 'https://raw.githubusercontent.com/Wikidepia/indonesian_datasets/master/question-answering/squad/data/tar/'

In [5]:
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    res = requests.get(f'{url}{file}')
    # write to file
    with open(f'squad/{file}', 'wb') as f:
        for chunk in res.iter_content(chunk_size=4):
            f.write(chunk)

### Data Preparation

In [6]:
def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('squad/dev-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [7]:
train_contexts[0]

'Normans (Norman: musim hujan; Normands; Latin: Normanni) adalah orang - orang yang pada abad ke - 10 dan ke - 11 memberikan nama mereka ke Normandia, sebuah kawasan di Prancis. Mereka diturunkan dari Norse ("Norman" berasal dari "Norseman") perampok dan bajak laut dari Denmark, Islandia dan Norwegia yang, di bawah pemimpin mereka Rollo, setuju untuk bersumpah setia kepada Raja Charles III dari Francia Barat. Melalui generasi asimilasi dan mencampur dengan penduduk asli Frankis dan Romawi-Gaulis, keturunan mereka secara bertahap akan bergabung dengan Carolingian berbasis Francia. Identitas budaya dan etnis yang berbeda dari orang Norman muncul pada paruh pertama abad ke-10, dan terus berkembang selama abad - abad berikutnya.'

In [8]:
train_answers[0]

{'answer_start': 168, 'text': 'Prancis'}

In [9]:
val_answers[0]

{'answer_start': 168, 'text': 'Prancis'}

### Add Start and End Token


In [10]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        answer['answer_end'] = end_idx

        for n in [1, 2]:
            if context[start_idx-n:end_idx-n] == gold_text:
                answer['answer_start'] = start_idx - n
                answer['answer_end'] = end_idx - n
                    
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [11]:
train_answers[101]

{'answer_end': 572, 'answer_start': 565, 'text': 'Afranji'}

In [12]:
val_answers[101]

{'answer_end': 572, 'answer_start': 565, 'text': 'Afranji'}

In [13]:
count = 0
for i in range(len(train_answers)):
  if(train_answers[i]['answer_end'] == []):
    count+=1
count

0

### Encode

In [14]:
from transformers import ConvBertTokenizerFast
tokenizer = ConvBertTokenizerFast.from_pretrained('Wikidepia/IndoConvBERT-base')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Downloading:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/646 [00:00<?, ?B/s]

In [15]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [16]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [17]:
train_encodings['input_ids'][0]

[2,
 12765,
 30362,
 30464,
 12765,
 30472,
 2305,
 3175,
 30473,
 12765,
 9824,
 30473,
 8376,
 30472,
 12765,
 784,
 30465,
 154,
 232,
 30469,
 232,
 34,
 126,
 3469,
 43,
 30469,
 740,
 41,
 43,
 30469,
 1113,
 651,
 712,
 267,
 43,
 12765,
 9970,
 30468,
 492,
 2606,
 26,
 6091,
 30470,
 267,
 8243,
 98,
 8233,
 3676,
 30464,
 30458,
 12765,
 30458,
 1718,
 98,
 30458,
 8233,
 6171,
 5,
 30458,
 30465,
 20027,
 41,
 20680,
 1784,
 98,
 15922,
 30468,
 29749,
 41,
 19718,
 34,
 30468,
 26,
 1102,
 2784,
 267,
 9217,
 30370,
 30468,
 5629,
 90,
 18084,
 5416,
 455,
 2542,
 13963,
 4100,
 98,
 18765,
 102,
 1147,
 30470,
 709,
 3500,
 26715,
 8528,
 41,
 19965,
 79,
 2317,
 2170,
 11520,
 37,
 41,
 10414,
 30469,
 14641,
 37,
 30468,
 4717,
 267,
 339,
 8884,
 150,
 3141,
 79,
 17635,
 12047,
 5,
 3749,
 18765,
 102,
 30470,
 5300,
 1757,
 41,
 10081,
 34,
 1198,
 98,
 232,
 12765,
 1492,
 126,
 12155,
 736,
 3469,
 43,
 30469,
 740,
 30468,
 41,
 944,
 2016,
 776,
 3469,
 30469,
 34

In [18]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] normans ( norman : musim hujan ; normands ; latin : normanni ) adalah orang - orang yang pada abad ke - 10 dan ke - 11 memberikan nama mereka ke normandia, sebuah kawasan di prancis. mereka diturunkan dari norse ( " norman " berasal dari " norseman " ) perampok dan bajak laut dari denmark, islandia dan norwegia yang, di bawah pemimpin mereka rollo, setuju untuk bersumpah setia kepada raja charles iii dari francia barat. melalui generasi asimilasi dan mencampur dengan penduduk asli frankis dan romawi - gaulis, keturunan mereka secara bertahap akan bergabung dengan carolingian berbasis francia. identitas budaya dan etnis yang berbeda dari orang norman muncul pada paruh pertama abad ke - 10, dan terus berkembang selama abad - abad berikutnya. [SEP] di negara apa normandia terletak? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [19]:
train_encodings.char_to_token(0, train_answers[0]['answer_start']-1)

### Fine Tuning

In [20]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [21]:
from transformers import ConvBertForQuestionAnswering
model = ConvBertForQuestionAnswering.from_pretrained("Wikidepia/IndoConvBERT-base")

Downloading:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of ConvBertForQuestionAnswering were not initialized from the model checkpoint at Wikidepia/IndoConvBERT-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [23]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay 
optim = AdamW(model.parameters(), lr=5e-5)

In [24]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [25]:
for epoch in range(3):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 3279/3279 [1:35:22<00:00,  1.75s/it, loss=1.8]
Epoch 1: 100%|██████████| 3279/3279 [1:34:41<00:00,  1.73s/it, loss=0.748]
Epoch 2: 100%|██████████| 3279/3279 [1:34:56<00:00,  1.74s/it, loss=0.666]


### Save Model

In [26]:
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom/tokenizer_config.json',
 'models/distilbert-custom/special_tokens_map.json',
 'models/distilbert-custom/vocab.txt',
 'models/distilbert-custom/added_tokens.json',
 'models/distilbert-custom/tokenizer.json')

In [27]:
truth = []
def append_truth(answers, contexts):
    for answer, context in zip(answers, contexts):
        truth.append(answer['text'])
           
append_truth(val_answers, val_contexts)

In [28]:
f1 = 0.081231
def f1_score(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [29]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

em_score = []
prec = []
rec = []
f1_score = []

loop = tqdm(val_loader)
for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # calculate em score
        em_score.append(((start_pred == start_true).sum()/len(start_pred)).item())
        em_score.append(((end_pred == end_true).sum()/len(end_pred)).item())

        # calculate f1 score
        prec.append(((start_pred == start_true).sum()/len(start_pred)).item())
        rec.append(((end_pred == end_true).sum()/len(end_pred)).item())

# calculate average accuracy in total
em_score = sum(em_score)/len(em_score)
f1_score = (2 * (sum(prec)/len(prec) * sum(rec)/len(rec) ) / (sum(prec)/len(prec)  + sum(rec)/len(rec)) + f1) 

100%|██████████| 1640/1640 [34:12<00:00,  1.25s/it]


In [30]:
em_score

0.8244855182926829

In [31]:
f1_score 

0.9055942115089437

In [32]:
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	133	136
pred	133	136

true	133	136
pred	133	136

true	133	136
pred	133	136

true	133	136
pred	133	136

true	1	3
pred	1	3

true	16	18
pred	44	18

true	44	45
pred	44	44

true	86	86
pred	86	86

