# Requirements

## Import

In [2]:
import os
import random
import math
import csv
import json
from statistics import mean
from typing import List, Tuple, Dict, Any
import uuid

from tqdm.notebook import tqdm
from easydict import EasyDict as edict

import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import wandb

import torch
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

from torchinfo import summary

from transformers import ElectraModel, ElectraTokenizer, ElectraForQuestionAnswering, AutoModelForQuestionAnswering, AutoTokenizer

In [3]:
# Make sure the folders used for checkpoints and submission files exist;
# exist_ok=True makes this cell safe to re-run.
for name in ('models', 'submissions'):
    os.makedirs(name, exist_ok=True)

# Set Arguments, Hyper-parameters

In [22]:
# Run configuration, exposed both as attributes (args.epochs) and as keys
# (args['epochs']) through EasyDict.
args = edict(
    w_project='test_project',
    w_entity='chohs1221',
    learning_rate=6e-5,
    batch_size=dict(train=256, eval=4, test=256),
    accumulate=64,
    epochs=10,
    seed=42,
    # model_name='monologg/koelectra-base-v3-discriminator',
    model_name='monologg/kobigbird-bert-base',
    max_length=1024,
)

# Run name: main hyper-parameters plus a random numeric suffix. The suffix is
# drawn before seed_everything() is called, so it differs from run to run.
# args['NAME'] = f'koelectra_ep{args.epochs}_lr{args.learning_rate}_{random.randrange(0, 1024)}'
args['NAME'] = f'kobigbird_v2_ep{args.epochs}_max{args.max_length}_lr{args.learning_rate}_{random.randrange(0, 1024)}'
print(args.NAME)

kobigbird_v2_ep10_max1024_lr6e-05_228


# Initialize

## Seed

In [5]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may pick different algorithms between runs, which
    # defeats the determinism requested above, so it is switched off.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Seed all random number generators with the configured seed.
seed_everything(args.seed)

## Tokenizer

In [6]:
# Module-level tokenizer loaded for the configured checkpoint (args.model_name);
# the dataset and indexer classes below read it as a global.
# tokenizer = ElectraTokenizer.from_pretrained(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# Datasets

## Load, Split

In [7]:
class KoMRC:
    """Question-answering dataset addressed by (document, paragraph, question) triples.

    The raw data is a dict with the layout
    ``data['data'][d]['paragraphs'][p]['qas'][q]``; every question is
    referenced by one ``(d, p, q)`` index triple, so several views (for
    example a train and an eval split) can share one underlying dict.
    """

    # One-pass character clean-up for contexts, questions and answer texts.
    # Every mapping is one character to one character, so string lengths are
    # preserved and the character offsets stored with the answers stay valid.
    # NOTE(review): a newline is mapped to the letter 'n' (not to a space),
    # exactly as the original chained ``replace`` calls did -- confirm that
    # this is intended before changing it, since trained checkpoints may
    # depend on this preprocessing.
    _CLEANUP_TABLE = str.maketrans({'\n': 'n', '\xad': ' ', '\xa0': ' ', '\u200b': ' '})

    def __init__(self, data, indices: List[Tuple[int, int, int]]):
        self._data = data
        self._indices = indices

    @staticmethod
    def _build_indices(data) -> List[Tuple[int, int, int]]:
        """Return one (document, paragraph, question) triple per question in *data*."""
        return [
            (d_id, p_id, q_id)
            for d_id, document in enumerate(data['data'])
            for p_id, paragraph in enumerate(document['paragraphs'])
            for q_id, _ in enumerate(paragraph['qas'])
        ]

    @classmethod
    def _clean(cls, text: str) -> str:
        """Apply the length-preserving character clean-up to *text*."""
        return text.translate(cls._CLEANUP_TABLE)

    # Load a dataset from a single JSON file.
    @classmethod
    def load(cls, file_path: str):
        """Build a dataset from the JSON file at *file_path*."""
        with open(file_path, 'r', encoding='utf-8') as fd:
            data = json.load(fd)

        return cls(data, cls._build_indices(data))

    # Load a dataset from several JSON files.
    @classmethod
    def loads(cls, *file_path: str):
        """Build one dataset from the concatenated documents of several JSON files."""
        datas = {'data': []}
        for path in file_path:
            with open(path, 'r', encoding='utf-8') as fd:
                datas['data'] += json.load(fd)['data']

        return cls(datas, cls._build_indices(datas))

    # Split a dataset into two views.
    @classmethod
    def split(cls, dataset, eval_ratio: float=.016108):
        """Randomly split *dataset* into ``(train, eval)`` views.

        The index triples are shuffled with the global ``random`` state; the
        first ``int(len * eval_ratio)`` of them form the eval view and the
        rest the train view. Both views share the underlying data.
        """
        indices = list(dataset._indices)
        random.shuffle(indices)
        n_eval = int(len(indices) * eval_ratio)

        return cls(dataset._data, indices[n_eval:]), cls(dataset._data, indices[:n_eval])

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the cleaned sample at *index*.

        Returns:
            A dict with keys ``guid``, ``context``, ``question`` and
            ``answers``. ``answers`` is ``None`` when the question has no
            ``answers`` entry (or it is null); otherwise it is a list of
            copies of the answer dicts with a cleaned ``text``. The loaded
            data itself is left untouched.
        """
        d_id, p_id, q_id = self._indices[index]
        paragraph = self._data['data'][d_id]['paragraphs'][p_id]
        qa = paragraph['qas'][q_id]

        # Questions without a guid get a fresh random one on every access.
        guid = qa['guid'] if 'guid' in qa else uuid.uuid4().hex

        # .get(): unlabeled questions may omit the 'answers' key entirely.
        answers = qa.get('answers')
        if answers is not None:
            answers = [{**answer, 'text': self._clean(answer['text'])} for answer in answers]

        return {'guid': guid,
            'context': self._clean(paragraph['context']),
            'question': self._clean(qa['question']),
            'answers': answers
        }

    def __len__(self) -> int:
        """Return the number of questions in this view."""
        return len(self._indices)

## Tokenize & Tag Token Positions

In [8]:
class TokenizedKoMRC(KoMRC):
    """KoMRC view whose samples carry tokens and token-level answer spans."""

    def __init__(self, data, indices: List[Tuple[int, int, int]]) -> None:
        super().__init__(data, indices)
        # Bound at construction time from the module-level tokenizer.
        self._tokenizer = tokenizer

    def _tokenize_with_position(self, sentence: str) -> List[Tuple[str, Tuple[int, int]]]:
        """Tokenize *sentence* and locate every token in the original string.

        Returns:
            A list of ``(token, (start, end))`` pairs where ``start``/``end``
            are character offsets into *sentence*. A leading ``'##'``
            continuation marker is removed from the returned token.
        """
        sentence_tokens = []
        for word in sentence.split():
            pieces = self._tokenizer.tokenize(word)
            if '[UNK]' in pieces:
                # Keep the whole word so it can still be found in the text.
                sentence_tokens.append(word)
            else:
                sentence_tokens += pieces

        tokens = []
        position = 0
        for morph in sentence_tokens:
            if len(morph) > 2 and morph[:2] == '##':
                morph = morph[2:]

            found = sentence.find(morph, position)
            # find() returns -1 when the token text does not occur verbatim
            # after the cursor. Taking -1 as a position would rewind the
            # cursor to the start of the sentence and corrupt every later
            # offset, so in that case the token is placed at the cursor.
            # NOTE(review): presumably only happens when the tokenizer
            # normalizes characters -- confirm against real data.
            if found != -1:
                position = found
            tokens.append((morph, (position, position + len(morph))))
            position += len(morph)

        return tokens

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the tokenized sample at *index*.

        Returns:
            A dict with the original and tokenized context/question, the
            per-token character spans (``context_position``), the answers as
            ``{'start', 'end'}`` token indices into the context (``None`` for
            unlabeled samples) and the text of the first answer.

        Raises:
            ValueError: If an answer's start offset falls in no token span, or
                its text cannot be assembled from the tokens after the start.
        """
        sample = super().__getitem__(index)
        # sample = {'guid': guid, 'context': context, 'question': question, 'answers': answers}

        # Built with comprehensions so an empty context yields two empty lists
        # instead of failing to unpack.
        tokenized = self._tokenize_with_position(sample['context'])
        context = [token for token, _ in tokenized]
        position = [span for _, span in tokenized]

        question = self._tokenizer.tokenize(sample['question'])

        if sample['answers'] is not None:
            answers = []
            for answer in sample['answers']:
                # Start token: the one whose character span covers answer_start.
                for start, (position_start, position_end) in enumerate(position):
                    if position_start <= answer['answer_start'] < position_end:
                        break
                else:
                    print(context, answer)
                    # The guid belongs to the sample; the answer dicts are only
                    # read for 'text' and 'answer_start'.
                    print(sample['guid'])
                    print(answer['answer_start'])
                    raise ValueError("No mathced start position")

                # End token: extend from the start token until the
                # whitespace-free answer text appears in the concatenation.
                target = ''.join(answer['text'].split(' '))
                source = ''
                for end, morph in enumerate(context[start:], start):
                    source += morph
                    if target in source:
                        break
                else:
                    print(context, answer)
                    print(sample['guid'])
                    print(answer['answer_start'])
                    raise ValueError("No Matched end position")

                answers.append({'start': start, 'end': end})
            answer_text = sample['answers'][0]['text']

        else:
            answers = None
            answer_text = None

        return {
            'guid': sample['guid'],
            'context_original': sample['context'],
            'context_position': position,
            'question_original': sample['question'],
            'context': context,
            'question': question,
            'answers': answers,
            'answers_text': answer_text
        }

## Input

In [9]:
class Indexer:
    """Turns a tokenized sample into model inputs (ids and segment ids).

    Special-token ids are read from the module-level tokenizer's vocabulary.
    """

    def __init__(self, vocabs: List[str], max_length: int=args.max_length):
        self.vocabs = vocabs
        self.max_length = max_length

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary list given at construction."""
        return len(self.vocabs)

    @property
    def pad_id(self):
        """Id of the ``[PAD]`` token."""
        return tokenizer.vocab['[PAD]']

    @property
    def unk_id(self):
        """Id of the ``[UNK]`` token."""
        return tokenizer.vocab['[UNK]']

    @property
    def cls_id(self):
        """Id of the ``[CLS]`` token."""
        return tokenizer.vocab['[CLS]']

    @property
    def sep_id(self):
        """Id of the ``[SEP]`` token."""
        return tokenizer.vocab['[SEP]']

    def sample2ids(self, sample: Dict[str, Any],) -> Dict[str, Any]:
        """Encode one tokenized sample as ``[CLS] question [SEP] context [SEP]``.

        The context is truncated so the sequence fits in ``max_length``. Only
        the first answer is used; its token indices are shifted into the
        encoded sequence and clamped to ``max_length - 1``. ``start`` and
        ``end`` are ``None`` when the sample has no answers.
        """
        to_id = tokenizer.convert_tokens_to_ids
        context_ids = [to_id(piece) for piece in sample['context']]
        question_ids = [to_id(piece) for piece in sample['question']]
        n_question = len(question_ids)

        # Three positions are reserved for [CLS] and the two [SEP] tokens.
        context_ids = context_ids[:self.max_length - n_question - 3]

        input_ids = [self.cls_id, *question_ids, self.sep_id, *context_ids, self.sep_id]
        # Segment 0 covers [CLS] + question; segment 1 covers both [SEP]
        # tokens and the context between them.
        token_type_ids = [0] * (n_question + 1) + [1] * (len(context_ids) + 2)

        if sample['answers'] is None:
            start = end = None
        else:
            first_answer = sample['answers'][0]
            offset = n_question + 2          # [CLS] + question + [SEP]
            last_position = self.max_length - 1
            start = min(offset + first_answer['start'], last_position)
            end = min(offset + first_answer['end'], last_position)

        return {
            'guid': sample['guid'],
            'context': sample['context_original'],
            'question': sample['question_original'],
            'position': sample['context_position'],
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
            'start': start,
            'end': end
        }

In [10]:
# Module-level indexer built from the tokenizer's vocabulary keys, using the
# default max_length.
indexer = Indexer(list(tokenizer.vocab.keys()))

## Attention Mask

In [11]:
class IndexerWrappedDataset:
    """Dataset adapter that encodes each sample with an Indexer on access."""

    def __init__(self, dataset: TokenizedKoMRC, indexer: Indexer) -> None:
        self._dataset, self._indexer = dataset, indexer

    def __len__(self) -> int:
        """Return the length of the wrapped dataset."""
        return len(self._dataset)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the encoded sample at *index* with an all-ones attention mask."""
        encoded = self._indexer.sample2ids(self._dataset[index])
        # One sample is encoded at a time without padding, so every position
        # is attended to.
        encoded['attention_mask'] = [1] * len(encoded['input_ids'])
        return encoded

# Test

## Datasets

In [12]:
# Load the test set and wrap it so that iterating yields encoded samples
# (input_ids, token_type_ids, attention_mask, ...).
test_dataset = TokenizedKoMRC.load('./datasets2/test.json')
indexer_test = Indexer(list(tokenizer.vocab.keys()))
indexed_test_dataset = IndexerWrappedDataset(test_dataset, indexer_test)
print("Number of Test Samples", len(test_dataset))
# print(test_dataset[0])

Number of Test Samples 4008


## Model

In [14]:
# Load the three fine-tuned checkpoints used by the ensembles below and move
# each of them to the GPU (.cuda() returns the module itself).
model1 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep10_max1024_lr6e-05_276_1').cuda()
model2 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep2_max1024_lr6e-05_712_1_best').cuda()
model3 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep2_max1024_lr6e-05_228_1_67392').cuda()

In [17]:
start_visualize = []
end_visualize = []

# Token-level ensemble: every model predicts a (start, end) token pair and the
# pairs are averaged (truncated to int) before being mapped back to characters.
models = [model1, model2, model3]
# Inference only: put every ensemble member in eval mode once, up front.
for model in models:
    model.eval()

# Only spans of at most this many tokens are considered.
max_answer_tokens = 8

# newline='' is what the csv module expects for files it writes; an explicit
# utf-8 encoding keeps the answer text independent of the platform default.
with torch.no_grad(), open(f'submissions/{args.NAME}_tok_ens.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for sample in tqdm(indexed_test_dataset, "Testing"):
        input_ids, token_type_ids = [torch.tensor(sample[key], dtype=torch.long, device="cuda") for key in ("input_ids", "token_type_ids")]
        # Segment 1 marks "[SEP] context [SEP]"; the [1:-1] slices below drop
        # the two [SEP] positions so only context tokens remain.
        context_mask = token_type_ids.bool()

        starts = []
        ends = []
        for model in models:
            output = model(input_ids=input_ids[None, :], token_type_ids=token_type_ids[None, :])

            start_logits = output.start_logits.squeeze(0)
            end_logits = output.end_logits.squeeze(0)

            start_prob = start_logits[context_mask][1:-1].softmax(-1)
            end_prob = end_logits[context_mask][1:-1].softmax(-1)

            # Joint probability of every (start, end) pair. triu keeps
            # end >= start; tril with diagonal k-1 keeps end - start < k.
            # This is the same band the original per-row masking loop
            # produced, without a CPU round trip per row.
            probability = torch.tril(
                torch.triu(start_prob[:, None] @ end_prob[None, :]),
                diagonal=max_answer_tokens - 1,
            )

            index = torch.argmax(probability).item()

            starts.append(index // len(end_prob))
            ends.append(index % len(end_prob))

        start_str = sample['position'][int(mean(starts))][0]
        end_str = sample['position'][int(mean(ends))][1]

        rows.append([sample["guid"], sample['context'][start_str:end_str]])

    writer.writerows(rows)

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=4008.0), HTML(value='')))




In [19]:
start_visualize = []
end_visualize = []

# Character-level ensemble: every model predicts a token span, the span is
# mapped to character offsets, and the offsets are averaged across models.
models = [model1, model3]
# Inference only: put every ensemble member in eval mode once, up front.
for model in models:
    model.eval()

# Only spans of at most this many tokens are considered.
max_answer_tokens = 8

# newline='' is what the csv module expects for files it writes; an explicit
# utf-8 encoding keeps the answer text independent of the platform default.
with torch.no_grad(), open(f'submissions/{args.NAME}_str_ens2.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for sample in tqdm(indexed_test_dataset, "Testing"):
        input_ids, token_type_ids = [torch.tensor(sample[key], dtype=torch.long, device="cuda") for key in ("input_ids", "token_type_ids")]
        # Segment 1 marks "[SEP] context [SEP]"; the [1:-1] slices below drop
        # the two [SEP] positions so only context tokens remain.
        context_mask = token_type_ids.bool()

        starts_str = []
        ends_str = []
        for model in models:
            output = model(input_ids=input_ids[None, :], token_type_ids=token_type_ids[None, :])

            start_logits = output.start_logits.squeeze(0)
            end_logits = output.end_logits.squeeze(0)

            start_prob = start_logits[context_mask][1:-1].softmax(-1)
            end_prob = end_logits[context_mask][1:-1].softmax(-1)

            # Joint probability of every (start, end) pair. triu keeps
            # end >= start; tril with diagonal k-1 keeps end - start < k.
            # This is the same band the original per-row masking loop
            # produced, without a CPU round trip per row.
            probability = torch.tril(
                torch.triu(start_prob[:, None] @ end_prob[None, :]),
                diagonal=max_answer_tokens - 1,
            )

            index = torch.argmax(probability).item()

            start = index // len(end_prob)
            end = index % len(end_prob)

            starts_str.append(sample['position'][start][0])
            ends_str.append(sample['position'][end][1])

        start_str = int(mean(starts_str))
        end_str = int(mean(ends_str))

        # The probabilities and token span recorded here are those of the last
        # model in `models`; the character span is the ensemble average.
        start_visualize.append((list(start_prob.cpu()), (start, end), (start_str, end_str)))
        end_visualize.append((list(end_prob.cpu()), (start, end), (start_str, end_str)))

        rows.append([sample["guid"], sample['context'][start_str:end_str]])

    writer.writerows(rows)

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=4008.0), HTML(value='')))




In [20]:
start_visualize = []
end_visualize = []

# Token-level ensemble: every model predicts a (start, end) token pair and the
# pairs are averaged (truncated to int) before being mapped back to characters.
models = [model1, model3]
# Inference only: put every ensemble member in eval mode once, up front.
for model in models:
    model.eval()

# Only spans of at most this many tokens are considered.
max_answer_tokens = 8

# newline='' is what the csv module expects for files it writes; an explicit
# utf-8 encoding keeps the answer text independent of the platform default.
with torch.no_grad(), open(f'submissions/{args.NAME}_tok_ens2.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for sample in tqdm(indexed_test_dataset, "Testing"):
        input_ids, token_type_ids = [torch.tensor(sample[key], dtype=torch.long, device="cuda") for key in ("input_ids", "token_type_ids")]
        # Segment 1 marks "[SEP] context [SEP]"; the [1:-1] slices below drop
        # the two [SEP] positions so only context tokens remain.
        context_mask = token_type_ids.bool()

        starts = []
        ends = []
        for model in models:
            output = model(input_ids=input_ids[None, :], token_type_ids=token_type_ids[None, :])

            start_logits = output.start_logits.squeeze(0)
            end_logits = output.end_logits.squeeze(0)

            start_prob = start_logits[context_mask][1:-1].softmax(-1)
            end_prob = end_logits[context_mask][1:-1].softmax(-1)

            # Joint probability of every (start, end) pair. triu keeps
            # end >= start; tril with diagonal k-1 keeps end - start < k.
            # This is the same band the original per-row masking loop
            # produced, without a CPU round trip per row.
            probability = torch.tril(
                torch.triu(start_prob[:, None] @ end_prob[None, :]),
                diagonal=max_answer_tokens - 1,
            )

            index = torch.argmax(probability).item()

            starts.append(index // len(end_prob))
            ends.append(index % len(end_prob))

        start_str = sample['position'][int(mean(starts))][0]
        end_str = sample['position'][int(mean(ends))][1]

        rows.append([sample["guid"], sample['context'][start_str:end_str]])

    writer.writerows(rows)

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=4008.0), HTML(value='')))




In [21]:
# Replace the ensemble members with checkpoints from the "..._228_1" run and
# move each to the GPU (.cuda() returns the module itself).
# NOTE(review): model2 and model3 load the same checkpoint (..._67392), so a
# three-model ensemble built from them counts that checkpoint twice -- confirm
# whether a different path was intended for one of them.
model1 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep2_max1024_lr6e-05_228_1_65664').cuda()
model2 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep2_max1024_lr6e-05_228_1_67392').cuda()
model3 = AutoModelForQuestionAnswering.from_pretrained('models/kobigbird_v2_ep2_max1024_lr6e-05_228_1_67392').cuda()

In [23]:
start_visualize = []
end_visualize = []

# Token-level ensemble: every model predicts a (start, end) token pair and the
# pairs are averaged (truncated to int) before being mapped back to characters.
models = [model1, model2, model3]
# Inference only: put every ensemble member in eval mode once, up front.
for model in models:
    model.eval()

# Only spans of at most this many tokens are considered.
max_answer_tokens = 8

# newline='' is what the csv module expects for files it writes; an explicit
# utf-8 encoding keeps the answer text independent of the platform default.
with torch.no_grad(), open(f'submissions/{args.NAME}_tok_ens3.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for sample in tqdm(indexed_test_dataset, "Testing"):
        input_ids, token_type_ids = [torch.tensor(sample[key], dtype=torch.long, device="cuda") for key in ("input_ids", "token_type_ids")]
        # Segment 1 marks "[SEP] context [SEP]"; the [1:-1] slices below drop
        # the two [SEP] positions so only context tokens remain.
        context_mask = token_type_ids.bool()

        starts = []
        ends = []
        for model in models:
            output = model(input_ids=input_ids[None, :], token_type_ids=token_type_ids[None, :])

            start_logits = output.start_logits.squeeze(0)
            end_logits = output.end_logits.squeeze(0)

            start_prob = start_logits[context_mask][1:-1].softmax(-1)
            end_prob = end_logits[context_mask][1:-1].softmax(-1)

            # Joint probability of every (start, end) pair. triu keeps
            # end >= start; tril with diagonal k-1 keeps end - start < k.
            # This is the same band the original per-row masking loop
            # produced, without a CPU round trip per row.
            probability = torch.tril(
                torch.triu(start_prob[:, None] @ end_prob[None, :]),
                diagonal=max_answer_tokens - 1,
            )

            index = torch.argmax(probability).item()

            starts.append(index // len(end_prob))
            ends.append(index % len(end_prob))

        start_str = sample['position'][int(mean(starts))][0]
        end_str = sample['position'][int(mean(ends))][1]

        rows.append([sample["guid"], sample['context'][start_str:end_str]])

    writer.writerows(rows)

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=4008.0), HTML(value='')))

Attention type 'block_sparse' is not possible if sequence_length: 496 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Attention type 'block_sparse' is not possible if sequence_length: 496 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Attention type 'block_sparse' is not possible if sequence_length: 496 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buf




In [24]:
start_visualize = []
end_visualize = []

# Character-level ensemble: every model predicts a token span, the span is
# mapped to character offsets, and the offsets are averaged across models.
models = [model1, model2, model3]
# Inference only: put EVERY ensemble member in eval mode (the per-sample calls
# this replaces covered model1 and model3 but not model2).
for model in models:
    model.eval()

# Only spans of at most this many tokens are considered.
max_answer_tokens = 8

# newline='' is what the csv module expects for files it writes; an explicit
# utf-8 encoding keeps the answer text independent of the platform default.
with torch.no_grad(), open(f'submissions/{args.NAME}_str_ens3.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for sample in tqdm(indexed_test_dataset, "Testing"):
        input_ids, token_type_ids = [torch.tensor(sample[key], dtype=torch.long, device="cuda") for key in ("input_ids", "token_type_ids")]
        # Segment 1 marks "[SEP] context [SEP]"; the [1:-1] slices below drop
        # the two [SEP] positions so only context tokens remain.
        context_mask = token_type_ids.bool()

        starts_str = []
        ends_str = []
        for model in models:
            output = model(input_ids=input_ids[None, :], token_type_ids=token_type_ids[None, :])

            start_logits = output.start_logits.squeeze(0)
            end_logits = output.end_logits.squeeze(0)

            start_prob = start_logits[context_mask][1:-1].softmax(-1)
            end_prob = end_logits[context_mask][1:-1].softmax(-1)

            # Joint probability of every (start, end) pair. triu keeps
            # end >= start; tril with diagonal k-1 keeps end - start < k.
            # This is the same band the original per-row masking loop
            # produced, without a CPU round trip per row.
            probability = torch.tril(
                torch.triu(start_prob[:, None] @ end_prob[None, :]),
                diagonal=max_answer_tokens - 1,
            )

            index = torch.argmax(probability).item()

            start = index // len(end_prob)
            end = index % len(end_prob)

            starts_str.append(sample['position'][start][0])
            ends_str.append(sample['position'][end][1])

        start_str = int(mean(starts_str))
        end_str = int(mean(ends_str))

        # The probabilities and token span recorded here are those of the last
        # model in `models`; the character span is the ensemble average.
        start_visualize.append((list(start_prob.cpu()), (start, end), (start_str, end_str)))
        end_visualize.append((list(end_prob.cpu()), (start, end), (start_str, end_str)))

        rows.append([sample["guid"], sample['context'][start_str:end_str]])

    writer.writerows(rows)

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=4008.0), HTML(value='')))


