In [278]:
import os
import random
import time
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## Load Pretrained Encoder (KoELECTRA) and Tokenizer

In [279]:
# Identifier of the pretrained encoder checkpoint and the source the tokenizer
# is loaded from ('tokenizer' is presumably a local directory -- confirm).
PRETRAINED_ENCODER = 'monologg/koelectra-base-v3-discriminator'
TOKENIZER_SOURCE = 'tokenizer'

# Despite the variable name, the encoder is an ElectraModel.
bert = ElectraModel.from_pretrained(PRETRAINED_ENCODER)
tokenizer = ElectraTokenizer.from_pretrained(TOKENIZER_SOURCE)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Label Dicts

In [280]:
# Tag inventory; the position of a tag in this list is its integer id.
labels = ['E_B', 'E_I', 'O', '[PAD]']
num_labels = len(labels)

# Lookup tables in both directions (id -> tag name, tag name -> id).
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in enumerate(labels)}

## Load Data and Preprocess for Training

In [281]:
# Preprocessed corpus; later cells read its `tokens` and `labels` columns.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
PREPROCESSED_PATH = 'data/preprocessed.pkl'
data = pd.read_pickle(PREPROCESSED_PATH)

In [282]:
# Frame every token sequence with '[CLS]' ... '[SEP]' and tag the two added
# positions as 'O' so tokens and labels stay aligned.
# Caution: this overwrites the columns in place, so running the cell a second
# time adds the markers again.
data['tokens'] = data['tokens'].map(lambda seq: ['[CLS]'] + seq + ['[SEP]'])
data['labels'] = data['labels'].map(lambda tags: ['O'] + tags + ['O'])

In [283]:
# Length of the longest token sequence (special tokens included).
data['tokens'].map(len).max()

233

In [284]:
# Fixed length that every token/label sequence is padded to in the next cell;
# it is larger than the longest sequence reported above (233).
max_len = 256

In [285]:
# Right-pad every token / label sequence with '[PAD]' up to max_len.
# A negative repeat count silently produces no padding, which would leave
# over-long rows ragged, so fail loudly if any sequence exceeds max_len.
for column in ('tokens', 'labels'):
    longest = data[column].map(len).max()
    if longest > max_len:
        raise ValueError(
            f"longest sequence in '{column}' ({longest}) exceeds max_len ({max_len})"
        )

data['tokens'] = data.tokens.apply(lambda x: x + ['[PAD]'] * (max_len - len(x)))
data['labels'] = data.labels.apply(lambda x: x + ['[PAD]'] * (max_len - len(x)))

In [286]:
# Pull the padded columns out of the frame as plain Python lists (one entry per row).
tokens_lst = data['tokens'].tolist()
labels_lst = data['labels'].tolist()

In [287]:
# Shuffled 80/20 train/eval split of the token and label sequences;
# the fixed random_state keeps the split reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(
    tokens_lst,
    labels_lst,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [288]:
# Encode the training split as [input_ids, attention_mask, label_ids] rows.
# The loop variable is named `sample_labels` so that the module-level `labels`
# list (the tag inventory) is not overwritten by the last sample's labels.
train_data = []
for tokens, sample_labels in zip(X_train, y_train):
    # Real (unpadded) length = position of the first '[PAD]'. A sequence that
    # already fills max_len contains no '[PAD]', so fall back to its full
    # length instead of letting list.index raise ValueError.
    length = tokens.index('[PAD]') if '[PAD]' in tokens else len(tokens)
    # 1 marks real tokens, 0 marks padding positions.
    mask = [1] * length + [0] * (max_len - length)

    label_ids = [label2id[label] for label in sample_labels]

    train_data.append([tokenizer.convert_tokens_to_ids(tokens), mask, label_ids])

In [289]:
# Encode the evaluation split as [input_ids, attention_mask, label_ids] rows.
# The loop variable is named `sample_labels` so that the module-level `labels`
# list (the tag inventory) is not overwritten by the last sample's labels.
eval_data = []
for tokens, sample_labels in zip(X_eval, y_eval):
    # Real (unpadded) length = position of the first '[PAD]'. A sequence that
    # already fills max_len contains no '[PAD]', so fall back to its full
    # length instead of letting list.index raise ValueError.
    length = tokens.index('[PAD]') if '[PAD]' in tokens else len(tokens)
    # 1 marks real tokens, 0 marks padding positions.
    mask = [1] * length + [0] * (max_len - length)

    label_ids = [label2id[label] for label in sample_labels]

    eval_data.append([tokenizer.convert_tokens_to_ids(tokens), mask, label_ids])

In [290]:
# idx = random.randrange(0, len(train_data) - 1)
# for x, xm, y in zip(train_data[idx][0], train_data[idx][1], train_data[idx][2]):
#     print(x, xm, y)

# idx = random.randrange(0, len(eval_data) - 1)
# for x, xm, y in zip(eval_data[idx][0], eval_data[idx][1], eval_data[idx][2]):
#     print(x, xm, y)

In [291]:
class TaggerDataset(Dataset):
    """Map-style dataset over pre-encoded tagging examples.

    Every element of ``data`` is indexed at positions 0, 1 and 2, which hold
    the token ids, the attention mask and the label ids respectively.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, mask, label_ids)`` for example ``idx`` as LongTensors."""
        example = self.data[idx]
        return tuple(torch.LongTensor(example[field]) for field in (0, 1, 2))

In [292]:
# Wrap each encoded split in a TaggerDataset.
train_dataset, eval_dataset = (TaggerDataset(split) for split in (train_data, eval_data))

In [293]:
# Batch size shared by both loaders.
BATCH_SIZE = 128

# Training batches are reshuffled each epoch. The evaluation loader keeps the
# dataset order: shuffling adds nothing to evaluation and makes per-batch
# evaluation output differ from run to run.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [294]:
class BERTPoSTagger(nn.Module):
    """Token-level tagger: pretrained encoder -> dropout -> linear classifier.

    Args:
        bert: Encoder module. It must expose ``config.hidden_size`` and, when
            called as ``bert(input_ids, attention_mask=...)``, return a
            sequence whose first element holds the per-token hidden states.
        output_dim: Number of tag classes produced for every token.
        dropout: Drop probability applied to the encoder output before the
            classifier.
    """

    def __init__(self,
                 bert,
                 output_dim,
                 dropout):

        super().__init__()

        self.bert = bert
        self.dropout = nn.Dropout(dropout)

        # Width of the encoder's hidden states, read directly from its config.
        embedding_dim = bert.config.hidden_size
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text, mask):
        """Return per-token class scores.

        Args:
            text: Token ids passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of scores whose last dimension is ``output_dim``.
        """
        hidden = self.bert(text, attention_mask=mask)[0]
        # Dropout is applied exactly once. Applying it twice in a row (as the
        # previous version did) raises the effective drop rate to
        # 1 - (1 - p)**2 and rescales the surviving activations twice.
        predictions = self.fc(self.dropout(hidden))
        return predictions

In [295]:
# One output unit per tag id; dropout probability used by the tagger head.
OUTPUT_DIM, DROPOUT = num_labels, 0.25

model = BERTPoSTagger(bert=bert, output_dim=OUTPUT_DIM, dropout=DROPOUT)

# Resize the encoder's token-embedding matrix to the tokenizer's vocabulary
# size; the returned embedding module is displayed as the cell output.
model.bert.resize_token_embeddings(len(tokenizer))

Embedding(36223, 768)

In [296]:
# Smoke test: push the first training example through the model as a batch
# of one and display the shape of the resulting scores.
input_ids, mask, label_ids = train_dataset[0]
input_ids = input_ids.unsqueeze(dim=0)
mask = mask.unsqueeze(dim=0)

output = model(input_ids, mask)
output.shape

torch.Size([1, 256, 4])