In [1]:
import gc
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat, reduce
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForTokenClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the running interpreter's hash seed is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and vary) its kernel choice between
    # runs, which contradicts the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(1111)

In [3]:
'''
main ref) https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615?scriptVersionId=83230719
detr ref) https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb
detr ref) https://www.kaggle.com/tanulsingh077/end-to-end-object-detection-with-transformers-detr
Before running this notebook:
    1) Download the competition data from Kaggle and extract it into './input'.
    2) Clone the DETR repository. # !git clone https://github.com/facebookresearch/detr.git
'''

"\nmain ref) https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615?scriptVersionId=83230719\ndetr ref) https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb\ndetr ref) https://www.kaggle.com/tanulsingh077/end-to-end-object-detection-with-transformers-detr\n코드 실행 전 준비 사항 :\n    1) 먼저 kaggle 에서 데이터를 다운받고, './input' 에 압축해제 시켜준다.\n    2) detr 을 fork 해준다. # !git clone https://github.com/facebookresearch/detr.git\n"

In [5]:
# Preprocess the data as an object-detection problem.
# objective : use DETR structure for sentence segmentation.
# PATH is the './input' directory under the current working directory;
# TRAIN_NER_PATH_DETR is the CSV inside it that stores the DETR-style labels.
PATH = os.path.join(os.getcwd(), 'input')
TRAIN_NER_PATH_DETR = os.path.join(PATH, 'train_detr.csv')

In [6]:
# todo
#!# test code for preprocessing

In [7]:
# Load the essays together with their DETR-style segment labels.
# The labels are cached in TRAIN_NER_PATH_DETR; when that cache does not exist
# yet (first run) it is built from the raw essay files and train.csv.
from ast import literal_eval

original_train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))

try:
    train_text_df = pd.read_csv(TRAIN_NER_PATH_DETR)
except FileNotFoundError:
    # Only a missing cache triggers a rebuild; any other failure (corrupt CSV,
    # permission problem, ...) is surfaced instead of being silently swallowed.
    print('this is 1st time to run this code...')
    print('try to convert original text to DETR labels...')

    # read original text files
    train_dir = os.path.join(PATH, 'train')
    train_ids, train_texts = [], []
    for f in tqdm(list(os.listdir(train_dir))):
        train_ids.append(f.replace('.txt', ''))
        with open(os.path.join(train_dir, f), 'r') as text_file:
            train_texts.append(text_file.read())
    train_text_df = pd.DataFrame({'id': train_ids, 'text': train_texts})

    # convert segment label into object detection label : [segment_type, x, y]
    # where x / y are the first / last word index of the discourse element.
    # A single pass over train.csv groups the segments per essay (row order is
    # preserved) instead of re-filtering the whole frame for every essay.
    segments_by_id = {}
    for essay_id, discourse_type, prediction_string in zip(
        original_train_df['id'],
        original_train_df['discourse_type'],
        original_train_df['predictionstring'],
    ):
        word_indices = prediction_string.split()
        segments_by_id.setdefault(essay_id, []).append(
            [discourse_type, int(word_indices[0]), int(word_indices[-1])]
        )

    # Essays without any annotation get an empty label list.
    train_text_df['segment_label'] = [
        segments_by_id.get(essay_id, []) for essay_id in train_text_df['id']
    ]
    train_text_df.to_csv(TRAIN_NER_PATH_DETR, index=False)
else:
    # pandas saves lists as strings, we must convert them back
    train_text_df['segment_label'] = train_text_df['segment_label'].apply(literal_eval)

In [8]:
# Lookup tables between discourse labels and integer class ids, shared by
# training and inference. Index 0 ('O') is the dummy class DETR needs for padding.
output_labels_detr = ['O'] + [
    'Lead',
    'Position',
    'Claim',
    'Counterclaim',
    'Rebuttal',
    'Evidence',
    'Concluding Statement',
]

labels_to_ids = {name: idx for idx, name in enumerate(output_labels_detr)}
ids_to_labels = dict(enumerate(output_labels_detr))

In [9]:
# Pick the essays used for validation: a random 90% / 10% split over essay ids.
IDS = original_train_df.id.unique()
num_ids = len(IDS)
print('There are',num_ids,'train texts. We will split 90% 10% for validation.')

# Sample 90% of the id positions without replacement; the rest is validation.
id_positions = np.arange(num_ids)
train_idx = np.random.choice(id_positions,int(0.9*num_ids),replace=False)
valid_idx = np.setdiff1d(id_positions,train_idx)

# Materialise the two subsets. The train subset keeps only text and labels,
# the validation subset keeps every column (including 'id').
data_df = train_text_df[['id','text', 'segment_label']]
is_train_row = data_df['id'].isin(IDS[train_idx])
is_valid_row = data_df['id'].isin(IDS[valid_idx])
train_df = data_df.loc[is_train_row,['text', 'segment_label']].reset_index(drop=True)
valid_df = data_df.loc[is_valid_row].reset_index(drop=True)

print("FULL Dataset: {}".format(data_df.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("VALID Dataset: {}".format(valid_df.shape))

There are 15594 train texts. We will split 90% 10% for validation.
FULL Dataset: (15594, 3)
TRAIN Dataset: (14034, 2)
VALID Dataset: (1560, 3)


In [10]:
'''test code for preprocessing'''
# Row i of data_df and its j-th segment are compared against the raw annotation.
i, j = 1, 0

# pre-processed: slice the essay's words with the stored [start, end] word indices
sample_row = data_df.loc[i]
label, start_idx, end_idx = sample_row['segment_label'][j]
text_id = sample_row['id']
print(sample_row['text'].split()[start_idx:end_idx+1])

# original: the discourse text(s) of the same type for the same essay
original_text = original_train_df[original_train_df['id'] == text_id]
print(original_text.loc[original_text['discourse_type'] == label, 'discourse_text'])

['Venus', 'is', 'a', 'worthy', 'pursuit', 'despite', 'the', 'dangers.']
39834    Venus is a worthy pursuit despite the dangers 
Name: discourse_text, dtype: object


In [11]:
data_df.head()

Unnamed: 0,id,text,segment_label
0,7301B174090E,I believe that a B average would be a good thi...,"[[Position, 0, 14], [Claim, 32, 47], [Counterc..."
1,3799E21B6EC3,Venus is a worthy pursuit despite the dangers....,"[[Position, 0, 7], [Claim, 8, 28], [Evidence, ..."
2,29C5DBB0A339,Limiting car usage will have many advantages. ...,"[[Position, 0, 6], [Claim, 11, 12], [Claim, 14..."
3,1613BD216385,"""Making Mona Lisa Smile"" is about a computer h...","[[Lead, 0, 39], [Position, 40, 64], [Evidence,..."
4,D4A3E7EC982E,In this essay i will be explaining the differe...,"[[Lead, 0, 22], [Position, 23, 33], [Claim, 34..."


In [13]:
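# Sanity check that the dataset preprocessing works.
# Change the index marked with #!# to print, for that dataset row, the parts where the preprocessed labels differ from the original labels.

# data = data_df
# is_train = True

# index = 2 #!# change this to try different rows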

# text = data.text[index]        
# text_id = data.id[index]
# segment_label_list = data.segment_label[index] if is_train else None

# # TOKENIZE TEXT
# encoding = tokenizer(
#     text.split(),
#     is_split_into_words=True,
#     padding='max_length', #!# need to check exist seq s.t. longer than 4094
#     truncation=True, #!# need to check exist seq s.t. longer than 4094
#     max_length=500
# )
        
# word_ids = encoding.word_ids()

# segment_ids_list = [[labels_to_ids[label], start_idx, end_idx] for label, start_idx, end_idx in segment_label_list]

# processed_list = []
# for ids, start_idx, end_idx in segment_ids_list:
#     start_word_ids = word_ids.index(start_idx)
#     end_word_ids = word_ids.index(end_idx)
    
#     processed_list.append(tokenizer.decode(encoding.input_ids[start_word_ids:end_word_ids+1]))
    
# original_list = list(train_df[train_df['id'] == text_id]['discourse_text'])

# is_same = True
# for p_discourse, o_discourse in zip(processed_list, original_list):
#     if p_discourse.split() == o_discourse.split():
#         continue
        
#     else: 
#         is_same = False
#         for p, o in zip(p_discourse.split(), o_discourse.split()):
#             if p != o:
#                 print(p, o)
# if is_same:
#     print('every token in the label is same.')

In [14]:
# baseline : ignores paragraph breaks ("\n\n") and punctuation marks
#!# punctuation carries quite important information, so it should be handled eventually..
class dataset(Dataset):
    """Tokenised essays with fixed-size DETR-style segment targets.

    Each item is a dict of tensors produced by the tokenizer. When
    ``is_train`` is true an extra ``'labels'`` entry of shape
    ``[max_segment, 3]`` is added, whose rows are
    ``[class_id, start_word_idx, end_word_idx]``; unused rows are all zeros
    (class id 0 is the padding class ``'O'``).

    Args:
        dataframe: Frame with a ``text`` column and, when ``is_train`` is
            true, a ``segment_label`` column of ``[label, start, end]`` lists.
        tokenizer: Callable tokenizer applied to the whitespace-split words.
        max_len: Length every token sequence is padded / truncated to.
        is_train: Whether to build the ``'labels'`` target tensor.
        max_segment: Number of target rows per essay. When ``None`` (the
            default, kept for backward compatibility) the notebook-level
            global ``max_segment`` is read at item-access time.
    """

    def __init__(self, dataframe, tokenizer, max_len, is_train, max_segment=None):
        super(dataset, self).__init__()
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train # if test (or validation) period, we won't use word label
        self.max_segment = max_segment

    def _num_target_rows(self):
        """Return the number of target rows, falling back to the global ``max_segment``."""
        if self.max_segment is not None:
            return self.max_segment
        return max_segment  # notebook-level global (legacy behaviour)

    def _build_targets(self, segment_label_list):
        """Convert ``[label, start, end]`` triples into a zero-padded long tensor.

        Raises:
            ValueError: If the essay has more segments than target rows.
        """
        num_rows = self._num_target_rows()
        rows = [
            [labels_to_ids[label], start_idx, end_idx]
            for label, start_idx, end_idx in segment_label_list
        ]
        if len(rows) > num_rows:
            raise ValueError(
                f'essay has {len(rows)} segments but max_segment is {num_rows}'
            )

        #!# detr label padding : x, y of padded rows are 0 for now; whether random values work better is open
        # Allocate as long so class ids / word indices are not promoted to float,
        # and so that an essay without any segment still yields a [num_rows, 3] tensor.
        targets = torch.zeros((num_rows, 3), dtype=torch.long)
        if rows:
            targets[:len(rows)] = torch.as_tensor(rows, dtype=torch.long)
        return targets

    def __getitem__(self, index):
        """Return the tokenised essay at ``index`` as a dict of tensors."""
        # GET TEXT
        text = self.data.text[index]

        # TOKENIZE TEXT
        encoding = self.tokenizer(
            text.split(),
            is_split_into_words=True,
            return_offsets_mapping=False, #!# how to use it for enabling tokenizer to "see" \n\n?
            padding='max_length', #!# need to check whether sequences longer than 4094 exist
            truncation=True, #!# need to check whether sequences longer than 4094 exist
            max_length=self.max_len
        )

        # CREATE TARGETS (the label column is only touched in training mode)
        if self.is_train:
            encoding['labels'] = self._build_targets(self.data.segment_label[index])

        # CONVERT TO TORCH TENSORS
        item = {k: torch.as_tensor(v) for k, v in encoding.items()}
        return item

    def __len__(self):
        return self.len

# build model

In [15]:
# Based on https://github.com/facebookresearch/detr ; this adaptation still needs review.
class DetrHead(nn.Module):
    """DETR-style model: feature extractor -> transformer -> prediction head.

    Args:
        feature_extractor: Module mapping the input batch to token features
            (assumed [b, s, d_model] per the original author's note).
        transformer: Module called as ``transformer(features, queries)``.
        prediction_head: Module applied to the transformer output.
        pos_emb: Module returning positional encodings that are added to the
            token features.
        max_seq: Accepted for interface compatibility; not used here.
        max_segment: Number of learned query vectors.
        d_model: Size of each learned query vector.
    """

    def __init__(self, feature_extractor, transformer, prediction_head, pos_emb, max_seq, max_segment, d_model):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.transformer = transformer
        self.prediction_head = prediction_head

        # absolute positional encoding (sinusoidal, "Attention Is All You Need")
        self.feature_pos = pos_emb
        # learned queries, initialised uniformly in [0, 1)
        self.query_pos = nn.Parameter(torch.rand(max_segment, d_model))

    def forward(self, x):
        features = self.feature_extractor(x)
        positioned = features + self.feature_pos(features)
        # one copy of the learned queries per batch element
        queries = repeat(self.query_pos, 'i j -> b i j', b = features.size(0))
        # presumably [b, max_segment, d_model] (one vector per query) — TODO confirm
        decoded = self.transformer(positioned, queries)
        return self.prediction_head(decoded)
    
class FeatureExtractor(nn.Module):
    """Wraps a language model and returns its ``last_hidden_state``.

    Args:
        lm: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with ``last_hidden_state``.
    """

    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, x):
        # x is a mapping holding at least 'input_ids' and 'attention_mask'
        lm_output = self.lm(
            input_ids = x['input_ids'],
            attention_mask = x['attention_mask'],
        )
        return lm_output.last_hidden_state #!# todo : try other layer
    
class Transformer(nn.Module): #!# todo : change transformer structure with user defined transformer structure
    """Thin wrapper around a batch-first ``torch.nn.Transformer``.

    Args:
        d_model: Feature size of the inputs and outputs.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
    """

    def __init__(self, d_model, nhead = 12, num_encoder_layers = 2, num_decoder_layers = 2, dim_feedforward = 256):
        super().__init__()
        # https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
        transformer_kwargs = dict(
            nhead = nhead,
            num_encoder_layers = num_encoder_layers,
            num_decoder_layers = num_decoder_layers,
            dim_feedforward = dim_feedforward,
            batch_first = True,
        )
        self.transformer = nn.Transformer(d_model, **transformer_kwargs)

    def forward(self, f, q):
        # f is passed as the encoder input (src), q as the decoder input (tgt)
        return self.transformer(f, q)

class PredictionHead(nn.Module): #!# todo : try diff. prediction head
    """Two parallel linear heads: class logits and a 2-value segment output.

    Args:
        d_model: Size of the incoming feature vectors.
        num_class: Number of classes; one extra logit is added for the null class.
    """

    def __init__(self, d_model, num_class):
        super().__init__()
        # +1 for null class
        self.fc_layer_class = nn.Linear(d_model, num_class + 1)
        self.fc_layer_segment = nn.Linear(d_model, 2)

    def forward(self, x):
        class_logits = self.fc_layer_class(x)
        segment_bounds = self.fc_layer_segment(x)
        return (class_logits, segment_bounds)
    
import math
class PositionalEmbedding(nn.Module): #!# ref) https://github.com/codertimo/BERT-pytorch
    """Fixed sinusoidal positional encoding table.

    Builds a ``[1, max_len, d_model]`` buffer ``pe`` in which even feature
    columns hold ``sin(pos * 10000^(-2i / d_model))`` and odd columns hold the
    matching cosine. Both even and odd ``d_model`` are supported.

    Args:
        d_model: Feature size of the encoding.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len):
        super().__init__()
        pos = torch.arange(0, max_len).float()
        div = (-(torch.arange(0, d_model, 2).float() / d_model) * math.log(10000.0)).exp()
        # Outer product via broadcasting: [max_len, ceil(d_model / 2)]
        angles = pos.unsqueeze(1) * div.unsqueeze(0)

        pe = torch.zeros(max_len, d_model, requires_grad = False).float()
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so the last frequency has no cosine slot.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Registered as a buffer so it follows the module across devices
        # without being a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions, shape ``[1, s, d_model]``."""
        return self.pe[:, :x.size(1)]

In [16]:
# define loss function
# fork ref) https://github.com/facebookresearch/detr/blob/091a817eca74b8b97e35e4531c1c39f89fbe38eb/models/detr.py#L83
# code ref) https://www.kaggle.com/tanulsingh077/end-to-end-object-detection-with-transformers-detr

import sys
sys.path.append('./detr/')

from detr.models.matcher import HungarianMatcher
from detr.models.detr import SetCriterion

# Hungarian matcher with its default costs, equal weights for the three DETR
# loss terms, and the list of loss names to compute.
matcher = HungarianMatcher()
weight_dict = {
    'loss_ce': 1,
    'loss_bbox': 1,
    'loss_giou': 1,
}
losses = ['labels', 'boxes', 'cardinality']

class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric.

    Attributes are ``val`` (last value), ``sum`` (weighted sum), ``count``
    (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

# Test code (model)

Do not trust any part blindly. Being able to write code that lets you verify things is a skill too.

Work in progress...

In [17]:
# Small BERT checkpoint used only for the model smoke test below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
lm = AutoModel.from_pretrained('bert-base-uncased') #!# must be downloaded to local disk for submission.

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Training dataset / loader for the smoke test; every essay is padded or truncated
# to lm.config.max_position_embeddings tokens.
# NOTE: dataset.__getitem__ reads the notebook-level `max_segment`, which is only
# assigned in the next cell, so the loader must not be iterated before that cell runs.
train_dataset = dataset(train_df, tokenizer, lm.config.max_position_embeddings, is_train = True)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle = True)

In [19]:
'''model parameter'''
# Sequence length and feature size are taken from the language model's config.
max_seq = lm.config.max_position_embeddings
d_model = lm.config.hidden_size

# max_segment is the number of learned queries in DetrHead and is also read as a
# global by dataset.__getitem__ when padding the targets.
max_segment = 20
num_class = len(output_labels_detr)

# Assemble the DETR-style model from its parts.
FEATURE_EXTRACTOR = FeatureExtractor(lm)
TRANSFORMER = Transformer(d_model)
PREDICTION_HEAD = PredictionHead(d_model, num_class)
PE = PositionalEmbedding(d_model, max_seq)
DETR_HEAD = DetrHead(FEATURE_EXTRACTOR, TRANSFORMER, PREDICTION_HEAD, PE, max_seq, max_segment, d_model)

# Take only the first batch from the loader.
for batch in train_loader:
    break

# Move the batch and the model to the selected device, then run one forward pass.
batch = {k : v.to(device) for k, v in batch.items()}
DETR_HEAD.to(device)

out = DETR_HEAD(batch)

# Setting

In [36]:
# Tokenizer is loaded from the local './model' directory; the language model
# weights are loaded by name ('google/bigbird-roberta-base').
tokenizer = AutoTokenizer.from_pretrained('./model')
lm = AutoModel.from_pretrained('google/bigbird-roberta-base') #!# must be downloaded to local disk for submission.

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
'''model parameter'''
# Sequence length and feature size are taken from the language model's config.
max_seq = lm.config.max_position_embeddings
d_model = lm.config.hidden_size

# Number of learned queries; also read as a global by dataset.__getitem__.
max_segment = 100
num_class = len(output_labels_detr)

# Assemble the DETR-style model from its parts and move it to the selected device.
FEATURE_EXTRACTOR = FeatureExtractor(lm)
TRANSFORMER = Transformer(d_model)
PREDICTION_HEAD = PredictionHead(d_model, num_class)
PE = PositionalEmbedding(d_model, max_seq)
DETR_HEAD = DetrHead(FEATURE_EXTRACTOR, TRANSFORMER, PREDICTION_HEAD, PE, max_seq, max_segment, d_model)
DETR_HEAD.to(device)

DetrHead(
  (feature_extractor): FeatureExtractor(
    (lm): BigBirdModel(
      (embeddings): BigBirdEmbeddings(
        (word_embeddings): Embedding(50358, 768, padding_idx=0)
        (position_embeddings): Embedding(4096, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BigBirdEncoder(
        (layer): ModuleList(
          (0): BigBirdLayer(
            (attention): BigBirdAttention(
              (self): BigBirdBlockSparseAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
              )
              (output): BigBirdSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): Laye

In [192]:
# Restrict the process to GPUs 0 and 1.
# NOTE(review): this variable is normally only honoured when set before CUDA is
# initialised in the process; earlier cells already move tensors to `device` —
# confirm it takes effect here, or move it to the top of the notebook.
os.environ["CUDA_VISIBLE_DEVICES"]= '0,1'

In [50]:
# TRAIN DATASET AND VALID DATASET
# NOTE: `config` is defined in a cell further down the notebook; that cell has to
# be run before this one.
train_params = {'batch_size': config['train_batch_size'], 'shuffle': True}

train_loader = DataLoader(train_dataset, **train_params, 
                             # collate_fn = collate_batch
                            )

# Validation loader is not wired up yet (no valid_dataset is created above).
# valid_params = {'batch_size': config['valid_batch_size'],
#                 'shuffle': False,
#                 'num_workers': 1,
#                 'pin_memory':True
#                 }

# validing_loader = DataLoader(valid_dataset, **valid_params)

In [49]:
# Fetch a single batch for inspection. The loader created above is named
# `train_loader`; `training_loader` is never defined in this notebook.
batch = next(iter(train_loader))

In [None]:
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
def train(epoch):
    """Run one training epoch and print the running and final loss / accuracy.

    Reads the notebook-level globals ``model``, ``optimizer``, ``config`` and
    ``train_loader``. The model is called with ``labels`` and
    ``return_dict=False`` and is expected to return ``(loss, logits)``.

    Args:
        epoch: Index of the current epoch; not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0

    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):

        ids = batch['input_ids'].to(config['device'], dtype = torch.long)
        mask = batch['attention_mask'].to(config['device'], dtype = torch.long)
        labels = batch['labels'].to(config['device'], dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 200==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss after {idx:04d} training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels (-100 marks ignored positions)
        active_accuracy = flattened_targets != -100 # shape (batch_size * seq_len,)
        active_labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(active_labels.cpu().numpy(), predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must run after backward() and before step(), otherwise
        # it only touches the previous step's gradients, which zero_grad() discards.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=config['max_grad_norm']
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
# CREATE MODEL
# Builds a token-classification model from the files under DOWNLOADED_MODEL_PATH
# (config.json and pytorch_model.bin), moves it to the configured device and
# creates an Adam optimizer using the first scheduled learning rate.
# AutoModelForTokenClassification comes from the notebook's import cell.
# NOTE: DOWNLOADED_MODEL_PATH and config are defined in cells further down the notebook.
config_model = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH+'/config.json') 
model = AutoModelForTokenClassification.from_pretrained(
    DOWNLOADED_MODEL_PATH+'/pytorch_model.bin',
    config=config_model
)
model.to(config['device'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

In [None]:
# LOOP TO TRAIN MODEL (or load model)
# NOTE(review): LOAD_MODEL_FROM and VER are not defined in any visible cell —
# they must be set before this cell is run.
if not LOAD_MODEL_FROM:
    for epoch in range(config['epochs']):

        print(f"### Training epoch: {epoch + 1}")
        # apply this epoch's learning rate from the manual schedule
        for g in optimizer.param_groups:
            g['lr'] = config['learning_rates'][epoch]
        lr = optimizer.param_groups[0]['lr']
        print(f'### LR = {lr}\n')

        train(epoch)
        # Collect garbage first so the tensors it frees can actually be
        # released from the CUDA caching allocator afterwards.
        gc.collect()
        torch.cuda.empty_cache()

    torch.save(model.state_dict(), f'bigbird_v{VER}.pt')

else:
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model.load_state_dict(
        torch.load(f'{LOAD_MODEL_FROM}/bigbird_v{VER}.pt', map_location=config['device'])
    )
    print('Model loaded.')

# todo

In [None]:
# code for submission
# ref) https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615?scriptVersionId=83230719
def _read_text_files(directory, show_progress=False):
    """Read every file in ``directory`` and return ``(names, texts)``.

    ``names`` are the file names with '.txt' removed; ``texts`` are the file
    contents in the same order. Each file is closed as soon as it is read.
    """
    file_names = list(os.listdir(directory))
    names, texts = [], []
    for f in (tqdm(file_names) if show_progress else file_names):
        names.append(f.replace('.txt', ''))
        with open(os.path.join(directory, f), 'r') as text_file:
            texts.append(text_file.read())
    return names, texts


test_names, test_texts = _read_text_files('../input/feedback-prize-2021/test')
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})

# Train ids get their own name so the test ids collected above are not overwritten.
train_names, train_texts = _read_text_files('../input/feedback-prize-2021/train', show_progress=True)
train_text_df = pd.DataFrame({'id': train_names, 'text': train_texts})


# # TEST DATASET
# test_texts_set = dataset(test_texts, tokenizer, config['max_length'], True)
# test_texts_loader = DataLoader(test_texts_set, **test_params)

In [38]:
# define config
# NOTE: MODEL_NAME is defined in the download cell further down the notebook.
config = {'model_name': MODEL_NAME,
          'max_length': 1024,
          'train_batch_size':4,
          'valid_batch_size':4,
          'epochs':5,
          # one learning rate per epoch (indexed by epoch in the training loop)
          'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
          'max_grad_norm':10,
          # `cuda` alone is never imported in this notebook; go through `torch`.
          'device': 'cuda' if torch.cuda.is_available() else 'cpu'}

In [13]:
# if you are first running this code, please download LM.
MODEL_NAME = 'google/bigbird-roberta-base' # choose which model to download
DOWNLOADED_MODEL_PATH = 'model'            # choose where to download the model

if DOWNLOADED_MODEL_PATH == 'model':
    # exist_ok keeps the cell re-runnable once the directory has been created
    os.makedirs(DOWNLOADED_MODEL_PATH, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True) #!# add_prefix_space?
    tokenizer.save_pretrained(DOWNLOADED_MODEL_PATH)

    config_model = AutoConfig.from_pretrained(MODEL_NAME)
    config_model.num_labels = 15
    config_model.save_pretrained(DOWNLOADED_MODEL_PATH)

    # AutoModelForTokenClassification comes from the notebook's import cell.
    backbone = AutoModelForTokenClassification.from_pretrained(MODEL_NAME,
                                                               config=config_model)
    backbone.save_pretrained(DOWNLOADED_MODEL_PATH)

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

In [173]:
#!# implement tokenizer \n\n, to see paragraph information
# ref) https://github.com/huggingface/tokenizers/issues/247
# ref) https://www.kaggle.com/c/feedback-prize-2021/discussion/296713
# Round-trip check of how the tokenizer encodes the marker; the result is discarded
# because this is not the last expression of the cell.
# NOTE(review): r'\\n\\n' is the six-character string backslash, backslash, 'n',
# backslash, backslash, 'n' — not two newline characters. Confirm this is the
# marker that is meant to be inserted into the text.
tokenizer.decode(tokenizer(r'\\n\\n', return_offsets_mapping=True)['input_ids'])

'''add new special token to model and tokenizer'''
# Register the marker as an additional special token and grow the embedding
# matrix of the notebook-level `model` to the new vocabulary size.
special_tokens_dict = {'additional_special_tokens': [r'\\n\\n']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level tags to token level.

    Uses the notebook-level ``tokenizer``. Every token receives the tag of the
    word it came from; tokens without a source word (``word_id`` is ``None``)
    receive ``-100``.

    Args:
        examples: Mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word tag lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    tokenized_inputs["labels"] = [
        [
            -100 if word_idx is None else word_tags[word_idx]
            for word_idx in tokenized_inputs.word_ids(batch_index=row)
        ]
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs