In [1]:
import random
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from sklearn.model_selection import *
from transformers import *

In [8]:
CFG = {
    'fold_num': 5, 
    'seed': 42,
    'model': 'google/bigbird-roberta-base',
    'max_len': 1024,
    'epochs': 5,
    'train_bs': 24,
    'valid_bs': 32,
    'lr': 2e-5,
    'num_workers': 0,
    'weight_decay': 1e-6,
}

In [9]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['seed'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
test_df = pd.read_csv('./sample_submission.csv')
test_df

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,,
1,D46BCB48440A,,
2,0FB0700DAF44,,
3,D72CB1C11673,,
4,DF920E0A7337,,


In [11]:
test_names, test_texts = [], []
for f in tqdm(list(os.listdir('./test'))):
    test_names.append(f.replace('.txt', ''))
    test_texts.append(open('./test/' + f, 'r').read())
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})
test_texts['text'] = test_texts['text'].apply(lambda x:x.split())
test_texts

100%|██████████| 5/5 [00:00<00:00, 2360.86it/s]


Unnamed: 0,id,text
0,DF920E0A7337,"[Have, you, ever, asked, more, than, one, pers..."
1,0FB0700DAF44,"[During, a, group, project,, have, you, ever, ..."
2,D46BCB48440A,"[When, people, ask, for, advice,they, sometime..."
3,18409261F5C2,"[80%, of, Americans, believe, seeking, multipl..."
4,D72CB1C11673,"[Making, choices, in, life, can, be, very, dif..."


In [12]:
tokenizer = AutoTokenizer.from_pretrained(CFG['model'], add_prefix_space=True)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [13]:
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.text.values[idx]
        
        return text

In [14]:
def collate_fn(data):
    tokenized_inputs = tokenizer(
        data,
        max_length=CFG['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    words = []
    for i in range(len(data)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        words.append(word_ids)

    tokenized_inputs["word_ids"] = words
    
    return tokenized_inputs

In [17]:
test_loader = DataLoader(MyDataset(test_texts), batch_size=CFG['valid_bs'], collate_fn=collate_fn, shuffle=False)
batch = next(iter(test_loader))
batch

{'input_ids': tensor([[   65,  8293,   446,  ...,     0,     0,     0],
        [   65,  5957,   358,  ...,     0,     0,     0],
        [   65,  1750,   762,  ...,     0,     0,     0],
        [   65,  4120,   105,  ...,   685,   439,    66],
        [   65, 16528,  7848,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'word_ids': [[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 90, 91, 92, 93, 94, 95, 96, 97, 98, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 107, 108, 

In [None]:
model =  AutoModelForTokenClassification.from_pretrained(CFG['model'], num_labels=15).to(device)
model.load_state_dict(torch.load('../input/feedback-bigbird/bigbird-roberta-base_fold_0.pt'))
model.eval()

In [None]:
y_pred = []
words = []

with torch.no_grad():
    tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True)
    for step, batch in enumerate(tk):
        word_ids = batch['word_ids']
        words.extend(word_ids)
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}

        output = model(**batch).logits

        y_pred.extend(output.argmax(-1).cpu().numpy())
        
y_pred = np.array(y_pred)

In [None]:
labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

In [None]:
final_preds = []

for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred = ['']*len(test_texts.text.values[i])

    for j in range(len(y_pred[i])):
        if words[i][j] != None:
            pred[words[i][j]] = labels[y_pred[i][j]]

    preds = []
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O' or cls == '' or cls[0]== 'I':
            j += 1
        else:
            end = j + 1
            while end < len(pred) and pred[end].replace('B-','').replace('I-','') == cls.replace('B-','').replace('I-',''):
                end += 1

            if end - j > 5:
                final_preds.append((idx, cls.replace('B-',''), ' '.join(map(str, list(range(j, end))))))

            j = end
        
final_preds[0]

In [None]:
sub = pd.DataFrame(final_preds)
sub.columns = test_df.columns
sub

In [None]:
sub.to_csv('submission.csv', index=False)