# pytorch simple bert 필사
competition : Toxic Comment Classification Challenge  
notebook link : https://www.kaggle.com/code/hawkeoni/pytorch-simple-bert

In [1]:
import os
from functools import partial
from typing import Tuple, List
from zipfile import BadZipFile
from zipfile import ZipFile as zz

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, RandomSampler
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel

In [2]:
# Unpack the training archive; extractall() without a target writes into the
# current working directory.
train_archive = "./Desktop/ff/code/data/toxic_comment/train.csv.zip"
try:
    with zz(train_archive) as zf:
        zf.extractall()
except (OSError, BadZipFile) as err:
    # Tolerate only a missing/unreadable/corrupt archive. A bare ``except``
    # would also swallow KeyboardInterrupt and hide why extraction failed.
    print(f"failed: {train_archive} ({err})")

In [3]:
def extract_archive(archive_path, dest_dir="."):
    """Extract one zip archive and report whether it succeeded.

    Args:
        archive_path: Path of the ``.zip`` file to unpack.
        dest_dir: Directory to extract into (defaults to the working directory).

    Returns:
        True when the archive was extracted, False when it was missing,
        unreadable or not a valid zip file (a message is printed in that case).
    """
    try:
        with zz(archive_path) as zf:
            zf.extractall(dest_dir)
    except (OSError, BadZipFile) as err:
        # Best-effort: keep the notebook running, but say what went wrong.
        print(f"failed: {archive_path} ({err})")
        return False
    return True


# The remaining competition archives live next to the training archive.
for archive_name in ("test.csv.zip", "test_labels.csv.zip", "sample_submission.csv.zip"):
    extract_archive(os.path.join("./Desktop/ff/code/data/toxic_comment", archive_name))

In [4]:
# Directory that train.csv is read from further down; "./" is the current working directory.
path = "./"

In [5]:
bert_model_name = "bert-base-cased"
# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# Padding and attention masks elsewhere in this notebook hard-code id 0,
# so fail fast if the tokenizer uses a different pad id.
assert tokenizer.pad_token_id == 0, "패딩 값은 0으로 설정되므로 모든곳에서 변경하세요"

# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

train_df = pd.read_csv(os.path.join(path, "train.csv"))
display(train_df.head())
# Hold out 5% of the rows for validation.
train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=SPLIT_SEED)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


train/validation용 Dataset과 iterator를 생성합니다. 데이터프레임 전체가 이미 메모리에 올라와 있으므로 엄밀한 의미의 lazy 로딩은 아니지만, `lazy=True`이면 각 행을 미리 텐서로 변환해 두지 않고 인덱싱(`__getitem__`)될 때 변환합니다.  
원문의 lazy는 "게으르다"보다는 "필요한 시점에(요청이 있을 때) 수행한다"는 뜻으로 이해하는 것이 적절합니다.  
출처 : https://zorba91.tistory.com/280

In [6]:
class ToxicDataset(Dataset):
    """Dataset of ``(token_ids, labels)`` pairs built from comment rows.

    With ``lazy=False`` every row of ``dataframe`` is tokenized in the
    constructor and the tensors are kept in ``self.X`` / ``self.Y``.
    With ``lazy=True`` only the dataframe is stored and each row is converted
    when it is indexed.

    Args:
        tokenizer: Object exposing ``pad_token_id`` and
            ``encode(text, add_special_tokens=True)``.
        dataframe: Rows with a ``comment_text`` column and the columns listed
            in ``LABEL_COLUMNS``.
        lazy: Convert rows on access instead of up front.
    """

    # Label columns read from each row, in the order they appear in ``y``.
    LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    # Maximum number of token ids kept per comment (special tokens included).
    MAX_LEN = 120

    def __init__(self, tokenizer: BertTokenizer, dataframe: pd.DataFrame, lazy: bool = False):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.lazy = lazy

        if not self.lazy:
            self.X = []
            self.Y = []
            # iterrows() yields (index, row) pairs; the index is not needed.
            for _, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
                x, y = self.row_to_tensor(self.tokenizer, row)
                self.X.append(x)
                self.Y.append(y)
        else:
            self.df = dataframe

    @staticmethod
    def row_to_tensor(tokenizer: BertTokenizer, row: pd.Series) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """Convert one dataframe row into model inputs.

        Args:
            tokenizer: Tokenizer used to encode ``row["comment_text"]``.
            row: A row holding ``comment_text`` and the label columns.

        Returns:
            ``(x, y)`` where ``x`` is a 1-D int64 tensor of at most ``MAX_LEN``
            token ids and ``y`` is a 1-D float32 tensor with one entry per
            label column.
        """
        tokens = tokenizer.encode(row["comment_text"], add_special_tokens=True)
        max_len = ToxicDataset.MAX_LEN
        if len(tokens) > max_len:
            # Keep the head of the sequence but preserve the final token
            # (the closing special token appended by encode()).
            tokens = tokens[:max_len - 1] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        # The label slice of a mixed row is an object-dtype Series; convert it
        # to a float32 array explicitly instead of relying on torch iterating
        # over the Series element by element.
        labels = row[ToxicDataset.LABEL_COLUMNS].to_numpy(dtype="float32")
        y = torch.tensor(labels)
        return x, y

    def __len__(self):
        """Number of examples in the dataset."""
        if self.lazy:
            return len(self.df)
        return len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """Return the ``(token_ids, labels)`` pair at position ``index``."""
        if self.lazy:
            return self.row_to_tensor(self.tokenizer, self.df.iloc[index])
        return self.X[index], self.Y[index]
        
def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]], device: torch.device) \
    -> Tuple[torch.LongTensor, torch.LongTensor]:
    """Merge a list of ``(token_ids, labels)`` samples into one padded batch.

    Token sequences are right-padded with 0 to the longest sequence in the
    batch (batch dimension first), label vectors are stacked, and both
    results are moved to ``device``.
    """
    token_seqs, label_vecs = zip(*batch)
    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    stacked_labels = torch.stack(label_vecs)
    return padded_tokens.to(device), stacked_labels.to(device)

In [8]:
# Scratch check of what torch.LongTensor / torch.FloatTensor produce for a row:
# the label columns come out as a vector of 0/1 values.
# FloatTensor holds 32-bit floats, LongTensor holds 64-bit integers.

def row_to_tensor(tokenizer: BertTokenizer, row: pd.Series) -> Tuple[torch.LongTensor, torch.FloatTensor]:
    """Module-level convenience wrapper around ``ToxicDataset.row_to_tensor``.

    Delegates to the dataset's static method so this exploration helper and
    the dataset cannot drift apart.

    Args:
        tokenizer: Tokenizer used to encode ``row["comment_text"]``.
        row: A dataframe row with the comment text and label columns.

    Returns:
        The ``(token_ids, labels)`` tensors produced by the dataset method.
    """
    return ToxicDataset.row_to_tensor(tokenizer, row)

# Peek at the tensors produced for the first five training rows.
# head(5) bounds the loop directly, so no manual counter/break is needed and
# no progress bar gets interleaved with the printed tensors.
testt = []
for i, row in train_df.head(5).iterrows():
    xx, yy = row_to_tensor(tokenizer, row)
    print(xx)
    print(yy)
    testt.append(xx)

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1279 > 512). Running this sequence through the model will result in indexing errors
4it [00:00, 74.48it/s]

tensor([  101,  1130,  1134,  1692,  1128,  1431,  5782,  1155,  5991,  7582,
         1116,  1106,  1992,  2557,  1176, 27730,  1116, 14467,  4964,   117,
         6567,  1647,  3576,   117,  3576,   119,   102])
tensor([0., 0., 0., 0., 0., 0.])
tensor([ 101, 1119, 1110, 1145, 7495, 1126, 6298, 1751, 1106, 1748, 1117, 2357,
        2035, 1113, 1330, 4795,  119,  102])
tensor([0., 0., 0., 0., 0., 0.])
tensor([  101,   107,   134,   134,  3291, 22808,  1306,   147,  1477,   118,
         2896,  4490,  8950,   118, 14890, 11273,  2225,   118,   163,  9359,
         1377,   134,   134,   138,  1403,  8871,   119,  1987,   119,  5728,
          156, 26989, 14021,  1182,  1138,  1103,  1509,  1937,  1164,  3291,
        22808,  1306,  7434, 17926,   119,  1124,   117,  1167,  1190,  2256,
         1950,   117,  1110,  1103,  1211, 27709,  1825,  1115,  1169, 10086,
         4237,  1105,  2276,  1869,  1116,  1164,  3291, 22808,  1306,  7434,
        17926,   117,  1136,   147, 13836,  5253,




In [9]:
BATCH_SIZE = 32

# lazy=True: rows are converted to tensors when indexed, not up front.
train_dataset = ToxicDataset(tokenizer, train_df, lazy=True)
dev_dataset = ToxicDataset(tokenizer, val_df, lazy=True)

# Bind the target device once so the loaders can call collate_fn(batch).
collate_fn = partial(collate_fn, device=device)

train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)

train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
    collate_fn=collate_fn,
)
dev_iterator = DataLoader(
    dev_dataset,
    batch_size=BATCH_SIZE,
    sampler=dev_sampler,
    collate_fn=collate_fn,
)

In [10]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a linear multi-label classification head.

    Args:
        bert: Encoder module; must expose ``config.hidden_size`` and return a
            sequence whose element at index 1 has ``hidden_size`` features
            per example.
        num_classes: Number of independent binary labels to predict.
    """

    def __init__(self, bert: BertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None):
        """Run the encoder and head.

        Returns:
            ``(loss, probabilities)``. ``probabilities`` are the per-label
            sigmoid outputs. ``loss`` is the mean binary cross-entropy against
            ``labels``, or the integer 0 when ``labels`` is None.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        pooled = outputs[1]  # per-example summary vector: (batch, hidden)
        logits = self.classifier(pooled)  # (batch, num_classes)
        probabilities = torch.sigmoid(logits)

        loss = 0
        if labels is not None:
            # Compute the loss from the raw logits: the fused form is
            # numerically stable, whereas sigmoid followed by BCELoss
            # saturates and clamps for large-magnitude logits.
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels)
        return loss, probabilities

In [11]:
# Load the pretrained encoder, attach a 6-label head, then move it to `device`.
bert_encoder = BertModel.from_pretrained(bert_model_name)
model = BertClassifier(bert_encoder, 6)
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def train(model, iterator, optimizer, scheduler):
    """Run one optimisation pass over ``iterator`` and print the mean batch loss.

    For every batch the gradients are reset, the model is called with an
    attention mask that is 1.0 wherever the token id is non-zero, and the
    optimizer and scheduler are each stepped once.
    """
    model.train()
    running_loss = 0
    for token_ids, targets in tqdm(iterator):
        optimizer.zero_grad()
        attention = (token_ids != 0).float()
        batch_loss, _ = model(token_ids, attention_mask=attention, labels=targets)
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
    mean_loss = running_loss / len(iterator)
    print(f"train loss {mean_loss}")
    
def evaluate(model, iterator):
    """Score ``model`` on ``iterator`` and print per-label ROC AUC and mean loss.

    Runs in eval mode without gradient tracking. For every batch the model is
    called with an attention mask that is 1.0 wherever the token id is
    non-zero; labels and predictions are collected on the CPU and one ROC AUC
    value is printed per label column, followed by the mean batch loss.
    """
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    model.eval()
    pred_chunks = []
    true_chunks = []
    total_loss = 0.0
    with torch.no_grad():
        for x, y in tqdm(iterator):
            mask = (x != 0).float()
            loss, outputs = model(x, attention_mask=mask, labels=y)
            # .item() keeps the accumulator a plain Python float (as train()
            # does) instead of a tensor that prints as "tensor(..., device=...)".
            total_loss += loss.item()
            true_chunks.append(y.cpu().numpy())
            pred_chunks.append(outputs.cpu().numpy())
    # Join the per-batch arrays once instead of growing nested Python lists.
    true = np.concatenate(true_chunks)
    pred = np.concatenate(pred_chunks)
    for i, name in enumerate(label_names):
        print(f"{name} roc_auc {roc_auc_score(true[:, i], pred[:, i])}")
    print(f"eval loss {total_loss / len(iterator)}")

In [13]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
EPOCH_NUM = 2
warmup_steps = 10 ** 3
# The scheduler is stepped once per batch, so the schedule must span every
# batch of every epoch. The warmup steps are part of this total; subtracting
# them would drive the learning rate to zero before training finishes.
total_steps = len(train_iterator) * EPOCH_NUM
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)



In [14]:
# One training pass followed by one validation pass per epoch.
for i in range(EPOCH_NUM):
    banner = " ".join(("-" * 30, f"epoch {i}", "=" * 30))
    print(banner)
    train(model, train_iterator, optimizer, scheduler)
    evaluate(model, dev_iterator)



  0%|                                                                              | 1/4738 [00:33<44:07:38, 33.54s/it]


KeyboardInterrupt: 