reference : https://www.kaggle.com/code/suraj520/beginner-friendly-bert

import modules

In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

Loading the data

In [2]:
train_data = pd.read_csv('./CommonLit_data/summaries_train.csv')
test_data = pd.read_csv('./CommonLit_data/summaries_test.csv')

Preprocessing the data

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# batch_encode_plus는 여러개의 텍스트 문장을 한 번에 인코딩하는데 사용된다.
train_encodings = tokenizer.batch_encode_plus( 
    # tolist()는 pandas의 DataFrame이 제공하는 메서드로, 해당 데이터를 리스트로 변환한다.
    # batch_encode_plus는 입력으로 Python 리스트, 튜플 또는 다른 시퀀스 형태의 데이터를 받을 수 있으므로
    train_data['text'].tolist(), 
    truncation=True,
    padding=True
)
test_encodings = tokenizer.batch_encode_plus(
    test_data['text'].tolist(),
    truncation=True,
    padding=True
)

train_dataset = torch.utils.data.TensorDataset( 
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    
    # torch.tensor가 pandas의 Series객체를 직접 처리할 수 없어서 리스트로 변환
    torch.tensor(train_data['content'].tolist()), 
    torch.tensor(train_data['wording'].tolist())
)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask'])
)

In [18]:
# train_dataset 내용 확인

print("첫 번째 문장의 Inputs ids",train_dataset[0][0])
print("첫 번째 문장의 attention Mask", train_dataset[0][1])
print("첫 번째 문장의 'content'",train_dataset[0][2])
print("첫 번째 문장의 'wording'",train_dataset[0][3])

print("첫 번째 문장의 Inputs ids shape ",train_dataset[0][0].shape)
print("첫 번째 문장의 attention Mask shape", train_dataset[0][1].shape)

첫 번째 문장의 Inputs ids tensor([  101,  1996,  2353,  4400,  2001,  2019,  7551,  3406,  2156,  2129,
         2111, 14831,  2000,  1037,  2047,  2028,  3003,  2231,  1012,  2009,
         4227,  6217,  2004,  2111,  2359,  2000,  3046,  2047,  2477,  1012,
         1996,  2493,  3582,  2505,  2008,  2003,  2056,  1998,  2707,  3810,
         2006,  2169, 14573,  2121,  2000,  5114,  3020,  2373,  1012,  2027,
         2018,  2000,  2644,  1996,  4654,  4842, 13665,  2004,  2205,  2116,
         2111,  2288,  2000,  7490,  2007,  2009, 25734,  2206,  2045,  3003,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  

Defining the BERT model

In [20]:
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.linear1 = nn.Linear(768, 256)
        self.linear2 = nn.Linear(256, 2)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # pooler_output? BERT의 마지막 Transformer 계층에서 [CLS] 토큰의 출력을 가져와서
        # 추가적인 Dense 레이어를 통과시킨 결과이다.
        # 이 CLS 토큰의 임베딩은 전체 입력 시퀀스의 문맥적 요약으로 사용된다.
        # 따라서 이 CLS 토큰의 임베딩(pooler_output)을 이용해서 추가 작업을 한다.
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        
        output = self.linear1(pooled_output)
        output = nn.ReLU()(output)
        output = self.linear2(output)
        return output

Training the BERT Model

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

Creating data loader and performing sanity check

In [22]:
batch_size = 8

In [24]:
# Splitting training data into train and validation sets
train_dataset, val_datset = train_test_split(train_dataset, test_size=0.2, random_state=0)

# Creating train loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Creating validation loader
val_loader = torch.utils.data.DataLoader(val_datset, batch_size=batch_size, shuffle=False)

In [27]:
for batch in train_loader:
    print(batch)

# 여기서 각각의 batch가 뜻하는 것 : (input_ids, attention_mask, content, wording)
# 그래서 len(batch) 찍으면 모두 4가 나온다.

[tensor([[  101,  2093,  3787,  ...,     0,     0,     0],
        [  101,  1996,  4713,  ...,     0,     0,     0],
        [  101,  1996, 22089,  ...,     0,     0,     0],
        ...,
        [  101,  2012,  1996,  ...,     0,     0,     0],
        [  101,  1037, 19817,  ...,     0,     0,     0],
        [  101,  1996,  3252,  ...,     0,     0,     0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([-0.9019, -1.4279, -0.6486,  0.2057,  0.2057, -0.9813, -0.6556, -0.9385,
         0.1460,  0.3884, -0.3943,  0.2057,  1.2528,  1.4672,  0.2057,  2.1292]), tensor([-0.9165, -0.7907, -0.5480,  0.3805,  0.3805, -1.5489, -1.3454, -1.3016,
         0.0452, -0.7180, -0.6139,  0.3805,  1.3845, -0.5986,  0.3805,  2.3054])]
4
[tensor([[ 101, 1017, 3787,  ...,    0,    0,    0],
        [ 101, 1996, 2493,  ...,    0,

Training the model for 30 epochs

In [29]:
# Training loop
model.train()

for epoch in range(30):
    running_loss = 0.0
    for step, (input_ids, attention_mask, content, wording) in enumerate(train_loader):
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        content = content.to(device)
        wording = wording.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask)
        # outputs는 [batch_size, 2]가 될 것이다.
        loss = criterion(outputs[:, 0], content) + criterion(outputs[:, 1], wording)
        loss.backward()
        optimizer.step()
        
        if step % 500 == 0:
            print("Epoch {}, step {}, Loss {}".format(epoch+1, step, loss.item()))
            
        running_loss += loss.item()
        
    print("Epoch {} Loss {}".format(epoch+1, running_loss/len(train_loader)))
    
    # Validation Loop
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for val_step, (input_ids, attention_mask, content, wording) in enumerate(val_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            content = content.to(device)
            wording = wording.to(device)
            
            val_outputs = model(input_ids, attention_mask)
            val_loss += criterion(val_outputs[:, 0], content) + criterion(val_outputs[:, 1], wording)
            
            print("Validation Loss : {}".format(val_loss/len(val_loader)))
        model.train()