# 專題（二）：訓練 Bert 新聞觀點分類器並提升精準度

## 專案目標
- 目標：請試著使用含有 pair sentence 的 Training Dataset 去訓練 Bert 分類器 (BertForSequenceClassification)，並且在 Test Dataset 上驗證模型的精準度
- 資料集 in archive.zip：
    - 包含：train.csv、test.csv、solution.csv
    - 資料來源：https://www.kaggle.com/wsdmcup/wsdm-fake-news-classification
    - 資料中包含兩個新聞標題 title1_zh 和 title2_zh，並且給予這兩篇新聞的相關性，分別可能是：agreed, unrelated, disagreed

## 實作提示
- STEP1 - STEP4：資料處理
- STEP5：撰寫 train_batch 函數
- STEP6：撰寫 evaluate 函數
- STEP7：組合以上開始訓練，如果正確 validation accuracy 應該可以超過 85% 以上
- STEP8：對 testing dataset 進行測試，並計算 accuracy

## 重要知識點：專題結束後可以學會
- 了解 BERT 的 2-Sequence Classification 任務如何進行
- 使用 TRAIN / VALID DATA 來了解深度學習模型的訓練情形
- 了解預訓練模型在 NLP 上的威力

In [1]:
# from: https://www.kaggle.com/wsdmcup/wsdm-fake-news-classification
!unzip archive.zip

Archive:  archive.zip
  inflating: solution.csv            
  inflating: test.csv                
  inflating: train.csv               


In [2]:
import numpy as np
import pandas as pd

import torch
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertForSequenceClassification

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train = df_train[['title1_zh', 'title2_zh', 'label']].dropna(axis=0, how='any').reset_index(drop=True)
df_test = df_test[['id', 'title1_zh', 'title2_zh']].dropna(axis=0, how='any').reset_index(drop=True)

In [5]:
ALL_LABELS = ['agreed', 'unrelated', 'disagreed']

In [6]:
MODEL_NAME = 'bert-base-chinese'

In [7]:
# 建置數據集
class NewsPairDataset(Dataset):
    def __init__(self, tokenizer, df, max_len=512):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len

    def __getitem__(self, idx):
        text1 = self.df.loc[idx, 'title1_zh']
        text2 = self.df.loc[idx, 'title2_zh']
        label = self.df.loc[idx, 'label'] if 'label' in self.df.columns else None

        text1_tokens = self.tokenizer.tokenize(text1)
        text2_tokens = self.tokenizer.tokenize(text2)
        len_all_tokens = len(text1_tokens) + len(text2_tokens) + 2
        if len_all_tokens > self.max_len:
            limit_num = (self.max_len - 2) // 2
            text1_tokens = text1_tokens[:limit_num]
            text2_tokens = text2_tokens[:limit_num]

        input = {}
        word_pieces = ['[CLS]'] + text1_tokens + ['[SEP]'] + text2_tokens
        input['input_ids'] = torch.LongTensor(self.tokenizer.convert_tokens_to_ids(word_pieces))
        pos_sep = word_pieces.index('[SEP]')
        input['token_type_ids'] = torch.LongTensor(
            [0] * (pos_sep + 1) + [1] * (len(word_pieces) - pos_sep - 1)
        )
        input['attention_mask'] = torch.LongTensor([1] * len(word_pieces))

        if label:
            label = torch.tensor(ALL_LABELS.index(label))

        return input, label

    def __len__(self):
        return len(self.df)

In [8]:
def create_mini_batch(samples):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    labels = []
    for s in samples:
        input_ids.append(s[0]['input_ids'].squeeze(0))
        token_type_ids.append(s[0]['token_type_ids'].squeeze(0))
        attention_mask.append(s[0]['attention_mask'].squeeze(0))
        if s[1] != None:
            labels.append(s[1])

    # zero pad 到同一序列長度
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    if len(labels):
        labels = torch.stack(labels)
        return input_ids, token_type_ids, attention_mask, labels
    else:
        return input_ids, token_type_ids, attention_mask

In [9]:
train_batch_size = 32
eval_batch_size = 512

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

dataset = NewsPairDataset(tokenizer, df_train)

train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    collate_fn=create_mini_batch,
    shuffle=True
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=eval_batch_size,
    collate_fn=create_mini_batch
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [10]:
def train_batch(model, data, optimizer, device):
    model.train()
    input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]
    outputs = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [11]:
def evaluate(model, valid_loader):
    model.eval()
    device = 'cuda' if next(model.parameters()).is_cuda else 'cpu'

    tot_count = 0
    tot_loss = 0
    tot_correct = 0

    with torch.no_grad():
        for data in tqdm(valid_loader):
            input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            tot_count += input_ids.size(0)
            tot_loss += outputs.loss.item()
            tot_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
    
    evaluation = {
        'loss': tot_loss / tot_count,
        'acc': tot_correct / tot_count
    }

    return evaluation

In [12]:
# 訓練模型
max_iter = 3000
lr = 0.00001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True
)
model.to(device)

optimizer = optim.RMSprop(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

i = 0
is_running = True
while is_running:
    for train_data in train_loader:
        loss = train_batch(model, train_data, optimizer, device)

        if i > 0 and i % 100 == 0:
            train_size = train_data[0].size(0)
            print(f"train_loss: {loss / train_size}")

        if i > 0 and i % 1000 == 0:
            evaluation = evaluate(model, valid_loader)
            print(f"valid_evaluation: loss={evaluation['loss']}, acc={evaluation['acc']}")
            scheduler.step()

        if i == max_iter:
            is_running = False
            break

        i += 1

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

train_loss: 0.013253321871161461
train_loss: 0.01214770507067442
train_loss: 0.010009453631937504
train_loss: 0.00894092209637165
train_loss: 0.010939258150756359
train_loss: 0.007727797608822584
train_loss: 0.006354642100632191
train_loss: 0.007479773834347725
train_loss: 0.009624685160815716
train_loss: 0.00733733968809247


HBox(children=(FloatProgress(value=0.0, max=126.0), HTML(value='')))


valid_evaluation: loss=0.0005730452869118778, acc=0.8747445756446053
train_loss: 0.010083584114909172
train_loss: 0.00732594495639205
train_loss: 0.006527394521981478
train_loss: 0.011878238059580326
train_loss: 0.007899826392531395
train_loss: 0.0074873859994113445
train_loss: 0.010409853421151638
train_loss: 0.006886688526719809
train_loss: 0.007666162680834532
train_loss: 0.008246097713708878


HBox(children=(FloatProgress(value=0.0, max=126.0), HTML(value='')))


valid_evaluation: loss=0.0005463662331740481, acc=0.8786753809917485
train_loss: 0.01107663381844759
train_loss: 0.01218966580927372
train_loss: 0.0050710514187812805
train_loss: 0.006209608633071184
train_loss: 0.0030823105480521917
train_loss: 0.01074244175106287
train_loss: 0.014105181209743023
train_loss: 0.011393612250685692
train_loss: 0.00753560708835721
train_loss: 0.008807427249848843


HBox(children=(FloatProgress(value=0.0, max=126.0), HTML(value='')))


valid_evaluation: loss=0.0005130511056214874, acc=0.8853203138404904


## Testing

In [13]:
# 測試
test_dataset = NewsPairDataset(tokenizer, df_test)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=eval_batch_size,
    collate_fn=create_mini_batch
)

with torch.no_grad():
    pred = []
    for data in tqdm(test_loader):
        input_ids, token_type_ids, attention_mask = [d.to(device) for d in data]

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        indexes = outputs.logits.argmax(dim=-1).cpu().tolist()
        pred += [ALL_LABELS[i] for i in indexes]

df_result = df_test[['id']].copy()
df_result['pred'] = pred
df_result.to_csv('result.csv', index=None)

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




In [14]:
df_ans = pd.read_csv('solution.csv')
df_ans.rename(columns={'Id': 'id'}, inplace=True)
df = df_ans.merge(df_result, how='left')
test_acc = np.mean(df['Expected'] == df['pred'])
print(f"test accuarcy: {test_acc}")

test accuarcy: 0.8717519906147817
