In [None]:
!pip install transformers

In [2]:
import csv
import re
import time
import nltk
from nltk.stem import PorterStemmer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences
from IPython.display import clear_output
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from IPython.display import clear_output

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import numpy as np
import pandas as pd
import os

In [None]:
# Fetch the NLTK resources referenced further down: the English stop-word list
# is loaded in myDataset.__init__; 'punkt' is only needed by the sentence
# tokenizer call that is currently commented out in that class.
nltk.download('stopwords')
nltk.download('punkt')

In [4]:
### MOUNT THE FILE SYSTEM
# Mount Google Drive at /content/BDA so the dataset CSVs under
# /content/BDA/MyDrive/dataset/ can be read (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/BDA')

Mounted at /content/BDA


In [5]:
""" Data input """

# Read the train / validation / test splits from the mounted Drive folder.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/BDA/MyDrive/dataset/fixed_train.csv',
        '/content/BDA/MyDrive/dataset/fixed_valid.csv',
        '/content/BDA/MyDrive/dataset/fixed_test.csv',
    )
)
# train.head(20)

In [24]:
""" Data preprocessing """

# Load the tokenizer that belongs to the pretrained model used in this notebook
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class myDataset(Dataset):
    """Dataset that turns (prompt, utterance[, label]) rows into BERT inputs.

    Indexing returns a ``(tokens_tensor, segments_tensor, label_tensor)``
    triple laid out as ``[CLS] prompt [SEP] utterance [SEP]``.
    ``label_tensor`` is ``None`` in "test" mode.
    """

    # Vocabulary ids of the special tokens in the "bert-base-uncased"
    # vocabulary, which is the tokenizer this notebook loads.
    UNK_ID = 100
    SEP_ID = 102

    def __init__(self, mode, df, tokenizer):
        """Clean the raw frame and set up the preprocessing parameters.

        Args:
            mode: one of "train", "valid" or "test".
            df: DataFrame holding at least 'conv_id', 'utterance_idx',
                'prompt' and 'utterance' (plus 'label' outside test mode).
            tokenizer: tokenizer exposing ``tokenize`` and
                ``convert_tokens_to_ids``.
        """
        assert mode in ["train", "valid", "test"]
        self.mode = mode
        self.tokenizer = tokenizer

        # The conversation bookkeeping columns are not model inputs.
        df = df.drop(columns=['conv_id', 'utterance_idx']).dropna()
        if self.mode != "test":
            # Test rows are kept one-to-one; only labeled splits are deduplicated.
            df = df.drop_duplicates()
        self.df = df
        self.len = len(self.df)
        self.maxlen = 100
        # Stop-word removal and stemming are currently disabled in cleardata;
        # the attributes they would use are still initialised here.
        self.white_list = ["don't", "not", "didn't", "cannot", "can't"]
        self.nltk_stopwords = nltk.corpus.stopwords.words('english')
        # Raw string: same characters as before, without the invalid "\," escape.
        self.punc = r'''.()-[]{};:"\,<>/@#$%^&*_~”'''
        self.ps = PorterStemmer()

    # Preprocess token
    def cleardata(self, ss):
        """Lower-case and tokenize one text field, dropping punctuation tokens.

        The dataset's '_comma_' placeholder is replaced by a space first.
        Uses the tokenizer handed to the constructor, not a module-level global.
        """
        text = ss.lower().strip().replace('_comma_', ' ')
        # Exact per-character membership instead of a substring test on a string.
        punctuation = set(self.punc)
        return [tok for tok in self.tokenizer.tokenize(text) if tok not in punctuation]

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``."""
        row = self.df.iloc[idx]
        if self.mode == "test":
            label_tensor = None
        else:
            label_tensor = torch.tensor(row['label'])

        # [CLS] + text_a + [SEP] + text_b + [SEP]
        word_pieces = (
            ['[CLS]'] + self.cleardata(row['prompt']) + ['[SEP]']
            + self.cleardata(row['utterance']) + ['[SEP]']
        )

        # Convert to vocabulary ids and discard unknown tokens.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        ids = [token_id for token_id in ids if token_id != self.UNK_ID]
        if len(ids) > self.maxlen:
            # Truncate but keep the sequence terminated by [SEP].
            ids = ids[:self.maxlen - 1] + [self.SEP_ID]
        tokens_tensor = torch.tensor(ids)

        # Segment ids: 0 up to and including the first [SEP], 1 afterwards.
        segments_tensor = torch.zeros(tokens_tensor.shape, dtype=torch.long)
        sep_positions = (tokens_tensor == self.SEP_ID).nonzero().flatten()
        if len(sep_positions) > 0:
            segments_tensor[sep_positions[0] + 1:] = 1

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows left after cleaning."""
        return self.len
    
# Build the training and validation Datasets with the tokenizer loaded above
# (the English "bert-base-uncased" tokenizer).
trainset = myDataset("train", train, tokenizer=tokenizer)
validset = myDataset("valid", valid, tokenizer=tokenizer)

In [None]:
# Pick the first sample to see how the raw input is converted into a
# BERT-compatible format
sample_idx = 0

# Pull out the raw text for comparison
# NOTE(review): this unpacking assumes trainset.df has exactly three columns in
# the order prompt, utterance, label — confirm against the CSV schema.
prompt, utter, label = trainset.df.iloc[sample_idx].values

# Use the Dataset built above to fetch the converted id tensors;
# trainset supports integer indexing, so any position can be looked up by
# its index to obtain that sample's id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# Convert tokens_tensor back into text
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

print(f"""[原始文本]
情境  ：{prompt}
對話  ：{utter}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

In [8]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    """Collate a list of dataset samples into one zero-padded mini-batch.

    Args:
        samples: list of ``(tokens_tensor, segments_tensor, label_tensor)``
            triples as returned by ``myDataset.__getitem__``; ``label_tensor``
            is ``None`` for unlabeled (test) samples.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)`` where
        the first three are ``(batch, longest_sequence)`` tensors and
        ``label_ids`` is a stacked label tensor, or ``None`` when the samples
        carry no labels.
    """
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]

    # Only labeled splits carry labels; the test split yields None.
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None

    # Zero-pad every sequence to the length of the longest one in the batch.
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    # Attention mask: 1 on real tokens, 0 on zero padding, so BERT only
    # attends to real positions. One vectorized comparison replaces the
    # per-element Python loop.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# Build DataLoaders that yield mini-batches; `collate_fn` is what merges a
# list of samples into one padded batch.
# The training loader reshuffles every epoch so batches are not always drawn
# in the same file order.
trainloader = DataLoader(trainset, batch_size=128, shuffle=True,
                         collate_fn=create_mini_batch)

# Evaluation order does not matter, so the validation loader stays sequential.
validloader = DataLoader(validset, batch_size=256,
                         collate_fn=create_mini_batch)

In [None]:
# Pull one mini-batch from the training loader and print the shape and
# contents of each tensor returned by create_mini_batch.
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

In [10]:
from transformers import BertForSequenceClassification
from IPython.display import clear_output

# Number of target classes passed to the classification model.
NUM_LABELS = 32
# Load the pretrained "bert-base-uncased" weights into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

# Clear whatever the loading step printed in this cell.
clear_output()

In [None]:
%%time
from tqdm.notebook import tqdm
def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` without gradients and collect predictions.

    The model is switched to eval mode for the duration of the call (so
    dropout is disabled) and its previous train/eval mode is restored before
    returning.

    Args:
        model: classifier called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and, when ``compute_acc`` is true, ``labels``.
        dataloader: iterable of batches shaped like the output of
            ``create_mini_batch``; the label entry may be ``None``.
        compute_acc: when true, also compute accuracy and loss, which
            requires labeled batches.

    Returns:
        ``predictions`` (1-D tensor of predicted class indices on the model's
        device, or ``None`` if the loader was empty) when ``compute_acc`` is
        false; otherwise ``(predictions, acc, loss)`` where ``loss`` is the
        mean loss per sample.
    """
    # Follow whatever device the model lives on instead of hard-coding cuda:0.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    batch_predictions = []
    correct = 0
    total = 0
    running_loss = 0.0
    try:
        with torch.no_grad():
            for data in tqdm(dataloader):
                data = [t.to(device) for t in data if t is not None]

                if not compute_acc:
                    # Predictions only: no labels are passed, so outputs[0] is the logits.
                    tokens_tensors, segments_tensors, masks_tensors = data[:3]
                    outputs = model(input_ids=tokens_tensors,
                                    token_type_ids=segments_tensors,
                                    attention_mask=masks_tensors)
                    logits = outputs[0]
                    pred = logits.argmax(dim=1)
                else:
                    # With labels the model returns the loss first, then the logits.
                    tokens_tensors, segments_tensors, masks_tensors, labels = data[:4]
                    outputs = model(input_ids=tokens_tensors,
                                    token_type_ids=segments_tensors,
                                    attention_mask=masks_tensors,
                                    labels=labels)
                    loss = outputs[0]
                    logits = outputs[1]
                    pred = logits.argmax(dim=1)
                    batch_size = labels.size(0)
                    # Weight by batch size so the final figure is a true
                    # per-sample mean (assumes the model's loss is averaged
                    # over the batch).
                    running_loss += loss.item() * batch_size
                    total += batch_size
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        acc = correct / total
        loss = running_loss / total
        return predictions, acc, loss

    return predictions
    
# Run the model on the GPU when one is available (otherwise fall back to CPU);
# the commented-out lines below would measure accuracy on the training set
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Make sure the model really is on the GPU, otherwise this will take a very long time!
print("device:", device)
model = model.to(device)
#_, train_acc, train_loss = get_predictions(model, trainloader, compute_acc=True)
#print("train acc:", train_acc)
#print("train loss:", train_loss)

In [None]:
%%time
# Fine-tune all model parameters with Adam.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 3  # time is limited, so just train 3 epochs and see how it performs
for epoch in range(EPOCHS):
    # Re-enter training mode every epoch: the evaluation step at the end of
    # the previous epoch switches the model to eval mode.
    model.train()
    running_loss = 0.0
    for data in tqdm(trainloader):

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        # When labels are supplied, the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # Accumulate the loss of the current batch
        running_loss += loss.item()

    # Measure accuracy / loss with dropout disabled so the reported numbers
    # are deterministic and reflect inference behaviour.
    model.eval()
    _, train_acc, train_loss = get_predictions(model, trainloader, compute_acc=True)
    _, valid_acc, valid_loss = get_predictions(model, validloader, compute_acc=True)

    print('[epoch %d] train loss: %.3f, train acc: %.3f, valid loss: %.3f, valid acc: %.3f' %
          (epoch + 1, train_loss, train_acc, valid_loss, valid_acc))

In [14]:
# torch.save(model.state_dict(), '/content/BDA/MyDrive/dataset/checkpoint_5530.pt')

In [15]:
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
# model.load_state_dict(torch.load('/content/BDA/MyDrive/dataset/checkpoint_5530.pt'))

In [None]:
# Wrap the test split in the same Dataset / DataLoader pipeline (no labels).
testset = myDataset("test", test, tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256,
                        collate_fn=create_mini_batch)

# Inference must run in eval mode; otherwise dropout stays active and
# randomly perturbs the predictions.
model.eval()
# Ask the model for its predictions
predictions = get_predictions(model, testloader)
# Predictions may live on the GPU; move them to the CPU before converting
# to a NumPy array
predictions = predictions.cpu().numpy()
print(predictions)

In [22]:
# Rows that were actually scored: testset.df is the test frame after the
# Dataset's column drops and dropna.
t1 = testset.df.copy()

tt = test.copy()
# predictions is a NumPy array, so it is assigned row by row in order
t1['pred'] = predictions
# Majority vote: for every prompt keep the label that value_counts lists first
# (the most frequent prediction among that prompt's rows)
tidx = t1.groupby(['prompt'])['pred'].apply(lambda x: x.value_counts().index[0]).reset_index()
# Attach the per-prompt vote to every original test row.
# NOTE(review): this is an inner merge, so any test row whose prompt has no
# entry in tidx (e.g. rows removed by dropna) disappears from tt — confirm
# the resulting row count matches what the submission expects.
tt = pd.merge(tt, tidx, on=['prompt'])
# tt.head(5)


In [23]:
# Write the submission file: an unnamed running-index column followed by the
# predicted label of every row in the merged test frame.
headers = ['', 'pred']
# Pull the column out once instead of materialising a full row per iteration.
rows = list(enumerate(tt['pred'].tolist()))

# newline='' lets the csv module control line endings itself, as its
# documentation requires for files passed to csv.writer.
with open('/content/BDA/MyDrive/dataset/predictions_merged.csv', 'w', newline='') as f:
    writeCsv = csv.writer(f)
    writeCsv.writerow(headers)
    writeCsv.writerows(rows)