# Text Classification for the IMDB Dataset using BERT.

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 33.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 44.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.5 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses

In [2]:
!nvidia-smi

Sat May  7 04:50:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/content/drive') #掛載你的雲端硬碟到colab上的路徑

Mounted at /content/drive


In [4]:
# Basic
import numpy as np
import random
import time
import os
import pandas as pd

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset,random_split
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Bert model and its tokenizer
from transformers import BertTokenizer, BertModel

# IMDB
from keras.datasets import imdb

# 畫畫用
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [5]:
# Confirm version and GPU status
if torch.cuda.is_available():
    device = torch.device('cuda')
    devicename = '['+torch.cuda.get_device_name(0)+']'
else:
    device = torch.device('cpu')
    devicename = ""

print('Using PyTorch version:', torch.__version__,
      'Device:', device, devicename)

Using PyTorch version: 1.11.0+cu113 Device: cuda [Tesla T4]


In [6]:
# 載入 BERT token
# 取得 BERT 內的 pre-train tokenizer
PRETRAINED_MODEL_NAME = "bert-base-uncased" #英文pretrain(不區分大小寫)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("字典大小:", len(vocab))

# 隨機看一下 BERT tokenizer 完的字典
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
print("隨便看幾個字:")
print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)

for t, id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, id))

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

字典大小: 30522
隨便看幾個字:
token               index          
-------------------------
stephan             15963
namibia             15408
depictions          20818
mcgraw              24179
straight             3442
##broken            29162
##cta               25572
cub                 21987
hodgson             26107
harding             15456


# 準備原始文本資料

把IMDB的資料集讀取進來

In [7]:
# 僅保留訓練資料集前10000個最常出現的單詞，捨棄低頻的單詞
(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [8]:
print("訓練資料大小：" , len(train_data) , ", 測試資料大小：", len(test_data))

訓練資料大小： 25000 , 測試資料大小： 25000


In [9]:
# 下載IMDB的字典 word_index -> word:index
word_index = imdb.get_word_index()

# 鍵值對調 reverse_word_index -> index:word
reverse_word_index = {value:key for key,value in word_index.items()}

# 查看每一筆評論內容(index-3，因為index=0,1和2分別是“填充”,“序列開始”,“未知”的保留索引)，查不到的以?表示
def read_IMDB_text(train_data):
  text = ' '.join([reverse_word_index.get(i-3,'?') for i in train_data])
  return text

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [10]:
# 做成train/test的dataframe
df_train = pd.DataFrame({'TRAIN_text_to_sequence':train_data,"TRAIN_label":train_labels})
df_test = pd.DataFrame({'TEST_text_to_sequence':test_data,"TEST_label":test_labels})

df_train['TRAIN_text'] = df_train['TRAIN_text_to_sequence'].apply(read_IMDB_text)
df_test['TEST_text'] = df_test['TEST_text_to_sequence'].apply(read_IMDB_text)

df_train = df_train[["TRAIN_text","TRAIN_label"]]
df_test = df_test[["TEST_text","TEST_label"]]

display(df_train.head())
display(df_test.head())

Unnamed: 0,TRAIN_text,TRAIN_label
0,? this film was just brilliant casting locatio...,1
1,? big hair big boobs bad music and a giant saf...,0
2,? this has to be one of the worst films of the...,0
3,? the ? ? at storytelling the traditional sort...,1
4,? worst mistake of my life br br i picked this...,0


Unnamed: 0,TEST_text,TEST_label
0,? please give this one a miss br br ? ? and th...,0
1,? this film requires a lot of patience because...,1
2,? many animation buffs consider ? ? the great ...,1
3,? i generally love this type of movie however ...,0
4,? like some other people wrote i'm a die hard ...,1


# 將原始文本轉換成BERT相容的輸入格式
**實作一個可以用來讀取訓練與測試集的 Dataset**，這個Dataset會將資料裏頭的text轉換成BERT的相容輸入格式，並回傳3個tensors

1.   tokens_tensor: 合併句子的index sequence，包含[CLS],[SEP]
2.   segments_tensor: 用來區別兩句子的界線
3.   label_tensor: 將分類的label轉換成index的tensor



In [11]:
# 建立Dataset
class IMDB_Dataset(Dataset):
  def __init__(self, mode, tokenizer):
    assert mode in ["train", "test"]  
    self.mode = mode
    self.df = eval(f"df_{mode}") # df_train or df_test
    self.len = len(self.df)
    self.maxlen = 300      #限制文章長度(depend on 你的記憶體)
    self.tokenizer = tokenizer  # 把 BERT tokenizer 傳進來

  # 定義回傳一筆訓練/測試數據的函式
  def __getitem__(self, idx):
    origin_text = self.df.iloc[idx][0] # 原始文本
    origin_label = self.df.iloc[idx][1]      # 原始分類
    if self.mode == "test":
        text = self.df.iloc[idx][0]
        label_tensor = None 
        # label_id = self.df.iloc[idx][1]
        # label_tensor = torch.tensor(label_id)
    else:     
        text = self.df.iloc[idx][0]
        # label_id = self.label_id
        label_tensor = torch.tensor(origin_label)
        
    
    # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
    word_pieces = ["[CLS]"]
    tokens_a = self.tokenizer.tokenize(text)
    word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
    len_a = len(word_pieces)
            
    # 將整個 token 序列轉換成索引序列
    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    tokens_tensor = torch.tensor(ids)
    
    # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
    segments_tensor = torch.tensor([0] * len_a,dtype=torch.long)
    
    return (tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label)

  def __len__(self):
    return self.len

透過IMDB_Dataset的class實例出訓練資料集與測試資料集，並轉成BERT的輸入格式

In [12]:
# initialize Dataset
trainset = IMDB_Dataset("train", tokenizer=tokenizer)
testset = IMDB_Dataset("test", tokenizer=tokenizer)


In [13]:
#split val from trainset
val_size = int(trainset.__len__()*0.04) #比對LSTM 切出1000筆當validation
trainset, valset = random_split(trainset,[trainset.__len__()-val_size,val_size])
print('trainset size:' ,trainset.__len__())
print('valset size:',valset.__len__())
print('testset size: ',testset.__len__())

trainset size: 24000
valset size: 1000
testset size:  25000


訓練資料集的第一筆回傳3個tensor加上原始文本與原始label，分別是tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label

In [14]:
trainset[0]

(tensor([  101,  1029,  1045,  2001,  2025,  8074,  1996,  3928, 24466,  3325,
          1997,  1029,  2009,  1005,  1055,  2019, 10271,  2659,  5166,  2053,
          2502,  2171,  5889,  1029,  2472,  1045,  2018,  2657,  2009,  2001,
          2204,  2021,  2025,  2023,  2204,  7987,  7987,  2872,  1999,  1037,
          3824,  5636,  2551,  2465,  6613,  1029,  1029,  2038,  2589,  2019,
          9313,  3105,  1997, 11847,  1996,  2154,  2079,  2154, 11785,  1997,
          3923,  1029,  8805,  1996, 10191,  2003,  1029,  2007,  4963,  1998,
          1029,  2041,  2012,  2014,  2152,  2082, 12746,  2893,  1999,  4390,
          2007,  1996,  2082,  1998,  2014,  2814,  2016,  2003,  2108,  2992,
          2011,  2014,  2309,  2269,  2040,  3544,  2000,  2293,  2014,  1998,
          2014,  2567,  2021, 12033,  1037,  9384,  3348,  2241,  3313,  3115,
          2006,  2010,  2336,  1996,  2269,  1005,  1055,  3313,  3115,  2003,
          7203,  2011,  1996,  2755,  2008,  4714,  

隨機選一個id來看一下轉換前後的差異

In [15]:
# 隨便選一個樣本
sample_idx = 25

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())

print(f"""[原始文本]
句子：{origin_text}
分類  ：{origin_label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor[0:20]}

segments_tensor：{segments_tensor[0:20]}

label_tensor   ：{label_tensor}

""")

[原始文本]
句子：? i watched the dvd of this movie which also comes with an excellent commentary track in english it seems in cambodia the subtitles in english say the character is speaking ? but the movie says cambodia a very violent evil man is raising boys to be killers using ? and training them to fight and kill he sends ? to kill some people in china and during the killings a ? partner is killed the cop ? is a loose cannon who is worried about his father who is also a cop who was shot and is in a coma ? chief is his ? friend and is worried about ? ? behavior he doesn't know ? was the one who caught his dad in dealing with drug dealers and shot him and put him into the coma ? escapes and hides in a ? ? shack where he meets a woman who came here to find her mother and keeps repeating her father won't let her leave ? doesn't speak chinese and doesn't understand this but saves her from her father who appears to be having sex with her maybe this is the reason for cat iii ? becomes more and mo

製作一個DataLoader去分批讀取小量的mini-batch
這個函式的輸入 samples 是一個 list，裡頭的每個 element 都是剛剛定義的 IMDB_Dataset 回傳的一個資料，每個資料都包含3個tensors：

*  tokens_tensor
*  segments_tensor
*  label_tensor

它會對前兩個 tensors 作 zero padding，並產生masks_tensors

In [16]:
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 訓練集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad到該batch下最長的長度
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


實例化一個每次回傳 batch size 個訓練樣本的 DataLoader，並利用 collate_fn 將 list of samples 合併成一個 mini-batch

In [17]:
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=True)
testloader = DataLoader(testset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)

data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([16, 302]) 
tensor([[ 101, 1029, 1029,  ..., 1998, 5186,  102],
        [ 101, 1029, 2182,  ..., 2036, 2038,  102],
        [ 101, 1029, 1045,  ..., 1037, 1029,  102],
        ...,
        [ 101, 1029, 2023,  ...,    0,    0,    0],
        [ 101, 1029, 1045,  ...,    0,    0,    0],
        [ 101, 1029, 1996,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([16, 302])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([16, 302])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape     

# 以BERT為基礎加入layers成下游任務模型
載入一個可以做分類的 BERT 模型

In [18]:
from transformers import BertForSequenceClassification

NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

print("""
name      module
--------------------""")

for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print("{:10}{}".format(name,n) )
    else:
        print("{:10} {}".format(name, module))


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


name      module
--------------------
bert      embeddings
bert      encoder
bert      pooler
dropout    Dropout(p=0.1, inplace=False)
classifier Linear(in_features=768, out_features=2, bias=True)


In [19]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [20]:
from sklearn.metrics import accuracy_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
model = model.to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 1

for epoch in range(EPOCHS):
    correct = 0
    #total = 0
    train_loss , val_loss = 0.0 , 0.0
    train_acc, val_acc = 0, 0
    n, m = 0, 0
    model.train()
    for data in trainloader:
        n += 1
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        # outputs 的順序是 "(loss), logits, (hidden_states), (attentions)"
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        #get prediction and calulate acc
        logits = outputs[1]
        _, pred = torch.max(logits.data, 1)
        train_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())

        # 紀錄當前 batch loss
        train_loss += loss.item()
    
    #validation
    with torch.no_grad():
        model.eval()
        for data in valloader:
            m += 1
            tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
            val_outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
            
            logits = val_outputs[1]
            _, pred = torch.max(logits.data, 1)
            val_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())
            val_loss += val_outputs[0].item()

    print('[epoch %d] loss: %.4f, acc: %.4f, val loss: %4f, val acc: %4f' %
          (epoch+1, train_loss/n, train_acc/n, val_loss/m,  val_acc/m  ))

print('Done')

device: cuda:0
[epoch 1] loss: 0.2692, acc: 0.8893, val loss: 0.220397, val acc: 0.903770
Done


In [1]:
!nvidia-smi

Sat May  7 05:59:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces