# 作業：微調輕量化 Bert 預訓練模型
***

## [作業目標]
- 觀察 distilBERT 及 Bert 模型的表現
- 瞭解前處理對輕量化 Bert 模型帶來的影響

## [作業重點]
- 試著替換不同的預訓練模型(DistilBERT/Bert)，觀察有何不同
- 試著註解或跳過"前處理"的3個步驟，觀察其影響

In [1]:
import re
import string
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer, BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

## 載入訓練與測試資料

In [2]:
# Read the training data and the data to predict on from the local data folder.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

## 前處理

In [3]:
# 前處理-1: 消除連字
# def decontracted(text):
#     # 特殊連字
#     text = re.sub(r'(W|w)on(\'|\’)t ', 'will not ', text)
#     text = re.sub(r'(C|c)an(\'|\’)t ', 'can not ', text)
#     text = re.sub(r'(Y|y)(\'|\’)all ', 'you all ', text)
#     text = re.sub(r'(Y|y)a(\'|\’)ll ', 'you all ', text)
#     # 一般連字
#     text = re.sub(r'(I|i)(\'|\’)m ', 'i am ', text)
#     text = re.sub(r'(A|a)in(\'|\’)t ', 'is not ', text)
#     text = re.sub(r'n(\'|\’)t ', ' not ', text)
#     text = re.sub(r'(\'|\’)re ', ' are ', text)
#     text = re.sub(r'(\'|\’)s ', ' is ', text)
#     text = re.sub(r'(\'|\’)d ', ' would ', text)
#     text = re.sub(r'(\'|\’)ll ', ' will ', text)
#     text = re.sub(r'(\'|\’)t ', ' not ', text)
#     text = re.sub(r'(\'|\’)ve ', ' have ', text)

#     return text

# df_train['text'] = df_train['text'].apply(lambda x: decontracted(x))
# df_test['text'] = df_test['text'].apply(lambda x: decontracted(x))

In [4]:
# 前處理-2: 清除特殊符號
# regular_punct = list(string.punctuation)
# extra_punct = [
#     ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
#     '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
#     '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
#     '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
#     '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
#     '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
#     '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
#     'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
#     '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
#     '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤'
# ]
# # 清除標點符號以及上列符號
# all_punct = list(set(regular_punct + extra_punct))
# # 清除連字號 "-" 以及句號 "."
# all_punct.remove('-')
# all_punct.remove('.')

# def spacing_punctuation(text):
#     """
#     add space before and after punctuation and symbols
#     """
#     for punc in all_punct:
#         if punc in text:
#             text = text.replace(punc, f' {punc} ')

#     return text

# df_train['text'] = df_train['text'].apply(lambda x: spacing_punctuation(x))
# df_test['text'] = df_test['text'].apply(lambda x: spacing_punctuation(x))

In [5]:
# 前處理-3: 錯漏字修正
# mis_connect_list = ['(W|w)hat', '(W|w)hy', '(H|h)ow', '(W|w)hich', '(W|w)here', '(W|w)ill']
# mis_connect_re = re.compile(f"({'|'.join(mis_connect_list)})")

# mis_spell_mapping = {'whattsup': 'WhatsApp', 'whatasapp': 'WhatsApp', 'whatsupp': 'WhatsApp', 
#                      'whatcus': 'what cause', 'arewhatsapp': 'are WhatsApp', 'Hwhat': 'what',
#                      'Whwhat': 'What', 'whatshapp': 'WhatsApp', 'howhat': 'how that',
#                      # why
#                      'Whybis': 'Why is', 'laowhy86': 'Foreigners who do not respect China',
#                      'Whyco-education': 'Why co-education',
#                      # How
#                      'Howddo': 'How do', 'Howeber': 'However', 'Showh': 'Show',
#                      'Willowmagic': 'Willow magic', 'WillsEye': 'Will Eye', 'Williby': 'will by'}

# def spacing_some_connect_words(text):
#     '''
#     'Whyare' -> 'Why are'
#     '''
#     ori = text
#     for error in mis_spell_mapping:
#         if error in text:
#             text = text.replace(error, mis_spell_mapping[error])
            
#     # what
#     text = re.sub(r' (W|w)hat+(s)*[A|a]*(p)+ ', ' WhatsApp ', text)
#     text = re.sub(r' (W|w)hat\S ', ' What ', text)
#     text = re.sub(r' \S(W|w)hat ', ' What ', text)
#     # why
#     text = re.sub(r' (W|w)hy\S ', ' Why ', text)
#     text = re.sub(r' \S(W|w)hy ', ' Why ', text)
#     # How
#     text = re.sub(r' (H|h)ow\S ', ' How ', text)
#     text = re.sub(r' \S(H|h)ow ', ' How ', text)
#     # which
#     text = re.sub(r' (W|w)hich\S ', ' Which ', text)
#     text = re.sub(r' \S(W|w)hich ', ' Which ', text)
#     # where
#     text = re.sub(r' (W|w)here\S ', ' Where ', text)
#     text = re.sub(r' \S(W|w)here ', ' Where ', text)
    
#     text = mis_connect_re.sub(r' \1 ', text)
#     text = text.replace('What sApp', 'WhatsApp') 

#     return text

# df_train['text'] = df_train['text'].apply(lambda x: spacing_some_connect_words(x))
# df_test['text'] = df_test['text'].apply(lambda x: spacing_some_connect_words(x))

In [6]:
# Preview the first five rows of the training data.
df_train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 載入 distilBERT 或 Bert 模型將文字編碼

In [7]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Choose the encoder: keep exactly one of the two assignments below active
# (comment out the one that is not wanted).
# model_class, tokenizer_class, pretrained_weights = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
model_class, tokenizer_class, pretrained_weights = BertModel, BertTokenizer, 'bert-base-uncased'

# Load the tokenizer and the pretrained weights, then place the model on `device`.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




## 將訓練資料經由 distilBERT 或 Bert 轉換為 Embedding 編碼

In [8]:
# Encode every training text into a list of token ids, including the
# tokenizer's special tokens.
def _encode_train_text(text):
    return tokenizer.encode(text, add_special_tokens=True)


tokenized_train = df_train['text'].apply(_encode_train_text)

In [9]:
# Pad every training token-id list with trailing zeros up to the length of
# the longest list, giving one rectangular integer array.
train_token_lists = tokenized_train.values
max_len = max(map(len, train_token_lists))
padded_train = np.zeros((len(train_token_lists), max_len), dtype=int)
for row_idx, token_ids in enumerate(train_token_lists):
    padded_train[row_idx, :len(token_ids)] = token_ids

In [10]:
# Build the attention mask (1 for real tokens, 0 for the zero padding), run the
# encoder batch by batch, and collect in `last_hidden_states` the vector at
# sequence position 0 of the model's first output for every training text.
attention_mask = np.where(padded_train != 0, 1, 0)

# Keep the full tensors on the CPU; each batch is moved to `device` inside the
# loop. Moving the whole dataset to the device up front only held an extra
# full copy in device memory and turned the per-batch transfer into a no-op.
input_ids = torch.LongTensor(padded_train)
attention_mask = torch.LongTensor(attention_mask)

training_data = TensorDataset(input_ids, attention_mask)
# The default shuffle=False keeps batches in row order, so the concatenated
# result stays aligned with the rows of `padded_train`.
training_loader = DataLoader(training_data, batch_size=100)

last_hidden_states = []
with torch.no_grad():  # inference only, no gradients are needed
    for batch in training_loader:
        ids, mask = [x.to(device) for x in batch]
        # [0] selects the first model output; [:, 0, :] keeps position 0 of
        # each sequence. The result is moved back to the CPU right away.
        last_hidden_states.append(model(ids, attention_mask=mask)[0][:, 0, :].cpu())
last_hidden_states = torch.cat(last_hidden_states, dim=0)

In [11]:
# Inputs for the next stage: the encoder output computed in the previous step
# as a NumPy feature matrix, and the `target` column as labels.
features = last_hidden_states.numpy()
labels = df_train['target']
np.shape(features)

(7613, 768)

## 切割訓練/驗證集

In [12]:
# Hold out part of the encoded training data as a validation set.
# A fixed random_state makes the split reproducible, so score differences
# between runs (e.g. DistilBERT vs. BERT, or with/without preprocessing)
# are not caused by a different random split.
train_features, val_features, train_labels, val_labels = train_test_split(
    features, labels, random_state=42
)

## 使用 Logistic Regression 當作最後一層, 輸出預測結果

In [13]:
# Fit a logistic regression on the encoder features (comparable to adding a
# single-layer network on top), grid-searching the parameter C with
# GridSearchCV.
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)
print('best parameters:', grid_search.best_params_)
# Label spelling fixed (was 'best scrores:'); best_score_ is a single value.
print('best score:', grid_search.best_score_)

best parameters: {'C': 36.842168421052634}
best scrores: 0.78402605635208


In [14]:
# Train a logistic regression with the best C found by the grid search and
# show its score on the held-out validation split.
best_c = grid_search.best_params_['C']
lr_clf = LogisticRegression(C=best_c).fit(train_features, train_labels)
lr_clf.score(val_features, val_labels)

0.803046218487395

## 對預測目標資料做出最終預測

In [15]:
# Encode every text of the prediction set into a list of token ids,
# including the tokenizer's special tokens.
def _encode_test_text(text):
    return tokenizer.encode(text, add_special_tokens=True)


tokenized_test = df_test['text'].apply(_encode_test_text)

In [16]:
# Pad every token-id list of the prediction set with trailing zeros up to the
# length of the longest list, giving one rectangular integer array.
test_token_lists = tokenized_test.values
max_len = max(map(len, test_token_lists))
padded_test = np.zeros((len(test_token_lists), max_len), dtype=int)
for row_idx, token_ids in enumerate(test_token_lists):
    padded_test[row_idx, :len(token_ids)] = token_ids

In [17]:
# Build the attention mask (1 for real tokens, 0 for the zero padding), run the
# encoder batch by batch, and collect in `last_hidden_states` the vector at
# sequence position 0 of the model's first output for every text to predict.
attention_mask = np.where(padded_test != 0, 1, 0)

# Keep the full tensors on the CPU; each batch is moved to `device` inside the
# loop. Moving the whole dataset to the device up front only held an extra
# full copy in device memory and turned the per-batch transfer into a no-op.
input_ids = torch.LongTensor(padded_test)
attention_mask = torch.LongTensor(attention_mask)

test_data = TensorDataset(input_ids, attention_mask)
# The default shuffle=False keeps batches in row order, so the concatenated
# result stays aligned with the rows of `padded_test`.
test_loader = DataLoader(test_data, batch_size=100)

last_hidden_states = []
with torch.no_grad():  # inference only, no gradients are needed
    for batch in test_loader:
        ids, mask = [x.to(device) for x in batch]
        # [0] selects the first model output; [:, 0, :] keeps position 0 of
        # each sequence. The result is moved back to the CPU right away.
        last_hidden_states.append(model(ids, attention_mask=mask)[0][:, 0, :].cpu())
last_hidden_states = torch.cat(last_hidden_states, dim=0)

In [18]:
# Predict a target for every row of the prediction set from its encoder
# features, then display the predictions.
test_features = last_hidden_states.numpy()
y_pred = lr_clf.predict(X=test_features)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [19]:
# Build the submission file: the ids of the prediction set next to their
# predicted targets, written as CSV without the index column.
submission = pd.DataFrame({'id': df_test['id'], 'target': y_pred})
submission.to_csv('submission_Bert.csv', index=False)