<a href="https://colab.research.google.com/github/funpi89/NLP_marathon/blob/main/DitsilBert_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DistilBert 預訓練模型的使用方式

- 資料來源 : https://www.kaggle.com/c/nlp-getting-started/overview
判斷某一段推文是否企圖告訴我們"天災"正在發生,訓練集與要判斷的測試集都有數千筆資料，其中"text”是推文內容，"target"是是否為天災

In [2]:
!pip install transformers



In [3]:
# 載入相關套件, 第一次執行前需安裝 transformers 套件
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import re, warnings
warnings.filterwarnings("ignore")

In [4]:
# 載入訓練與測試資料
df = pd.read_csv('/content/drive/MyDrive/NLPMarathon/huggingface/data/disaster/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/NLPMarathon/huggingface/data/disaster/test.csv') 

# 前處理

In [5]:
# 前處理-1 : 消除連字
def decontracted(text):
    # 特殊連字
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)
    # 一般性連字
    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)in(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)s ", " is ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)
    return text

df['text'] = df['text'].apply(lambda x: decontracted(x))
df_test['text'] = df_test['text'].apply(lambda x: decontracted(x))

In [6]:
# 前處理-2 : 清除特殊符號
import string
regular_punct = list(string.punctuation)
extra_punct = [
    ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']
# 消除標點符號以及上列符號
all_punct = list(set(regular_punct + extra_punct))
# 消除連字號 "-" 以及句號 "."
all_punct.remove('-')
all_punct.remove('.')

def spacing_punctuation(text):
    """
    add space before and after punctuation and symbols
    """
    for punc in all_punct:
        if punc in text:
            text = text.replace(punc, f' {punc} ')
    return text

df['text'] = df['text'].apply(lambda x: spacing_punctuation(x))
df_test['text'] = df_test['text'].apply(lambda x: spacing_punctuation(x))

In [7]:
# 前處理-3 : 錯漏字修正
mis_connect_list = ['(W|w)hat', '(W|w)hy', '(H|h)ow', '(W|w)hich', '(W|w)here', '(W|w)ill']
mis_connect_re = re.compile('(%s)' % '|'.join(mis_connect_list))

mis_spell_mapping = {'whattsup': 'WhatsApp', 'whatasapp':'WhatsApp', 'whatsupp':'WhatsApp', 
                      'whatcus':'what cause', 'arewhatsapp': 'are WhatsApp', 'Hwhat':'what',
                      'Whwhat': 'What', 'whatshapp':'WhatsApp', 'howhat':'how that',
                      # why
                      'Whybis':'Why is', 'laowhy86':'Foreigners who do not respect China',
                      'Whyco-education':'Why co-education',
                      # How
                      "Howddo":"How do", 'Howeber':'However', 'Showh':'Show',
                      "Willowmagic":'Willow magic', 'WillsEye':'Will Eye', 'Williby':'will by'}
def spacing_some_connect_words(text):
    """
    'Whyare' -> 'Why are'
    """
    ori = text
    for error in mis_spell_mapping:
        if error in text:
            text = text.replace(error, mis_spell_mapping[error])
            
    # what
    text = re.sub(r" (W|w)hat+(s)*[A|a]*(p)+ ", " WhatsApp ", text)
    text = re.sub(r" (W|w)hat\S ", " What ", text)
    text = re.sub(r" \S(W|w)hat ", " What ", text)
    # why
    text = re.sub(r" (W|w)hy\S ", " Why ", text)
    text = re.sub(r" \S(W|w)hy ", " Why ", text)
    # How
    text = re.sub(r" (H|h)ow\S ", " How ", text)
    text = re.sub(r" \S(H|h)ow ", " How ", text)
    # which
    text = re.sub(r" (W|w)hich\S ", " Which ", text)
    text = re.sub(r" \S(W|w)hich ", " Which ", text)
    # where
    text = re.sub(r" (W|w)here\S ", " Where ", text)
    text = re.sub(r" \S(W|w)here ", " Where ", text)
    
    text = mis_connect_re.sub(r" \1 ", text)
    text = text.replace("What sApp", 'WhatsApp') 
    return text

df['text'] = df['text'].apply(lambda x: spacing_some_connect_words(x))
df_test['text'] = df_test['text'].apply(lambda x: spacing_some_connect_words(x))

In [8]:
# 調整訓練資料的大小(colab跑不完全部的量)
df = df[:4000]

In [9]:
df.head() 

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to ' shelter in place ' ...,1
3,6,,,"13 , 000 people receive # wildfires evacuatio...",1
4,7,,,Just got sent this photo from Ruby # Alaska a...,1


In [10]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,Heard about # earthquake is different cities ...
2,3,,,"there is a forest fire at spot pond , geese a..."
3,9,,,Apocalypse lighting. # Spokane # wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# 載入 distilBERT 模型或 Bert 模型, 將文字編碼

In [11]:
# 載入 distilBERT 模型
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# 載入預訓練權重以及 tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
model = model.cuda()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 將訓練資料經由 distilBERT 或 Bert 轉換為 Embedding 編碼

In [12]:
# 將訓練資料經過 tokenizer 編碼轉換
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [13]:
# 以最長字串為準, 將訓練資料補零成相同長度
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [14]:
# 設定 attention_mask, 將計算經過 Bert 生成的 Embedding 結果, 儲存於 last_hidden_states 中
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded).to(torch.int64).cuda()
attention_mask = torch.tensor(attention_mask).to(torch.int64).cuda()

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [15]:
# 準備下一階段要用的特徵 (上階段 Embedding 結果) 與目標值
labels = df['target']
features = last_hidden_states[0][:,0,:].cpu().numpy()
features[0].shape

(768,)

In [16]:
# 切割訓練 / 測試集
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

# 使用 Logistic Regression 當作最後一層, 輸出預測結果

In [17]:
import sklearn
from sklearn.model_selection import GridSearchCV
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.7953333333333333


In [18]:
# 將上一格跑出的 Logistic Regression 最佳 C 值填入, 觀察測試集的驗證分數
lr_clf = LogisticRegression(C = 5.263252631578947)  
lr_clf.fit(train_features, train_labels)
lr_clf.score(test_features, test_labels)

0.803

# 對預測目標資料做出最終預測

In [19]:
# 將預測目標資料經過 tokenizer 編碼轉換
tokenized_t = df_test['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [20]:
# 以最長字串為準, 將預測目標資料補零成相同長度
max_len = 0
for i in tokenized_t.values:
    if len(i) > max_len:
        max_len = len(i)
        
padded_t = np.array([i + [0]*(max_len-len(i)) for i in tokenized_t.values])
np.array(padded_t).shape

(3263, 73)

In [21]:
# 設定 attention_mask, 將計算經過 Bert 生成的 Embedding 結果, 儲存於 last_hidden_states 中
attention_mask_t = np.where(padded_t != 0, 1, 0)
input_ids = torch.tensor(padded_t).to(torch.int64).cuda()
attention_mask_t = torch.tensor(attention_mask_t).to(torch.int64).cuda()

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask_t)

In [22]:
# 輸出預測目標資料的預測結果
val_features = last_hidden_states[0][:,0,:].cpu().numpy() 
y_pred = lr_clf.predict(val_features)
y_pred

array([1, 1, 1, ..., 1, 1, 0])

# Inference

In [29]:
s = df_test['text'][15]
s

'Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham is Wholesale Market http :  /  / t.co / irWqCEZWEU'

In [39]:
s_tokened = tokenizer.encode(s, add_special_tokens=True)

padded = np.array(s_tokened + [0]*(max_len-len(s_tokened)))

attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded).to(torch.int64).cuda()
input_ids = torch.unsqueeze(input_ids, 0)
attention_mask = torch.tensor(attention_mask).to(torch.int64).cuda()
attention_mask = torch.unsqueeze(attention_mask, 0)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

features = last_hidden_states[0][:,0,:].cpu().numpy()
print(features.shape) 
pred = lr_clf.predict(features)
pred


(1, 768)


array([1])