<a href="https://colab.research.google.com/github/dunliangyang2010/Deep-Learning-practice/blob/master/BERT2_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab magic: select the TensorFlow 1.x runtime for this session.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
# Mount Google Drive at /content/drive; the model files and data are read from there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install bert4keras, which provides the model builder, tokenizer and padding helper imported below.
!pip install bert4keras



In [None]:
import pandas as pd
import numpy as np
import os

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.snippets import sequence_padding

from keras import layers, models, utils, losses, optimizers, callbacks

Using TensorFlow backend.


In [None]:
# Basic parameters
maxlen = 256  # maximum sequence length passed to tokenizer.encode
batch_size = 16  # examples per batch produced by the data generators
epochs = 10000  # training epoch budget

# Project folder on the mounted Drive and the pretrained Chinese BERT directory inside it.
FOLDER_PATH = '/content/drive/MyDrive/class/勞動部/05-Transformer/BERT'
BERT_MODEL_PATH = os.path.join(FOLDER_PATH, 'chinese_L-12_H-768_A-12')

In [None]:
# BERT configuration: model config, checkpoint prefix and vocabulary file
config_path = os.path.join(BERT_MODEL_PATH, 'bert_config.json')
checkpoint_path = os.path.join(BERT_MODEL_PATH, 'bert_model.ckpt')
dict_path = os.path.join(BERT_MODEL_PATH, 'vocab.txt')

In [None]:
# Data: read the negative and positive review spreadsheets (no header row),
# label them, and merge them into one shuffled frame.
SEED = 42  # fixed so the shuffle, and therefore the train/val split, is reproducible
sentiment_dir = os.path.join(FOLDER_PATH, 'sentiment')
df_neg = pd.read_excel(os.path.join(sentiment_dir, 'neg_trad.xlsx'), header=None, index_col=None)
df_pos = pd.read_excel(os.path.join(sentiment_dir, 'pos_trad.xlsx'), header=None, index_col=None)
df_pos['mark'] = 1  # positive reviews
df_neg['mark'] = 0  # negative reviews
df_all = pd.concat([df_pos, df_neg], ignore_index=True)
# Seeded shuffle: without a seed every kernel restart produces a different
# train/val split, so resuming from saved weights would train on rows that
# were previously used for validation.
df_all = df_all.sample(frac=1, random_state=SEED).reset_index(drop=True)
df_all.head()

Unnamed: 0,0,mark
0,"外觀漂亮,各功能指示燈也很好看,價格較中關村划算一些。給女孩子用較適合,看電影、聽音樂音效還可以",1
1,買了此套書後，女兒非常喜歡，我本人也比較喜歡，經常陪著孩子一起閱讀，精美的圖畫和生動的語言讓...,0
2,這本書沒有我想象的好，書本身的質量和印刷還可以 ，但內容與書名不符，應該改名字；而且有很多牽...,0
3,一直以來對日本這個民族沒有好感，但是，對他的文化始終有濃厚的興趣，各方面的信息拉拉雜雜也了解...,1
4,我真的建議所有到成都出差的朋友千萬不要入住這家酒店，首先這個酒店的位置實在不好，附近基本上沒...,0


In [None]:
# Split data: the last `val_ratio` share of the shuffled frame is held out for validation.
val_ratio = 0.2
n_val = int(len(df_all) * val_ratio)
# Slice with an explicit boundary instead of a negative offset: `iloc[:-0]` is
# empty and `iloc[-0:]` is the whole frame, so the negative form misbehaves
# whenever the validation count rounds down to zero.
split_at = len(df_all) - n_val
df_train = df_all.iloc[:split_at, :]
df_val = df_all.iloc[split_at:, :]
df_train.shape, df_val.shape

((16884, 2), (4221, 2))

In [None]:
# Load the simplified vocabulary and build the tokenizer.
# `keep_tokens` is passed to the model builder later so only these tokens are kept.
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

In [None]:
# Preview only the first entries; dumping the full vocabulary floods the notebook output.
dict(list(token_dict.items())[:20])

{'[PAD]': 0,
 '[UNK]': 1,
 '[CLS]': 2,
 '[SEP]': 3,
 '!': 4,
 '"': 5,
 '#': 6,
 '$': 7,
 '%': 8,
 '&': 9,
 "'": 10,
 '(': 11,
 ')': 12,
 '*': 13,
 '+': 14,
 ',': 15,
 '-': 16,
 '.': 17,
 '/': 18,
 '0': 19,
 '1': 20,
 '2': 21,
 '3': 22,
 '4': 23,
 '5': 24,
 '6': 25,
 '7': 26,
 '8': 27,
 '9': 28,
 ':': 29,
 ';': 30,
 '<': 31,
 '=': 32,
 '>': 33,
 '?': 34,
 '@': 35,
 '[': 36,
 '\\': 37,
 ']': 38,
 '^': 39,
 '_': 40,
 'a': 41,
 'b': 42,
 'c': 43,
 'd': 44,
 'e': 45,
 'f': 46,
 'g': 47,
 'h': 48,
 'i': 49,
 'j': 50,
 'k': 51,
 'l': 52,
 'm': 53,
 'n': 54,
 'o': 55,
 'p': 56,
 'q': 57,
 'r': 58,
 's': 59,
 't': 60,
 'u': 61,
 'v': 62,
 'w': 63,
 'x': 64,
 'y': 65,
 'z': 66,
 '{': 67,
 '|': 68,
 '}': 69,
 '~': 70,
 '£': 71,
 '¤': 72,
 '¥': 73,
 '§': 74,
 '©': 75,
 '«': 76,
 '®': 77,
 '°': 78,
 '±': 79,
 '²': 80,
 '³': 81,
 'µ': 82,
 '·': 83,
 '¹': 84,
 'º': 85,
 '»': 86,
 '¼': 87,
 '×': 88,
 'ß': 89,
 'æ': 90,
 '÷': 91,
 'ø': 92,
 'đ': 93,
 'ŋ': 94,
 'ɔ': 95,
 'ə': 96,
 'ɡ': 97,
 'ʰ': 98,
 'ˇ

In [None]:
def data_generator(df, batch_size):
    """Endless batch generator for ``fit_generator``.

    Args:
        df: DataFrame whose column at position 0 holds the text and whose
            column at position 1 holds the label.
        batch_size: number of examples per yielded batch (must be >= 1).

    Yields:
        ``([token_ids, segment_ids], labels)`` where the two id arrays are the
        per-example encodings from ``tokenizer.encode`` passed through
        ``sequence_padding``, and ``labels`` is an array of shape
        ``(batch_size, 1)``.

    Raises:
        ValueError: if ``df`` is empty or ``batch_size`` is not positive.

    The frame is reshuffled (as a copy; the caller's frame is not modified)
    every time the read position wraps back to the first row, so a batch that
    straddles the wrap mixes rows from two shuffles.
    """
    n = len(df)
    if n == 0:
        # Otherwise the first row lookup fails with an opaque IndexError.
        raise ValueError('data_generator needs a non-empty DataFrame')
    if batch_size < 1:
        # Otherwise every batch would be empty and padding would fail.
        raise ValueError('batch_size must be a positive integer')

    i = 0
    while True:
        batch_token_ids, batch_segment_ids, labels = [], [], []
        for _ in range(batch_size):
            if i == 0:
                # Start of a pass over the data: shuffle
                df = df.sample(frac=1).reset_index(drop=True)
            token_ids, segment_ids = tokenizer.encode(
                df.iloc[i, 0], maxlen=maxlen
            )
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            labels.append(df.iloc[i, 1])
            i = (i + 1) % n

        batch_token_ids = sequence_padding(batch_token_ids)
        batch_segment_ids = sequence_padding(batch_segment_ids)
        # Labels get a trailing axis so they line up with the (batch, 1) sigmoid output.
        yield [batch_token_ids, batch_segment_ids], np.expand_dims(np.array(labels), axis=-1)

In [None]:
# Smoke test: pull one single-example batch from the full data set to inspect
# the structure the generator yields ([token ids, segment ids], label).
train_generator = data_generator(df_all, batch_size=1)
next(train_generator)

([array([[   2, 4056,  682, 7836, 6741, 6413, 6755, 7836, 2612, 2667, 8023,
          5896,  576, 4536, 1504, 1992,   15, 2480, 7836, 6815, 3658, 1066,
            34, 2667, 2491, 5763, 4398, 6755,  661, 3190,   15, 2259, 3205,
          5441, 1660, 6755,  841, 6765, 3212, 1082, 6741, 1066,   17, 6240,
          6240,   17,    3]]),
  array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0]])],
 array([[0]]))

In [None]:
# Load pretrained BERT as a plain encoder so the output is the sequence of
# hidden states. With application='unilm' bert4keras appends the MLM head,
# so position 0 would be a vocabulary-sized token distribution rather than
# the [CLS] representation the classifier needs.
bert_model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='encoder',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens (simplified vocabulary)
)

# To train only the classification head, freeze the BERT layers first:
#     for layer in bert_model.layers:
#         layer.trainable = False

# Binary sentiment head on the first position, which is the [CLS] token
# the tokenizer puts at the start of every sequence.
x = layers.Lambda(lambda seq: seq[:, 0])(bert_model.output)
pred = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(bert_model.input, pred)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# Print the layer-by-layer summary of the classifier (shapes and parameter counts).
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             10432512    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the classifier.
model.compile(
    loss=losses.binary_crossentropy,
    optimizer=optimizers.Adam(1e-5), # small learning rate for fine-tuning the pretrained weights
    metrics=['accuracy']
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
# Endless batch generators over the training and validation splits.
data_gen_train = data_generator(df_train, batch_size)
data_gen_val = data_generator(df_val, batch_size)

In [None]:
# Keep only the best weights seen so far (ModelCheckpoint monitors val_loss by default).
checkpoint = callbacks.ModelCheckpoint(
    'sentiment.weights', save_best_only=True, save_weights_only=True
)

# The generators are endless, so the epoch length is set explicitly:
# one epoch is one full pass over the respective split.
model.fit_generator(
    data_gen_train,
    steps_per_epoch=len(df_train) // batch_size,
    epochs=epochs,  # use the configured constant instead of repeating the literal
    validation_data=data_gen_val,
    validation_steps=len(df_val) // batch_size,
    callbacks=[checkpoint],
)


Epoch 1/10000

KeyboardInterrupt: ignored

In [None]:
# Resume training: the model must already be built and compiled by the cells above.
# Load the weights written by the ModelCheckpoint callback of the training cell.
# NOTE(review): the original loaded 'model.weights', which no visible cell
# writes; confirm 'sentiment.weights' is the intended checkpoint.
model.load_weights('sentiment.weights')

# Continue from epoch 120 up to epoch 1000. No ModelCheckpoint is attached here
# because a fresh callback has no memory of the previous best score and would
# overwrite the saved weights after the first resumed epoch.
model.fit_generator(
    data_gen_train,
    steps_per_epoch=len(df_train) // batch_size,
    initial_epoch=120,
    epochs=1000,
    validation_data=data_gen_val,
    validation_steps=len(df_val) // batch_size,
)