<a href="https://colab.research.google.com/github/dunliangyang2010/Deep-Learning-practice/blob/master/BERT_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the pre-trained model

In [None]:
!pip install keras-bert

Collecting keras-bert
  Downloading keras-bert-0.88.0.tar.gz (26 kB)
Collecting keras-transformer>=0.39.0
  Downloading keras-transformer-0.39.0.tar.gz (11 kB)
Collecting keras-pos-embd>=0.12.0
  Downloading keras-pos-embd-0.12.0.tar.gz (6.0 kB)
Collecting keras-multi-head>=0.28.0
  Downloading keras-multi-head-0.28.0.tar.gz (14 kB)
Collecting keras-layer-normalization>=0.15.0
  Downloading keras-layer-normalization-0.15.0.tar.gz (4.2 kB)
Collecting keras-position-wise-feed-forward>=0.7.0
  Downloading keras-position-wise-feed-forward-0.7.0.tar.gz (4.5 kB)
Collecting keras-embed-sim>=0.9.0
  Downloading keras-embed-sim-0.9.0.tar.gz (4.1 kB)
Collecting keras-self-attention>=0.50.0
  Downloading keras-self-attention-0.50.0.tar.gz (12 kB)
Building wheels for collected packages: keras-bert, keras-transformer, keras-embed-sim, keras-layer-normalization, keras-multi-head, keras-pos-embd, keras-position-wise-feed-forward, keras-self-attention
  Building wheel for keras-bert (setup.py) ... [?

In [None]:
import numpy as np
import tensorflow as tf
from keras_bert import load_vocabulary, load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths
from keras_bert.datasets import get_pretrained, PretrainedList
from pprint import pprint

In [None]:
# Download the pretrained Chinese BERT-base model; get_pretrained returns the
# local model directory, which is displayed below.
model_path = get_pretrained(PretrainedList.chinese_base)
model_path

Downloading data from https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip


'/root/.keras/datasets/chinese_L-12_H-768_A-12'

In [None]:
# The PretrainedList entry is the download URL of the model archive
PretrainedList.chinese_base

'https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip'

In [None]:
# Resolve the config, checkpoint and vocabulary file paths inside the model directory
paths = get_checkpoint_paths(model_path)
pprint(paths)

CheckpointPaths(config='/root/.keras/datasets/chinese_L-12_H-768_A-12/bert_config.json', checkpoint='/root/.keras/datasets/chinese_L-12_H-768_A-12/bert_model.ckpt', vocab='/root/.keras/datasets/chinese_L-12_H-768_A-12/vocab.txt')


In [None]:
# Build the BERT model from the checkpoint with a fixed input length of 20 tokens
model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=20)

In [None]:
# Inspect the layer-by-layer architecture of the loaded BERT model
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input-Token (InputLayer)       [(None, 20)]         0           []                               
                                                                                                  
 Input-Segment (InputLayer)     [(None, 20)]         0           []                               
                                                                                                  
 Embedding-Token (TokenEmbeddin  [(None, 20, 768),   16226304    ['Input-Token[0][0]']            
 g)                              (21128, 768)]                                                    
                                                                                                  
 Embedding-Segment (Embedding)  (None, 20, 768)      1536        ['Input-Segment[0][0]']    

In [None]:
# Load the vocabulary as a token -> index mapping and show its size
token_dict = load_vocabulary(paths.vocab)
len(token_dict)

21128

In [None]:
# Look up the indices of two special tokens and one ordinary character
token_dict['[PAD]'], token_dict['[CLS]'], token_dict['醒']

(0, 101, 7008)

In [None]:
# Preview only the first few entries: the vocabulary holds 21128 tokens, and
# pretty-printing all of them floods the cell until the output is truncated.
pprint(list(token_dict.items())[:20])

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
 '敎': 3129,
 '敏': 3130,
 '救': 3131,
 '敕': 3132,
 '敖': 3133,
 '敗': 3134,
 '敘': 3135,
 '教': 3136,
 '敛': 3137,
 '敝': 3138,
 '敞': 3139,
 '敢': 3140,
 '散': 3141,
 '敦': 3142,
 '敬': 3143,
 '数': 3144,
 '敲': 3145,
 '整': 3146,
 '敵': 3147,
 '敷': 3148,
 '數': 3149,
 '斂': 3150,
 '斃': 3151,
 '文': 3152,
 '斋': 3153,
 '斌': 3154,
 '斎': 3155,
 '斐': 3156,
 '斑': 3157,
 '斓': 3158,
 '斗': 3159,
 '料': 3160,
 '斛': 3161,
 '斜': 3162,
 '斟': 3163,
 '斡': 3164,
 '斤': 3165,
 '斥': 3166,
 '斧': 3167,
 '斩': 3168,
 '斫': 3169,
 '斬': 3170,
 '断': 3171,
 '斯': 3172,
 '新': 3173,
 '斷': 3174,
 '方': 3175,
 '於': 3176,
 '施': 3177,
 '旁': 3178,
 '旃': 3179,
 '旅': 3180,
 '旋': 3181,
 '旌': 3182,
 '旎': 3183,
 '族': 3184,
 '旖': 3185,
 '旗': 3186,
 '无': 3187,
 '既': 3188,
 '日': 3189,
 '旦': 3190,
 '旧': 3191,
 '旨': 3192,
 '早': 3193,
 '旬': 3194,
 '旭': 3195,
 '旮': 3196,
 '旱': 3197,
 '时': 3198,
 '旷': 3199,
 '旺': 3200,
 '旻': 3201,
 '昀': 3202,
 '昂': 3203,
 '昆': 3204,
 '昇': 3205,
 '昉': 3206,
 '昊': 3207,
 '昌': 3208,
 '明':

# Tokenization - Approach 1: Default tokenizer

In [None]:
# Tokenize a sample sentence with the stock keras-bert tokenizer
text = '醒醒吧，你沒有妹妹'
tokenizer = Tokenizer(token_dict)
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)

Tokens: ['[CLS]', '醒', '醒', '吧', '，', '你', '沒', '有', '妹', '妹', '[SEP]']


In [None]:
# Encode the sentence into token indices and segment ids; both lists are
# padded with 0 up to max_len (see the output below).
indices, segments = tokenizer.encode(first=text, max_len=20)
indices, segments

([101,
  7008,
  7008,
  1416,
  8024,
  872,
  3760,
  3300,
  1987,
  1987,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Tokenization - Approach 2: Custom tokenizer

In [None]:
# Short aliases for the three checkpoint files, reused by the cells below
config_path, checkpoint_path, dict_path = paths.config, paths.checkpoint, paths.vocab

# Echo each path next to its label
for label, value in (
    ('config_path: ', config_path),
    ('checkpoint_path: ', checkpoint_path),
    ('dict_path : ', dict_path),
):
    print(label, value)

config_path:  /root/.keras/datasets/chinese_L-12_H-768_A-12/bert_config.json
checkpoint_path:  /root/.keras/datasets/chinese_L-12_H-768_A-12/bert_model.ckpt
dict_path :  /root/.keras/datasets/chinese_L-12_H-768_A-12/vocab.txt


In [None]:
import codecs

# Build the token -> index dictionary by hand: vocab.txt holds one token per
# line, and each token is assigned the number of tokens collected so far.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

# Show the size and a short preview instead of echoing the whole dictionary,
# which would flood the output with every vocabulary entry.
len(token_dict), list(token_dict.items())[:10]

{'[PAD]': 0,
 '[unused1]': 1,
 '[unused2]': 2,
 '[unused3]': 3,
 '[unused4]': 4,
 '[unused5]': 5,
 '[unused6]': 6,
 '[unused7]': 7,
 '[unused8]': 8,
 '[unused9]': 9,
 '[unused10]': 10,
 '[unused11]': 11,
 '[unused12]': 12,
 '[unused13]': 13,
 '[unused14]': 14,
 '[unused15]': 15,
 '[unused16]': 16,
 '[unused17]': 17,
 '[unused18]': 18,
 '[unused19]': 19,
 '[unused20]': 20,
 '[unused21]': 21,
 '[unused22]': 22,
 '[unused23]': 23,
 '[unused24]': 24,
 '[unused25]': 25,
 '[unused26]': 26,
 '[unused27]': 27,
 '[unused28]': 28,
 '[unused29]': 29,
 '[unused30]': 30,
 '[unused31]': 31,
 '[unused32]': 32,
 '[unused33]': 33,
 '[unused34]': 34,
 '[unused35]': 35,
 '[unused36]': 36,
 '[unused37]': 37,
 '[unused38]': 38,
 '[unused39]': 39,
 '[unused40]': 40,
 '[unused41]': 41,
 '[unused42]': 42,
 '[unused43]': 43,
 '[unused44]': 44,
 '[unused45]': 45,
 '[unused46]': 46,
 '[unused47]': 47,
 '[unused48]': 48,
 '[unused49]': 49,
 '[unused50]': 50,
 '[unused51]': 51,
 '[unused52]': 52,
 '[unused53]': 53

In [None]:
# Tokenize one character at a time so the token sequence always has the same length as the input sentence

class OurTokenizer(Tokenizer):
    """Character-level tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Return a list with one vocabulary token for each character of ``text``.

        A character present in the vocabulary is kept unchanged; a character
        for which ``_is_space`` is true becomes ``'[unused1]'``; any other
        character becomes ``'[UNK]'``.
        """
        def to_token(ch):
            if ch in self._token_dict:
                return ch
            if self._is_space(ch):
                # [unused*] slots are free for new symbols; [unused1] stands in for spaces
                return '[unused1]'
            return '[UNK]'

        return [to_token(ch) for ch in text]

# Rebind `tokenizer` to the character-level tokenizer for the cells below
tokenizer = OurTokenizer(token_dict)

In [None]:
# Sample sentence to tokenize with the custom tokenizer
text = '醒醒吧，你沒有妹妹'
# Alternative sample sentences -- uncomment one to try it instead:
# text = '庭院深深深幾許'
# text = '妹妹'
# text = '穿衣服賣衣服'
tokens = tokenizer.tokenize(text)
tokens

['[CLS]', '醒', '醒', '吧', '，', '你', '沒', '有', '妹', '妹', '[SEP]']

In [None]:
# Encode with the custom tokenizer: token indices and segment ids, each
# padded with 0 up to max_len (see the output below).
indices, segments = tokenizer.encode(first=text, max_len=20)
indices, segments

([101,
  7008,
  7008,
  1416,
  8024,
  872,
  3760,
  3300,
  1987,
  1987,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Extract Embedding

In [None]:
# Feed a batch containing this single sentence through BERT; [0] selects the
# output for that sentence (one vector per input position, shape printed below).
predicts = model.predict([np.array([indices]), np.array([segments])])[0]
print(predicts.shape)

(20, 768)


In [None]:
# Print each token next to the first three components of its output vector
for position, tok in enumerate(tokens):
    print(tok, predicts[position, :3].tolist())

[CLS] [0.37912076711654663, 0.6134757399559021, -0.3767380714416504]
醒 [-0.2556297183036804, 0.6826424598693848, -1.1576611995697021]
醒 [0.8524934649467468, 0.2761129140853882, -1.3342105150222778]
吧 [1.3835668563842773, 0.5293526649475098, -0.8468607068061829]
， [1.2186630964279175, 0.7749003171920776, -0.9864788055419922]
你 [1.3306928873062134, -0.3070710599422455, -0.22489207983016968]
沒 [2.355961561203003, 0.15383456647396088, 1.1854667663574219]
有 [0.9452126622200012, 0.38598328828811646, -0.2148783802986145]
妹 [0.2918621301651001, 0.6621036529541016, -0.8666204214096069]
妹 [1.013543725013733, 0.7323691248893738, -0.2728177607059479]
[SEP] [0.13499897718429565, 0.3240869641304016, -0.36382871866226196]


# Case 1: Many-to-One
## Sentiment analysis, document classification, etc.

In [None]:
from tensorflow.keras import layers, models

# seq_len=None leaves the sequence dimension unspecified so inputs of varying
# length are accepted (sequences must still be no longer than 512 tokens).
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# Fine-tune BERT: mark every BERT layer as trainable.
# Use False here instead to freeze BERT and train only the Dense head below.
for bert_layer in bert_model.layers:
    bert_layer.trainable = True

# Two variable-length inputs: token indices and segment ids
input_token = layers.Input(shape=(None,))
input_segment = layers.Input(shape=(None,))

x = bert_model([input_token, input_segment])
# Keep only position 0, i.e. the output vector of [CLS]
x = layers.Lambda(lambda seq: seq[:, 0])(x)
# Single sigmoid unit for binary classification
p = layers.Dense(1, activation='sigmoid')(x)

model = models.Model([input_token, input_segment], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(1e-5),  # small learning rate for fine-tuning
    metrics=['accuracy']
)

In [None]:
# Inspect the classifier: BERT, the Lambda that slices out [CLS], and the Dense head
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
model_7 (Functional)            (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 768)          0           model_7[0][0]              