In [1]:
import json
import random
from datetime import datetime

import bert
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# Read the training and test event tables from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Display the training frame (the cell's last expression is rendered as its output).
train

Unnamed: 0,stock_id,stock_name,event_date,TCRI(m/y),importance,class,category,event_content
0,1218,泰山,20190101,6(2018/09),0,M_經營層,MT06_高管異動,發言人林俐婉內部調動，由江巍峰接任。
1,1503,士電,20190101,4(2018/09),0,M_經營層,MT06_高管異動,內部稽核主管林志強內部調動，由莊文清接任。
2,1504,東元,20190101,4(2018/09),0,M_經營層,MT06_高管異動,會計主管藍俊雄內部調動，由林鴻名接任。
3,1709,和益,20190101,5(2018/09),0,M_經營層,MT06_高管異動,內部稽核主管游本詮內部調動，由曾筱茜接任。
4,1721,三晃,20190101,7(2018/09),0,M_經營層,MT06_高管異動,財務經理洪廷宜內部調動，由王婷渝接任。
...,...,...,...,...,...,...,...,...
23698,6715,嘉基,20191231,6(2019/09),1,I_產業前景,IS01_營收變動或客戶/商品/通路策略,"2018年09月累計營收1,496,639千元，年增162%。2018年09月單月營收161..."
23699,6736,碩辣椒,20191231,,-1,F_市場交易,FS02_股價暴跌或異常,2019/12/31收盤價35.00元，月報酬下跌43%。
23700,8028,昇陽半導體,20191231,6(2019/09),0,I_產業前景,IS01_營收變動或客戶/商品/通路策略,世界先進評估跨足晶圓薄化領域，搶進晶圓薄化市場機率高。晶圓薄化是晶圓製造後到封裝之間的一個重...
23701,8033,雷虎,20191231,8(2019/09),0,I_產業前景,IF03_財務協議/糾紛,1.雷虎科技與香港中澤文化投資股份有限公司仲裁案號(補充公告)：雷虎科技將子公司英屬維京群島...


In [4]:
# Carve the loaded test frame in two: the first half becomes the validation
# set, the remainder stays as the test set. Both get a fresh 0-based index.
val_size = len(test) // 2
val, test = (
    part.reset_index(drop=True)
    for part in (test.iloc[:val_size], test.iloc[val_size:])
)

# Tokenizer

In [5]:
# Build the BERT tokenizer from the vocabulary file stored with the checkpoint.
tokenizer = bert.bert_tokenization.FullTokenizer(
    vocab_file='../model/bert_zh_L-12_H-768_A-12_2/assets/vocab.txt',
)

In [6]:
# Keep a handle on the tokenizer's token -> id mapping and report its size.
vocab = tokenizer.vocab
vocab_size = len(vocab)
print('Size of dictionary:', vocab_size)

Size of dictionary: 21128


In [7]:
# Peek at 20 randomly drawn vocabulary entries together with their ids.
# (`random` is imported in the notebook's import cell.)
random_tokens = random.sample(list(vocab), 20)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
# Use `token_id`, not `id`: a top-level loop variable named `id` would replace
# the builtin `id()` for every cell executed afterwards.
for t, token_id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, token_id))

token               index          
-------------------------
遁                    6875
黝                    7952
盖                    4667
survey               8704
繭                    5259
##垂                 14852
##陶                 20435
##葺                 18937
秣                    4910
iso9001              9727
262                 10975
##ᆻ                 13487
钼                    7184
20000                8875
##斥                 16223
##gc                10817
##嗆                 14676
##憧                 15792
pinterest            8379
煅                    4199


# Data pre-processing

In [8]:
# Walk through the first three event texts and show every intermediate
# representation that is fed to BERT.
for sentence in train['event_content'].iloc[:3]:
    tokens = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    n_tokens = len(input_ids)
    segment_ids, mask_ids = [0] * n_tokens, [1] * n_tokens

    report = (
        ('sentence:', sentence),
        ('tokens:', tokens),
        ('input ids:', input_ids),
        ('segment ids:', segment_ids),
        ('mask_ids:', mask_ids),
    )
    for label, value in report:
        print(label, value)
    print()

sentence: 發言人林俐婉內部調動，由江巍峰接任。
tokens: ['[CLS]', '發', '言', '人', '林', '俐', '婉', '內', '部', '調', '動', '，', '由', '江', '巍', '峰', '接', '任', '。', '[SEP]']
input ids: [101, 4634, 6241, 782, 3360, 919, 2039, 1058, 6956, 6310, 1240, 8024, 4507, 3736, 2331, 2292, 2970, 818, 511, 102]
segment ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
mask_ids: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

sentence: 內部稽核主管林志強內部調動，由莊文清接任。
tokens: ['[CLS]', '內', '部', '稽', '核', '主', '管', '林', '志', '強', '內', '部', '調', '動', '，', '由', '莊', '文', '清', '接', '任', '。', '[SEP]']
input ids: [101, 1058, 6956, 4942, 3417, 712, 5052, 3360, 2562, 2485, 1058, 6956, 6310, 1240, 8024, 4507, 5800, 3152, 3926, 2970, 818, 511, 102]
segment ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
mask_ids: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

sentence: 會計主管藍俊雄內部調動，由林鴻名接任。
tokens: ['[CLS]', '會', '計', '主', '管', '藍', '俊', '雄', '內', '部', '調', '動', '，', '

In [9]:
def create_inputs(sentences, tokenizer, maxlen=None):
    """Convert raw sentences into padded BERT input arrays.

    Each sentence is tokenized, wrapped as ``[CLS] ... [SEP]`` and mapped to
    vocabulary ids. For every sentence a segment-id list of zeros and a mask
    list of ones is built with the same length as that sentence's ids. All
    three collections are then passed through ``pad_sequences`` with
    ``padding='post'`` and ``truncating='post'``.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        maxlen: Forwarded unchanged to ``pad_sequences`` as the target length.

    Returns:
        Tuple ``(input_ids, mask_ids, segment_ids)`` of int32 arrays. Note the
        order: the mask comes before the segment ids.
    """
    input_ids, segment_ids, mask_ids = [], [], []
    for sentence in sentences:
        tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
        inputs = tokenizer.convert_tokens_to_ids(tokens)

        # Size the per-sentence lists from this sentence's ids (`inputs`).
        # Using the length of the `input_ids` accumulator instead would make
        # the mask length equal to the number of sentences processed so far.
        n_tokens = len(inputs)
        input_ids.append(inputs)
        segment_ids.append([0] * n_tokens)
        mask_ids.append([1] * n_tokens)

    def _pad(sequences):
        # NOTE: post-truncation cuts from the end, so a sentence longer than
        # `maxlen` loses its trailing [SEP] token.
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, dtype='int32',
            padding='post', truncating='post')

    return _pad(input_ids), _pad(mask_ids), _pad(segment_ids)

In [10]:
# Encode the event text of each split into (input_ids, mask_ids, segment_ids),
# all padded / truncated to the same fixed length.
maxlen = 200
train_features, val_features, test_features = (
    create_inputs(frame['event_content'], tokenizer, maxlen=maxlen)
    for frame in (train, val, test)
)

In [11]:
# Pull the `importance` column of each split out as an independent array
# (copied, so later edits to the labels never touch the frames).
train_labels, val_labels, test_labels = (
    frame['importance'].values.copy()
    for frame in (train, val, test)
)

In [12]:
# Shift `importance` by +3 to obtain the class indices used for training
# (presumably importance spans -3..3, matching the 7-way classifier - TODO confirm).
# The labels are recomputed from the frames instead of using an in-place `+= 3`,
# so re-running this cell cannot shift them a second time.
train_labels = train['importance'].values + 3
val_labels = val['importance'].values + 3
test_labels = test['importance'].values + 3

# Model

In [13]:
def build_model(bert_layer, maxlen, training=False, num_classes=7, learning_rate=2e-6):
    """Build and compile a BERT-based softmax classifier.

    The model takes three int32 inputs of shape ``(maxlen,)`` (word ids,
    attention mask, segment ids), feeds them to ``bert_layer``, takes position
    0 of the sequence output and applies a dense softmax layer.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        maxlen: Fixed sequence length of the three inputs.
        training: Unused; kept so existing callers keep working.
        num_classes: Number of output classes of the softmax layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` using sparse categorical cross-entropy
        and reporting accuracy.
    """
    input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="segment_ids")
    model_inputs = [input_word_ids, input_mask, segment_ids]

    # The first returned element is discarded; only the sequence output is used.
    _, sequence_output = bert_layer(model_inputs)
    # Representation at position 0, i.e. where the [CLS] token sits.
    clf_output = sequence_output[:, 0, :]
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(clf_output)

    model = tf.keras.Model(inputs=model_inputs, outputs=out)

    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer, loss=loss, metrics=['accuracy'])
    return model

In [14]:
# Load the checkpoint as a fine-tunable layer and assemble the classifier around it.
bert_layer = hub.KerasLayer(
    '../model/bert_zh_L-12_H-768_A-12_2/',
    trainable=True,
)
model = build_model(bert_layer=bert_layer, maxlen=maxlen)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 200)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 200)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 102267649   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [None]:
# TensorBoard logs go to a per-day directory.
log_dir = "logs/fit/watch_dog_" + datetime.now().strftime("%Y%m%d")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Stop once validation accuracy has not improved for 2 epochs, and roll the
# model back to its best epoch so that what gets saved afterwards is not the
# weights from the non-improving epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                  patience=2,
                                                  restore_best_weights=True)
# The TensorBoard callback is now registered; previously it was created but
# never passed to `fit`, so nothing was logged.
model.fit(train_features, train_labels,
          epochs=50,
          batch_size=4,
          validation_data=(val_features, val_labels),
          callbacks=[early_stopping, tensorboard_callback])

Epoch 1/50

In [None]:
# Persist the current model under the path 'bert_whole'.
model.save(filepath='bert_whole')

In [19]:
class Model(tf.keras.Model):
    """Frozen BERT encoder followed by a small dense head with one sigmoid output.

    The BERT hub layer is loaded with ``trainable=False``; only the three dense
    layers (1024 -> 1024 -> 1) are trainable.
    """

    def __init__(self):
        super().__init__()
        self.bert_layer = hub.KerasLayer('../model/bert_zh_L-12_H-768_A-12_2/', trainable=False)
        self.fc1 = tf.keras.layers.Dense(1024, activation='relu')
        self.fc2 = tf.keras.layers.Dense(1024, activation='relu')
        self.fc3 = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=None, mask=None):
        """Run the forward pass.

        Args:
            inputs: Passed straight to the BERT hub layer (presumably the
                ``[input_word_ids, input_mask, segment_ids]`` list used
                elsewhere in this notebook - confirm against the caller).
            training: Accepted for Keras API compatibility; not used here.
            mask: Accepted for Keras API compatibility; not used here.

        Returns:
            Output of the final sigmoid layer, one value per example.
        """
        # The hub layer returns a (pooled_output, sequence_output) pair, so it
        # must be unpacked before slicing; indexing the pair itself with
        # `[:, 0, :]` raises.
        _, sequence_output = self.bert_layer(inputs)
        x = sequence_output[:, 0, :]  # representation at position 0 ([CLS])
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [27]:
# Instantiate the frozen-BERT model and compile it for a single sigmoid output.
model = Model()
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-6)
# NOTE(review): binary cross-entropy expects 0/1 targets, while the labels built
# earlier in this notebook are shifted `importance` values - confirm which
# targets this model is meant to be trained on.
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer, loss=loss, metrics=['accuracy'])