In [1]:
#-*- coding : utf-8 -*-
import os
import re
import time
import logging
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf

import jieba
import jieba.posseg as pseg
from jieba.analyse import *

from keras.layers import *
from keras.models import Model
from keras.callbacks import *
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence
import keras.backend as K
import keras.backend.tensorflow_backend as KTF
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score

warnings.filterwarnings('ignore')
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


# 创建一个logger
file_path = 'log/'
logger = logging.getLogger('mylogger')
logger.setLevel(logging.DEBUG)
# 创建一个handler
timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())
fh = logging.FileHandler(file_path + 'log_' + timestamp +'.txt')
fh.setLevel(logging.DEBUG)
# 再创建一个handler，用于输出到控制台
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# 定义handler的输出格式
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# 给logger添加handler
logger.addHandler(fh)
logger.addHandler(ch)


# set some global params
learning_rate = 5e-5
min_learning_rate = 1e-5
config_path = '../bert_base/bert_pretrained_model/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../bert_base/bert_pretrained_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../bert_base/bert_pretrained_model/chinese_L-12_H-768_A-12/vocab.txt'
MAX_LEN = 150
foldnum = 5
BATCHSIZE = 8
n_class = 8

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('data/train_data.csv', delimiter="\t")
label_map = {"NoneType":0,"医学专科":1,"检查科目":2,"疾病":3,"病毒":4,"症状":5,"细菌":6,"药物":7}
train_data['Type'] = train_data['Type'].apply(lambda x:label_map[x])
train_data['Type'].value_counts()

7    2550
3    1266
5     866
2     144
0     100
6      52
4      10
1       4
Name: Type, dtype: int64

In [3]:
# 创建分词器
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
tokenizer = Tokenizer(token_dict)


def clean(text):
    r='[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~]+'
    text = re.sub(r, "", text)       # 去除标点
    text = text.replace("本词条内容尚未完善，欢迎各位编辑词条，贡献自己的专业知识！", "")       # 去除无意义的词语
    text = re.sub(r"\s+", " ", text)  # 合并正文中过多的空格
    return text.strip()
    

# 读取数据集
label_map = {"NoneType":0,"医学专科":1,"检查科目":2,"疾病":3,"病毒":4,"症状":5,"细菌":6,"药物":7}
train_df = pd.read_csv('data/train_data.csv', delimiter="\t")
train_df['Type'] = train_df['Type'].apply(lambda x:label_map[x])
print("train data: " + str(len(train_df)))
test_df = pd.read_csv('data/test_data.csv', delimiter="\t")
print("test data: " + str(len(test_df)))

# 缺失值处理
train_df["Descreption"] = train_df["Descreption"].fillna('')
test_df["Descreption"] = test_df["Descreption"].fillna('')

# 数据清洗
train_df["Descreption"] = train_df["Descreption"].apply(clean)
test_df["Descreption"] = test_df["Descreption"].apply(clean)

# 特征
train_fea = train_df['Descreption'].values
test_fea = test_df['Descreption'].values
print(train_fea[0])

# 类标
labels = train_df['Type']  # 数字
labels_cat = to_categorical(labels)  # 数组
labels_cat = labels_cat.astype(np.int32)
print(labels_cat[0])


train data: 4992
test data: 2974
A医学百科替加氟胶囊条目介绍替加氟胶囊的功效作用，替加氟胶囊的副作用和服用方法等。替加氟胶囊（TegafurCapsules），主要治疗消化道肿瘤，对胃癌、结肠癌、直肠癌有一定疗效。也可用于替加氟胶囊替加氟胶囊，成都通德药业有限公司生产制造，可用于膀胱癌前列腺癌肾癌等。替加氟胶囊，适应症为抗肿瘤药。适用于消化道肿瘤，如胃癌、结肠癌和胰腺癌，也可用于乳腺癌、支气管肺癌和原发性肝癌等。
[0 0 0 0 0 0 0 1]


In [4]:
# data generator for generating bathc data for traning
class data_generator:
    def __init__(self, data, batch_size=BATCHSIZE):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data[0]) // self.batch_size
        if len(self.data[0]) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            X, y = self.data
            idxs = list(range(len(self.data[0])))
            np.random.shuffle(idxs)
            T, T_, Y = [], [], []
            for c, i in enumerate(idxs):
                text = X[i]
                t, t_ = tokenizer.encode(first=text, max_len = MAX_LEN)
                T.append(t)
                T_.append(t_)
                Y.append(y[i])
                if len(T) == self.batch_size or i == idxs[-1]:
                    T = np.array(T)
                    T_ = np.array(T_)
                    # T = sequence.pad_sequences(T, maxlen=MAX_LEN, padding='post', truncating='post')
                    # T_ = sequence.pad_sequences(T_, maxlen=MAX_LEN, padding='post', truncating='post')
                    Y = np.array(Y)
                    yield [T, T_], Y
                    T, T_, Y = [], [], []
                    
                    
def get_model():
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    for l in bert_model.layers:
        l.trainable = True

    T1 = Input(shape=(None,))
    T2 = Input(shape=(None,))

    T = bert_model([T1, T2])

    T = Lambda(lambda x: x[:, 0])(T)
    T = Dropout(0.5)(T)
    T = Dropout(0.5)(T)
    # t3 = Dropout(0.5)(T)
    # t4 = Dropout(0.5)(T)
    # t5 = Dropout(0.5)(T)
    # merged = concatenate([t1,t2,t3,t4,t5])
    # outputmerged = Dense(8, activation='relu')(merged)
    output = Dense(n_class, activation='softmax')(T)

    model = Model([T1, T2], output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # 用足够小的学习率
        metrics=['accuracy']
    )
    model.summary()
    return model


class Evaluate(Callback):

    def __init__(self, val_data, val_index):
        self.score = []
        self.best = 0.
        self.early_stopping = 0
        self.val_data = val_data
        self.val_index = val_index
        self.predict = []
        self.lr = 0
        self.passed = 0

    def on_batch_begin(self, batch, logs=None):
        """第一个epoch用来warmup，第二个epoch把学习率降到最低
        """
        if self.passed < self.params['steps']:
            self.lr = (self.passed + 1.) / self.params['steps'] * learning_rate
            K.set_value(self.model.optimizer.lr, self.lr)
            self.passed += 1
        elif self.params['steps'] <= self.passed < self.params['steps'] * 2:
            self.lr = (2 - (self.passed + 1.) / self.params['steps']) * (learning_rate - min_learning_rate)
            self.lr += min_learning_rate
            K.set_value(self.model.optimizer.lr, self.lr)
            self.passed += 1

    def on_epoch_end(self, epoch, logs=None):
        score, acc, f1 = self.evaluate()
        if score > self.best:
            self.best = score
            self.early_stopping = 0
            model.save_weights('model_save_1/bert{}.w'.format(fold))
        else:
            self.early_stopping += 1
        logger.info('lr: %.6f, epoch: %d, score: %.4f, acc: %.4f, f1: %.4f,best: %.4f\n' % (self.lr, epoch, score, acc, f1, self.best))

    def evaluate(self):
        self.predict = []
        prob = []
        val_x, val_y, val_cat = self.val_data
        for i in tqdm(range(len(val_x))):
            test = val_x[i]

            t1, t1_ = tokenizer.encode(first=test, max_len=MAX_LEN)
            T1, T1_ = np.array([t1]), np.array([t1_])
            # T1 = sequence.pad_sequences([t1], maxlen=MAX_LEN, padding='post', truncating='post')
            # T1_ = sequence.pad_sequences([t1_], maxlen=MAX_LEN, padding='post', truncating='post')
            _prob = model.predict([T1, T1_])
            oof_train[self.val_index[i]] = _prob[0]
            self.predict.append(np.argmax(_prob, axis=1)[0]+1)
            prob.append(_prob[0])

        score = 1.0 / (1 + mean_absolute_error(val_y+1, self.predict))
        acc = accuracy_score(val_y+1, self.predict)
        f1 = f1_score(val_y+1, self.predict, average='macro')
        return score, acc, f1


def predict(data):
    prob = []
    val_x = data
    for i in tqdm(range(len(val_x))):
        text = val_x[i]
        t1, t1_ = tokenizer.encode(first=text, max_len=MAX_LEN)
        T1, T1_ = np.array([t1]), np.array([t1_])
        # T1 = sequence.pad_sequences([t1], maxlen=MAX_LEN, padding='post', truncating='post')
        # T1_ = sequence.pad_sequences([t1_], maxlen=MAX_LEN, padding='post', truncating='post')

        _prob = model.predict([T1, T1_])
        prob.append(_prob[0])
    return prob

In [5]:
# 训练模型
skf = StratifiedKFold(n_splits=foldnum, shuffle=True, random_state=42)

oof_train = np.zeros((len(train_df), n_class), dtype=np.float32)
oof_test = np.zeros((len(test_df), n_class), dtype=np.float32)

for fold, (train_index, valid_index) in enumerate(skf.split(train_fea, labels)):
    logger.info('================     fold {}        ==============='.format(fold))
    x = train_fea[train_index]
    y = labels_cat[train_index]

    val_x = train_fea[valid_index]
    val_y = labels[valid_index]
    val_cat = labels_cat[valid_index]

    train_D = data_generator([x, y])
    evaluator = Evaluate([val_x, val_y, val_cat], valid_index)
    model = get_model()
    # if os.path.exists('./model_save/bert{}.w'.format(fold)):
    #     model.load_weights('./model_save/bert{}.w'.format(fold))
    #model = ParallelModel(model, 2)
    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=3,
                        callbacks=[evaluator]
                       )
    model.load_weights('model_save_1/bert{}.w'.format(fold))
    oof_test += predict(test_fea)
    K.clear_session()
    oof_test /= foldnum







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
_______________________________________________________________________________________

100%|██████████| 999/999 [00:20<00:00, 47.85it/s]
[2020-06-05 17:02:07,156][INFO] ## lr: 0.000050, epoch: 0, score: 0.8215, acc: 0.9510, f1: 0.6112,best: 0.8215



Epoch 2/3


100%|██████████| 999/999 [00:19<00:00, 51.97it/s]
[2020-06-05 17:03:49,555][INFO] ## lr: 0.000010, epoch: 1, score: 0.8920, acc: 0.9670, f1: 0.7383,best: 0.8920



Epoch 3/3


100%|██████████| 999/999 [00:19<00:00, 52.21it/s]
[2020-06-05 17:05:29,573][INFO] ## lr: 0.000010, epoch: 2, score: 0.8936, acc: 0.9690, f1: 0.7641,best: 0.8936

100%|██████████| 2974/2974 [00:54<00:00, 54.96it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    
__________

100%|██████████| 999/999 [00:21<00:00, 45.47it/s]
[2020-06-05 17:09:16,715][INFO] ## lr: 0.000050, epoch: 0, score: 0.5453, acc: 0.8128, f1: 0.5699,best: 0.5453



Epoch 2/3


100%|██████████| 999/999 [00:18<00:00, 55.19it/s]
[2020-06-05 17:10:58,543][INFO] ## lr: 0.000010, epoch: 1, score: 0.9074, acc: 0.9650, f1: 0.7760,best: 0.9074



Epoch 3/3


100%|██████████| 999/999 [00:19<00:00, 52.38it/s]
[2020-06-05 17:12:38,806][INFO] ## lr: 0.000010, epoch: 2, score: 0.9182, acc: 0.9680, f1: 0.9557,best: 0.9182

100%|██████████| 2974/2974 [00:53<00:00, 55.19it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    
__________

100%|██████████| 998/998 [00:24<00:00, 41.27it/s]
[2020-06-05 17:16:23,731][INFO] ## lr: 0.000050, epoch: 0, score: 0.9173, acc: 0.9689, f1: 0.7979,best: 0.9173



Epoch 2/3


100%|██████████| 998/998 [00:20<00:00, 48.38it/s]
[2020-06-05 17:18:07,525][INFO] ## lr: 0.000010, epoch: 1, score: 0.9258, acc: 0.9749, f1: 0.8959,best: 0.9258



Epoch 3/3


100%|██████████| 998/998 [00:21<00:00, 46.38it/s]
[2020-06-05 17:19:49,823][INFO] ## lr: 0.000010, epoch: 2, score: 0.9318, acc: 0.9749, f1: 0.9105,best: 0.9318

100%|██████████| 2974/2974 [00:59<00:00, 49.59it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    
__________

100%|██████████| 998/998 [00:22<00:00, 44.80it/s]
[2020-06-05 17:23:38,746][INFO] ## lr: 0.000050, epoch: 0, score: 0.8537, acc: 0.9469, f1: 0.6416,best: 0.8537



Epoch 2/3


100%|██████████| 998/998 [00:20<00:00, 48.11it/s]
[2020-06-05 17:25:23,511][INFO] ## lr: 0.000010, epoch: 1, score: 0.9064, acc: 0.9619, f1: 0.8303,best: 0.9064



Epoch 3/3


100%|██████████| 998/998 [00:20<00:00, 48.47it/s]
[2020-06-05 17:27:05,124][INFO] ## lr: 0.000010, epoch: 2, score: 0.9241, acc: 0.9679, f1: 0.8422,best: 0.9241

100%|██████████| 2974/2974 [01:01<00:00, 48.21it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    
__________

100%|██████████| 998/998 [00:23<00:00, 42.85it/s]
[2020-06-05 17:31:01,220][INFO] ## lr: 0.000050, epoch: 0, score: 0.8762, acc: 0.9509, f1: 0.5481,best: 0.8762



Epoch 2/3


100%|██████████| 998/998 [00:19<00:00, 52.22it/s]
[2020-06-05 17:32:44,682][INFO] ## lr: 0.000010, epoch: 1, score: 0.9258, acc: 0.9719, f1: 0.7693,best: 0.9258



Epoch 3/3


100%|██████████| 998/998 [00:19<00:00, 51.55it/s]
[2020-06-05 17:34:22,121][INFO] ## lr: 0.000010, epoch: 2, score: 0.9106, acc: 0.9629, f1: 0.8255,best: 0.9258

100%|██████████| 2974/2974 [00:56<00:00, 52.75it/s]


In [8]:
np.savetxt('model_save_1/train_bert.txt', oof_train)
np.savetxt('model_save_1/test_bert.txt', oof_test)

cv_score = 1.0 / (1 + mean_absolute_error(labels+1, np.argmax(oof_train, axis=1) + 1))
print(cv_score)

label_map = {0:"NoneType",1:"医学专科",2:"检查科目",3:"疾病",4:"病毒",5:"症状",6:"细菌",7:"药物"}
test_df['Type'] = np.argmax(oof_test, axis=1)
test_df['Type'] = test_df['Type'].apply(lambda x: label_map[x])
test_df[['Name', 'Type']].to_csv('submit/baseline_1_{}.txt'.format(cv_score), sep='\t', header=None, index=False, encoding = "utf-8")

0.9154593801577113
