### 原始训练数据预处理

In [1]:
import pandas as pd
import json
import codecs
from collections import Counter
import warnings
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')

In [2]:
import os
# Set the NLS_LANG environment variable for this process.
# NOTE(review): nothing else in the visible notebook reads NLS_LANG; presumably
# a leftover from an environment with a database client — confirm it is needed.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

In [3]:
# Read the labelled training CSV as raw bytes and decode it line by line as
# GBK. Bytes that cannot be decoded are dropped (errors='ignore'), so decoding
# never raises and no per-line exception handling is required.
# The file is opened in binary mode so that lines are split on b'\n' only;
# any '\r' / '\n' characters at either end of a decoded line are stripped.
with open('./dataset/nCoV_100k_train.labled.csv', 'rb') as f:
    train_data = [
        codecs.decode(line, 'gbk', errors='ignore').strip('\r\n')
        for line in f
    ]

In [4]:
# Split every raw line on ',[' and keep only the lines that yield fewer than
# four pieces.
tmp_train_data = [
    pieces
    for pieces in (raw_line.split(',[') for raw_line in train_data)
    if len(pieces) < 4
]

# Build [id, content, label] rows:
#   id      -> first comma-separated field of the first piece
#   content -> fields 3.. of the first piece, concatenated without separators
#   label   -> last comma-separated field of the last piece
text = []
for pieces in tmp_train_data:
    head_fields = pieces[0].split(',')
    tail_fields = pieces[-1].split(',')
    text.append([head_fields[0], ''.join(head_fields[3:]), tail_fields[-1]])

In [5]:
# The first parsed row is the CSV header; it is used as provisional column
# names and then replaced by the fixed names below.
train_df = pd.DataFrame(text[1:], columns=text[0])
train_df.columns = ['微博id', '微博内容', '情感倾向']
train_df['文本长度'] = train_df['微博内容'].map(len)

# Maps the raw sentiment strings to class indices.
label_map = {'-1': 0, '0': 1, '1': 2}

# Keep only rows whose sentiment is one of the known labels. `.copy()` makes
# the result an independent frame, so columns added to it later are not
# chained assignments on a slice of the unfiltered frame.
train_df = train_df[train_df['情感倾向'].isin(list(label_map))].copy()

In [6]:
# Class index -> original sentiment value (0 -> -1, 1 -> 0, 2 -> 1).
label_map_reverse = dict(zip(range(3), (-1, 0, 1)))

In [7]:
# Encode the string sentiment column as class indices using label_map.
train_df['label'] = train_df['情感倾向'].map(label_map)

### 加载词向量

### 原始测试数据预处理

In [None]:
# Load the unlabelled test CSV as bytes, decode each line as GBK (dropping
# undecodable bytes) and strip '\r' / '\n' characters from both ends.
with codecs.open('./dataset/nCov_10k_test.csv','rb',errors='ignore') as f:
    test_data = [
        raw.decode('gbk', errors='ignore').strip('\r\n')
        for raw in f.readlines()
    ]

In [None]:
# Split every raw line on ',[' and keep only the lines that yield fewer than
# four pieces.
tmp_test_data = [
    pieces
    for pieces in (raw_line.split(',[') for raw_line in test_data)
    if len(pieces) < 4
]

# Build [id, content] rows from the first piece: the id is its first
# comma-separated field, the content is fields 3.. joined without separators.
test_text = []
for pieces in tmp_test_data:
    head_fields = pieces[0].split(',')
    test_text.append([head_fields[0], ''.join(head_fields[3:])])

test_df = pd.DataFrame(test_text, columns=['微博id', '微博内容'])

### 模型数据预处理

In [None]:
import json
import numpy as np
import keras
from bert4keras.tokenizer import Tokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import Lambda, Dense

In [None]:
# Number of output classes, maximum token length passed to the tokenizer,
# and mini-batch size used by the data generators.
num_classes, maxlen, batch_size = 3, 300, 8

# Locations of the pre-trained BERT config, checkpoint and vocabulary.
config_path, checkpoint_path, dict_path = (
    './chinese_wwm_ext_L-12_H-768_A-12/bert_config.json',
    './chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt',
    './chinese_wwm_ext_L-12_H-768_A-12/vocab.txt',
)

In [None]:
# Build the bert4keras tokenizer from the checkpoint's vocabulary file,
# with lower-casing enabled.
tokenizer = Tokenizer(dict_path,do_lower_case=True)

In [None]:
class data_generator(DataGenerator):
    """Data generator.

    Iterates over ``self.data`` (pairs of ``(text, label)``), tokenizes each
    text with the module-level ``tokenizer`` (truncated to ``maxlen``) and
    yields padded batches of ``[token_ids, segment_ids], labels``.
    """
    def __iter__(self, random=False):
        order = list(range(len(self.data)))
        if random:
            np.random.shuffle(order)

        tokens_buf, segments_buf, labels_buf = [], [], []
        for i in order:
            text, label = self.data[i]
            token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
            tokens_buf.append(token_ids)
            segments_buf.append(segment_ids)
            labels_buf.append([label])

            # Emit a batch once it is full, or when the final sample has
            # been consumed (which may leave a shorter last batch).
            if not (len(tokens_buf) == self.batch_size or i == order[-1]):
                continue

            tokens_buf = sequence_padding(tokens_buf)
            segments_buf = sequence_padding(segments_buf)
            labels_buf = sequence_padding(labels_buf)
            yield [tokens_buf, segments_buf], labels_buf
            tokens_buf, segments_buf, labels_buf = [], [], []

#### 模型测试数据准备

In [None]:
# Tokenize every test text (truncated to maxlen) and pad the resulting
# token-id / segment-id lists into rectangular arrays for model.predict.
test_token_ids, test_segment_ids = [], []
# Iterate the column's values directly instead of indexing by position, so
# the value array is materialised once rather than on every iteration.
for text in test_df['微博内容'].values:
    token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
    test_token_ids.append(token_ids)
    test_segment_ids.append(segment_ids)
test_token_ids = sequence_padding(test_token_ids)
test_segment_ids = sequence_padding(test_segment_ids)

### 加载模型

In [None]:
# Build BERT from the pre-trained config and checkpoint.
# return_keras_model=False returns the bert4keras wrapper object rather than
# the bare Keras model; the cells below use its `.model` and `.initializer`.
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
)

In [None]:
# Classification head: take position 0 along axis 1 of the BERT output
# (the layer is named 'CLS-token') and feed it to a softmax layer with
# one unit per class, initialised with BERT's own initializer.
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(units=num_classes,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

In [None]:
# End-to-end classifier: BERT inputs -> softmax class probabilities.
model = keras.models.Model(bert.model.input, output)

In [None]:
# Labels are integer class indices, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy'],
)

### 模型训练

#### 定义模型的评价函数

In [None]:
class evaluator(keras.callbacks.Callback):
    """Callback that prints the macro-averaged F1 score after every epoch.

    The score is computed over the module-level ``valid_generator`` using
    predictions from the module-level ``model``.
    """
    def __init__(self):
        # Delegate to the Keras base class so that whatever state it sets up
        # is still initialised now that this initializer actually runs.
        super(evaluator, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Predict every validation batch and print the macro F1 score.

        ``epoch`` and ``logs`` are supplied by Keras and are not used here.
        """
        _y_pred = []
        _y_true = []
        for x, label in valid_generator:
            y_pred = model.predict(x)
            # Predicted class = index of the highest probability per sample.
            _y_pred.extend(v.argmax() for v in y_pred)
            # Each label row holds its class index in the first position.
            _y_true.extend(v[0] for v in label)
        print(u'f1_score: %.5f\n' % (f1_score(_y_true,_y_pred,average='macro')))
_evaluator = evaluator()

#### 划分训练集与验证集

In [None]:
# Shuffle all row positions in place, then use the first 90% for training
# and the remaining 10% as the held-out split.
idx = [*range(len(train_df))]
np.random.shuffle(idx)
train_idxs, test_idxs = idx[:int(0.9 * len(idx))], idx[int(0.9 * len(idx)):]

In [None]:
# Materialise (text, label) pairs for each split by selecting the shuffled
# row positions from the two-column value array.
train_data, test_data = (
    train_df[['微博内容','label']].values[positions]
    for positions in (train_idxs, test_idxs)
)

In [None]:
# Batch generators over the training split and the held-out split.
# valid_generator is read by the evaluator callback at the end of each epoch.
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(test_data, batch_size)

In [None]:
# Fine-tune for two epochs, with one step per generator batch; the evaluator
# callback prints the macro F1 on the held-out split after each epoch.
model.fit_generator(train_generator.forfit(),
                    steps_per_epoch=len(train_generator),
                    epochs=2,
                    callbacks=[_evaluator])

### 数据预测

In [None]:
# Predict class probabilities for the padded test inputs, then store the
# index of the most probable class for every row on the test frame.
test_res = model.predict([test_token_ids, test_segment_ids], batch_size=64)
test_res_lable = list(test_res.argmax(axis=1))
test_df['label'] = test_res_lable

In [None]:
# Notebook display only: show (rows, columns) of the test frame.
test_df.shape

In [None]:
# Load the submission template and rename its two columns.
sample = pd.read_csv('./dataset/submit_example.csv')
sample.columns = ['微博id', 'y']
# Render each id as text followed by a single space.
# NOTE(review): presumably the ids parsed into test_df carry a trailing space
# and this makes the merge keys match — confirm against the raw test file.
sample['微博id'] = sample['微博id'].map('{} '.format)

In [None]:
# Left-join the predicted labels onto the submission template by id.
sample = pd.merge(sample, test_df[['微博id','label']], how='left', on='微博id')
# Ids without a prediction fall back to class index 1 (sentiment '0' in
# label_map); the column is then converted to Python ints.
sample['label'] = sample['label'].fillna(1).map(int)

In [None]:
# Convert class indices (0/1/2) back to the original sentiment values (-1/0/1).
sample['label'] = sample['label'].map(label_map_reverse)

In [None]:
# Remove leading/trailing whitespace from the ids (they are strings at this
# point) before the submission is written.
sample['微博id'] = sample['微博id'].str.strip()

In [None]:
# Write only the id and label columns, without the index, as the submission.
sample.to_csv('submit.csv', columns=['微博id','label'], index=False)