# SMP2018中文人机对话技术评测（ECDT）

1. 下面是一个针对 [SMP2018中文人机对话技术评测（ECDT）](http://smp2018.cips-smp.org/ecdt_index.html) 的完整实验，由该实验训练的基线模型能达到评测排行榜前三的水平。
2. 通过本实验，可以掌握处理自然语言文本数据的一般方法。
3. 推荐自己修改此文件，以达到更好的实验效果，比如调整以下几个超参数（下列取值仅为示例，后文训练时实际使用 batch_size = 20、epochs = 30）：

```python
# 词嵌入的维度
embedding_word_dims = 32
# 批次大小
batch_size = 30
# 周期
epochs = 20
```

# 本实验还可以改进的地方举例 

1. 预处理阶段使用其它的分词工具
2. 采用字符向量和词向量结合的方式
3. 使用预先训练好的词向量
4. 改变模型结构
5. 改变模型超参数

# 导入依赖库

In [1]:
import numpy as np
import pandas as pd
import collections
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical,plot_model
from keras.callbacks import TensorBoard, Callback

from sklearn.metrics import classification_report

import requests 

import time

import os

Using TensorFlow backend.


# 辅助函数

In [2]:
from keras import backend as K

# Metric function that computes the F1 score.
def f1(y_true, y_pred):
    """Return the batch-wise F1 score of ``y_pred`` against ``y_true``.

    Both tensors are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. Precision and
    recall are computed over the whole batch and combined into their
    harmonic mean; ``K.epsilon()`` is added to every denominator to avoid
    division by zero.
    """
    eps = K.epsilon()

    def _rounded_count(tensor):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    # Entries that are positive in both the labels and the predictions.
    hits = _rounded_count(y_true * y_pred)

    # Precision: share of predicted positives that are correct.
    precision_value = hits / (_rounded_count(y_pred) + eps)
    # Recall: share of actual positives that were predicted.
    recall_value = hits / (_rounded_count(y_true) + eps)

    harmonic = (precision_value * recall_value) / (precision_value + recall_value + eps)
    return 2 * harmonic

In [3]:
# Build a timestamp string in a custom, underscore-separated format.
def get_customization_time():
    """Return the current local time as ``year_month_day_hour_minute_second``.

    The six fields are joined with underscores and are not zero-padded,
    e.g. ``'2018_10_10_18_11_45'``.
    """
    now = time.localtime()
    # The first six struct_time entries are year, month, day, hour, minute, second.
    return "_".join(str(field) for field in now[:6])

# 准备数据

## [下载SMP2018官方数据](https://worksheets.codalab.org/worksheets/0x27203f932f8341b79841d50ce0fd684f/)

In [4]:
raw_train_data_url = "https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/"
raw_test_data_url = "https://worksheets.codalab.org/rest/bundles/0x1f96bc12222641209ad057e762910252/contents/blob/"

# Download the SMP2018 data only when either local file is missing.
if (not os.path.exists('./data/train.json')) or (not os.path.exists('./data/dev.json')):
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    raw_train = requests.get(raw_train_data_url, timeout=60)
    raw_test = requests.get(raw_test_data_url, timeout=60)
    # Fail loudly on HTTP errors before touching the disk: otherwise an error
    # page would be saved as JSON, and the existence check above would then
    # skip the download on every later run.
    raw_train.raise_for_status()
    raw_test.raise_for_status()
    os.makedirs('./data', exist_ok=True)
    with open("./data/train.json", "wb") as code:
        code.write(raw_train.content)
    with open("./data/dev.json", "wb") as code:
        code.write(raw_test.content)

In [5]:
def get_json_data(path):
    """Load a JSON data file into a DataFrame with 'query' and 'label' columns.

    Args:
        path: Location of the JSON file, passed straight to ``pd.read_json``.

    Returns:
        The transposed frame restricted to the columns ``query`` and
        ``label``, in that order.
    """
    # Swap rows and columns so that 'query' and 'label' become columns.
    samples = pd.read_json(path).T
    # Keep only the two fields, with the query text first.
    return samples[['query', 'label']]

In [6]:
train_data_df = get_json_data(path="data/train.json")

test_data_df = get_json_data(path="data/dev.json")

In [7]:
train_data_df.head()

Unnamed: 0,query,label
0,今天东莞天气如何,weather
1,从观音桥到重庆市图书馆怎么走,map
2,鸭蛋怎么腌？,cookbook
3,怎么治疗牛皮癣,health
4,唠什么,chat


---

## [结巴分词](https://github.com/fxsjy/jieba)示例，下面将使用结巴分词对原数据进行处理

In [8]:
seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(list(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.022 seconds.
Prefix dict has been built succesfully.


['他', '来到', '了', '网易', '杭研', '大厦']


---

# 序列化

In [9]:
def use_jieba_cut(a_sentence):
    """Segment a sentence with jieba and return its tokens as a list."""
    return jieba.lcut(a_sentence)

train_data_df['cut_query'] = train_data_df['query'].apply(use_jieba_cut)
test_data_df['cut_query'] = test_data_df['query'].apply(use_jieba_cut)

In [10]:
train_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,今天东莞天气如何,weather,"[今天, 东莞, 天气, 如何]"
1,从观音桥到重庆市图书馆怎么走,map,"[从, 观音桥, 到, 重庆市, 图书馆, 怎么, 走]"
2,鸭蛋怎么腌？,cookbook,"[鸭蛋, 怎么, 腌, ？]"
3,怎么治疗牛皮癣,health,"[怎么, 治疗, 牛皮癣]"
4,唠什么,chat,"[唠, 什么]"
5,阳澄湖大闸蟹的做法。,cookbook,"[阳澄湖, 大闸蟹, 的, 做法, 。]"
6,昆山大润发在哪里,map,"[昆山, 大润发, 在, 哪里]"
7,红烧肉怎么做？嗯？,cookbook,"[红烧肉, 怎么, 做, ？, 嗯, ？]"
8,南京到厦门的火车票,train,"[南京, 到, 厦门, 的, 火车票]"
9,6的平方,calc,"[6, 的, 平方]"


## 处理特征

In [11]:
tokenizer = Tokenizer()

In [12]:
tokenizer.fit_on_texts(train_data_df['cut_query'])

In [13]:
max_features = len(tokenizer.index_word)

len(tokenizer.index_word)

2883

In [14]:
x_train = tokenizer.texts_to_sequences(train_data_df['cut_query'])

x_test = tokenizer.texts_to_sequences(test_data_df['cut_query'])

In [15]:
# Fixed sequence length, passed to pad_sequences as its maxlen argument.
# NOTE(review): 26 is hard-coded — presumably the longest tokenized training
# query; confirm against the data before reusing on another corpus.
max_cut_query_lenth = 26

In [16]:
x_train = pad_sequences(x_train, max_cut_query_lenth)

x_test = pad_sequences(x_test, max_cut_query_lenth)

In [17]:
x_train.shape

(2299, 26)

In [18]:
x_test.shape

(770, 26)

## 处理标签

In [19]:
label_tokenizer = Tokenizer()

In [20]:
label_tokenizer.fit_on_texts(train_data_df['label'])

In [21]:
label_numbers = len(label_tokenizer.word_counts)

In [22]:
NUM_CLASSES = len(label_tokenizer.word_counts)

In [23]:
label_tokenizer.word_counts

OrderedDict([('weather', 66),
             ('map', 68),
             ('cookbook', 269),
             ('health', 55),
             ('chat', 455),
             ('train', 70),
             ('calc', 24),
             ('translation', 61),
             ('music', 66),
             ('tvchannel', 71),
             ('poetry', 102),
             ('telephone', 63),
             ('stock', 71),
             ('radio', 24),
             ('contacts', 30),
             ('lottery', 24),
             ('website', 54),
             ('video', 182),
             ('news', 58),
             ('bus', 24),
             ('app', 53),
             ('flight', 62),
             ('epg', 107),
             ('message', 63),
             ('match', 24),
             ('schedule', 29),
             ('novel', 24),
             ('riddle', 34),
             ('email', 24),
             ('datetime', 18),
             ('cinemas', 24)])

In [24]:
y_train = label_tokenizer.texts_to_sequences(train_data_df['label'])

In [25]:
y_train[:10]

[[10], [9], [2], [17], [1], [2], [9], [2], [8], [23]]

In [26]:
# Shift the label ids from the tokenizer's 1-based indices to 0-based ones so
# they fall in the 0..label_numbers-1 range that to_categorical expects.
y_train = [[y[0]-1] for y in y_train]

In [27]:
y_train[:10]

[[9], [8], [1], [16], [0], [1], [8], [1], [7], [22]]

In [28]:
y_train = to_categorical(y_train, label_numbers)
y_train.shape

(2299, 31)

In [29]:
# Encode the test labels with the tokenizer that was fitted on the training labels.
y_test = label_tokenizer.texts_to_sequences(test_data_df['label'])
# Shift to 0-based class ids.
# NOTE(review): y[0] assumes every test label produced an id; presumably a
# label unseen in training would yield an empty list here — confirm.
y_test = [y[0]-1 for y in y_test]
# One-hot encode into label_numbers classes.
y_test = to_categorical(y_test, label_numbers)
y_test.shape

(770, 31)

In [30]:
y_test[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

# 设计模型

In [45]:
def create_SMP2018_lstm_model(max_features, max_cut_query_lenth, label_numbers,
                              embedding_word_dims=32, lstm_units=64):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    Args:
        max_features: Vocabulary size; the embedding table gets
            ``max_features + 1`` rows.
        max_cut_query_lenth: Length of every padded input sequence.
        label_numbers: Number of output classes (width of the softmax layer).
        embedding_word_dims: Dimension of each word embedding vector.
            Defaults to 32, the value previously hard-coded.
        lstm_units: Number of LSTM units. Defaults to 64, the value
            previously hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model, using categorical
        cross-entropy loss, the Adam optimizer and the ``f1`` metric.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_features+1, output_dim=embedding_word_dims,
                        input_length=max_cut_query_lenth))
    model.add(LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(label_numbers, activation='softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[f1])

    # The diagram is only a convenience artifact: a missing pydot/graphviz
    # installation must not prevent the model from being created.
    try:
        plot_model(model, to_file='SMP2018_lstm_model.png', show_shapes=True)
    except (ImportError, OSError) as err:
        print('skip plotting model structure:\t{}'.format(err))

    return model

# 训练模型

In [46]:
# Fallback values so the training cells can still run when the preprocessing
# cells that normally define these variables were not executed.
if 'max_features' not in globals():
    # NOTE(review): the vocabulary size printed earlier is 2883, not 2888;
    # presumably harmless, since it only sizes the embedding table — confirm.
    max_features = 2888
    print('not find max_features variable, use default max_features values:\t{}'.format(max_features))
if 'max_cut_query_lenth' not in globals():
    max_cut_query_lenth = 26
    # The message names the variable that is actually missing (it used to say max_features).
    print('not find max_cut_query_lenth, use default max_cut_query_lenth values:\t{}'.format(max_cut_query_lenth))
if 'label_numbers' not in globals():
    label_numbers = 31
    # The message names the variable that is actually missing (it used to say max_features).
    print('not find label_numbers, use default label_numbers values:\t{}'.format(label_numbers))

In [47]:
model = create_SMP2018_lstm_model(max_features, max_cut_query_lenth, label_numbers)

In [48]:
batch_size = 20
epochs = 30

In [49]:
print(x_train.shape, y_train.shape)

(2299, 26) (2299, 31)


In [50]:
print(x_test.shape, y_test.shape)

(770, 26) (770, 31)


In [51]:
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs)

Train...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f84e87c5f28>

# 评估模型

In [52]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test score:', score[0])
print('Test f1:', score[1])

Test score: 0.6803552009068526
Test f1: 0.8464262740952628


In [53]:
y_hat_test = model.predict(x_test)

In [55]:
print(y_hat_test.shape)

(770, 31)


## 将预测概率和 one-hot 标签转换成对应的类别整数

In [54]:
y_pred = np.argmax(y_hat_test, axis=1).tolist()

In [55]:
y_true = np.argmax(y_test, axis=1).tolist()

## 查看多分类的精确率、召回率、F1 值

In [56]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       154
           1       0.92      0.97      0.95        89
           2       0.67      0.62      0.64        60
           3       0.83      0.83      0.83        36
           4       0.79      1.00      0.88        34
           5       0.83      0.65      0.73        23
           6       1.00      0.83      0.91        24
           7       1.00      1.00      1.00        24
           8       0.68      0.65      0.67        23
           9       0.90      0.86      0.88        22
          10       0.85      0.50      0.63        22
          11       0.88      1.00      0.93        21
          12       1.00      0.90      0.95        21
          13       0.91      0.95      0.93        21
          14       1.00      0.95      0.98        21
          15       0.79      0.95      0.86        20
          16       0.90      0.47      0.62        19
          17       0.79    