# $$SMP2018中文人机对话技术评测（ECDT）$$

# **不要直接修改此文件，可把该文件拷贝至自己的文件夹下再进行操作**

1. 下面是一个完整的针对 [SMP2018中文人机对话技术评测（ECDT）](http://smp2018.cips-smp.org/ecdt_index.html) 的实验，由该实验训练的基线模型能达到评测排行榜的前三的水平。
2. 通过本实验，可以掌握处理自然语言文本数据的一般方法。
3. 推荐自己修改此文件，达到更好的实验效果，比如改变以下几个超参数 

```python
# 词嵌入的维度
embedding_word_dims = 32
# 批次大小
batch_size = 30
# 周期
epochs = 20
```

# 本实验还可以改进的地方举例 

1. 预处理阶段使用其它的分词工具
2. 采用字符向量和词向量结合的方式
3. 使用预先训练好的词向量
4. 改变模型结构
5. 改变模型超参数

# 导入依赖库

In [1]:
import numpy as np
import pandas as pd
import collections
import jieba
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical,plot_model
from keras.callbacks import TensorBoard, Callback

from sklearn.metrics import classification_report

import requests 

import time

import os

Using TensorFlow backend.


# 辅助函数

In [2]:
from keras import backend as K

# 计算 F1 值的函数
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [3]:
# 获取自定义时间格式的字符串
def get_customization_time():
    # return '2018_10_10_18_11_45' 年月日时分秒
    time_tuple = time.localtime(time.time())
    customization_time = "{}_{}_{}_{}_{}_{}".format(time_tuple[0], time_tuple[1], time_tuple[2], time_tuple[3], time_tuple[4], time_tuple[5])
    return customization_time

# 准备数据

## [下载SMP2018官方数据](https://worksheets.codalab.org/worksheets/0x27203f932f8341b79841d50ce0fd684f/)

In [3]:
raw_train_data_url = "https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/"
raw_test_data_url = "https://worksheets.codalab.org/rest/bundles/0x1f96bc12222641209ad057e762910252/contents/blob/"

# 如果不存在 SMP2018 数据，则下载
if (not os.path.exists('./data/train.json')) or (not os.path.exists('./data/dev.json')):
    raw_train = requests.get(raw_train_data_url) 
    raw_test = requests.get(raw_test_data_url) 
    if not os.path.exists('./data'):
        os.makedirs('./data')
    with open("./data/train.json", "wb") as code:
         code.write(raw_train.content)
    with open("./data/dev.json", "wb") as code:
         code.write(raw_test.content)

In [4]:
def get_json_data(path):
    # read data
    data_df = pd.read_json(path)
    # change row and colunm
    data_df = data_df.transpose()
    # change colunm order
    data_df = data_df[['query', 'label']]
    return data_df

In [5]:
train_data_df = get_json_data(path="data/train.json")

test_data_df = get_json_data(path="data/dev.json")

In [6]:
train_data_df.head()

Unnamed: 0,query,label
0,今天东莞天气如何,weather
1,从观音桥到重庆市图书馆怎么走,map
2,鸭蛋怎么腌？,cookbook
3,怎么治疗牛皮癣,health
4,唠什么,chat


In [7]:
test_data_df.head()

Unnamed: 0,query,label
0,毛泽东的诗哦。,poetry
1,有房有车吗微笑,chat
2,2013年亚洲冠军联赛恒广州恒大比赛时间。,match
3,若相惜不弃下一句是什么？,poetry
4,苹果翻译成英语,translation


In [8]:
train_data_df.describe()

Unnamed: 0,query,label
count,2299,2299
unique,2299,31
top,中国新闻网网站,chat
freq,1,455


In [9]:
test_data_df.describe()

Unnamed: 0,query,label
count,770,770
unique,770,31
top,查下安徽电视台今天节目单,chat
freq,1,154


In [10]:
# 获取所以标签，也就是分类的类别
labels = list(set(train_data_df['label'].tolist()))

In [11]:
label_numbers = len(labels)
print('label_numbers:\t', label_numbers)

label_numbers:	 31


## 标签和对应ID的映射字典

In [12]:
label_2_index_dict = dict([(label, index) for index, label in enumerate(labels)])
index_2_label_dict = dict([(index, label) for index, label in enumerate(labels)])

---

## [结巴分词](https://github.com/fxsjy/jieba)示例，下面将使用结巴分词对原数据进行处理

In [13]:
seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(list(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.903 seconds.
Prefix dict has been built succesfully.


['他', '来到', '了', '网易', '杭研', '大厦']


---

# 序列化

In [14]:
def use_jieba_cut(a_sentence):
    return list(jieba.cut(a_sentence))

train_data_df['cut_query'] = train_data_df['query'].apply(use_jieba_cut)
test_data_df['cut_query'] = test_data_df['query'].apply(use_jieba_cut)

In [15]:
train_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,今天东莞天气如何,weather,"[今天, 东莞, 天气, 如何]"
1,从观音桥到重庆市图书馆怎么走,map,"[从, 观音桥, 到, 重庆市, 图书馆, 怎么, 走]"
2,鸭蛋怎么腌？,cookbook,"[鸭蛋, 怎么, 腌, ？]"
3,怎么治疗牛皮癣,health,"[怎么, 治疗, 牛皮癣]"
4,唠什么,chat,"[唠, 什么]"
5,阳澄湖大闸蟹的做法。,cookbook,"[阳澄湖, 大闸蟹, 的, 做法, 。]"
6,昆山大润发在哪里,map,"[昆山, 大润发, 在, 哪里]"
7,红烧肉怎么做？嗯？,cookbook,"[红烧肉, 怎么, 做, ？, 嗯, ？]"
8,南京到厦门的火车票,train,"[南京, 到, 厦门, 的, 火车票]"
9,6的平方,calc,"[6, 的, 平方]"


In [16]:
# 获取数据的所有词汇
def get_all_vocab_from_data(data, colunm_name):
    train_vocab_list = []
    max_cut_query_lenth = 0
    for cut_query in data[colunm_name]:
        if len(cut_query) > max_cut_query_lenth:
            max_cut_query_lenth = len(cut_query)
        train_vocab_list += cut_query
    return train_vocab_list, max_cut_query_lenth   

In [17]:
train_vocab_list, max_cut_query_lenth = get_all_vocab_from_data(train_data_df, 'cut_query')

In [18]:
print('Number of words：\t', len(train_vocab_list))

Number of words：	 11498


In [19]:
print('max_cut_query_lenth:\t', max_cut_query_lenth)

max_cut_query_lenth:	 26


In [20]:
test_vocab_list, test_max_cut_query_lenth = get_all_vocab_from_data(train_data_df, 'cut_query')

In [21]:
print('test_max_cut_query_lenth:\t', test_max_cut_query_lenth)

test_max_cut_query_lenth:	 26


In [22]:
train_vocab_list[:10]

['今天', '东莞', '天气', '如何', '从', '观音桥', '到', '重庆市', '图书馆', '怎么']

In [23]:
train_vocab_counter = collections.Counter(train_vocab_list)

In [24]:
print('Number of different words:\t', len(train_vocab_counter.keys()))

Number of different words:	 2887


## 不同种类的词汇个数，预留一个位置给不存在的词汇（不存在的词汇标记为0）  

In [26]:
max_features = len(train_vocab_counter.keys()) + 1

In [27]:
print(max_features)

2888


In [28]:
# 10 words with the highest frequency
train_vocab_counter.most_common(10)

[('的', 605),
 ('。', 341),
 ('我', 320),
 ('你', 297),
 ('怎么', 273),
 ('？', 251),
 ('什么', 210),
 ('到', 165),
 ('给', 154),
 ('做', 148)]

## 统计低频词语

In [29]:
word_times_zero = 0
for word, word_times in train_vocab_counter.items():
    if word_times <=1:
        word_times_zero+=1
print('word_times_zero:\t', word_times_zero)
print('word_times_zero/all:\t', word_times_zero/len(train_vocab_counter))

word_times_zero:	 1978
word_times_zero/all:	 0.685140284031867


## 制作词汇字典

In [30]:
def create_train_vocab_dict(train_vocab_counter):
    word_2_index, index_2_word = {}, {}
    # Reserve 0 for masking via pad_sequences
    index_number = 1
    for word, word_times in train_vocab_counter.most_common():
        word_2_index[word] = index_number
        index_2_word[index_number] = word
        index_number += 1
    return word_2_index, index_2_word 

In [31]:
word_2_index_dict, index_2_word_dict = create_train_vocab_dict(train_vocab_counter)

In [32]:
print(word_2_index_dict['的'], word_2_index_dict['。'])

1 2


In [33]:
print(index_2_word_dict[1], index_2_word_dict[2])

的 。


In [34]:
pq= 0
for index, row in train_data_df.iterrows():
    print(row[0], row[1], row[2])
    pq+=1
    if pq==10:
        break

今天东莞天气如何 weather ['今天', '东莞', '天气', '如何']
从观音桥到重庆市图书馆怎么走 map ['从', '观音桥', '到', '重庆市', '图书馆', '怎么', '走']
鸭蛋怎么腌？ cookbook ['鸭蛋', '怎么', '腌', '？']
怎么治疗牛皮癣 health ['怎么', '治疗', '牛皮癣']
唠什么 chat ['唠', '什么']
阳澄湖大闸蟹的做法。 cookbook ['阳澄湖', '大闸蟹', '的', '做法', '。']
昆山大润发在哪里 map ['昆山', '大润发', '在', '哪里']
红烧肉怎么做？嗯？ cookbook ['红烧肉', '怎么', '做', '？', '嗯', '？']
南京到厦门的火车票 train ['南京', '到', '厦门', '的', '火车票']
6的平方 calc ['6', '的', '平方']


In [35]:
word_2_index_dict.get('的2', 0)

0

In [36]:
def vectorize_data(data, label_2_index_dict, word_2_index_dict, max_cut_query_lenth):
    x_train = []
    y_train = []
    for index, row in data.iterrows():
        query_sentence = row[2]
        label = row[1]
        # 字典找不到的情况下用 0 填充
        x = [word_2_index_dict.get(w, 0) for w in query_sentence]
        y = [label_2_index_dict[label]]
        x_train.append(x)
        y_train.append(y)
    return (pad_sequences(x_train, maxlen=max_cut_query_lenth),
            pad_sequences(y_train, maxlen=1))

In [37]:
x_train, y_train = vectorize_data(train_data_df, label_2_index_dict, word_2_index_dict, max_cut_query_lenth)

In [38]:
x_test, y_test = vectorize_data(test_data_df, label_2_index_dict, word_2_index_dict, test_max_cut_query_lenth)

In [39]:
print(x_train[0], y_train[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0  33 318  27  90] [7]


In [40]:
y_train = to_categorical(y_train, label_numbers)
y_test = to_categorical(y_test, label_numbers)

In [41]:
print(x_train.shape, y_train.shape)

(2299, 26) (2299, 31)


In [42]:
print(x_test.shape, y_test.shape)

(770, 26) (770, 31)


# 存储预处理过的数据

In [43]:
print(type(x_test))

<class 'numpy.ndarray'>


In [44]:
np.savez("preprocessed_data", x_train, y_train, x_test, y_test)

## 直接加载预处理的数据

In [4]:
# 使用已经经过预处理的数据，默认不使用
use_preprocessed_data = True

if use_preprocessed_data == True:
    preprocessed_data = np.load('preprocessed_data.npz')
    x_train, y_train, x_test, y_test = preprocessed_data['arr_0'], preprocessed_data['arr_1'], preprocessed_data['arr_2'], preprocessed_data['arr_3'],

In [5]:
print(x_train.shape, y_train.shape)

(2299, 26) (2299, 31)


# 设计模型

In [6]:
def create_SMP2018_lstm_model(max_features, max_cut_query_lenth, label_numbers):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=32, input_length=max_cut_query_lenth))
    model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(label_numbers, activation='softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[f1])

    plot_model(model, to_file='SMP2018_lstm_model.png', show_shapes=True)
    
    return model

# 训练模型

In [7]:
if 'max_features'  not in  dir():
    max_features = 2888
    print('not find max_features variable, use default max_features values:\t{}'.format(max_features))
if 'max_cut_query_lenth'  not in  dir():
    max_cut_query_lenth = 26
    print('not find max_cut_query_lenth, use default max_features values:\t{}'.format(max_cut_query_lenth))
if 'label_numbers'  not in  dir():
    label_numbers = 31
    print('not find label_numbers, use default max_features values:\t{}'.format(label_numbers))

not find max_features variable, use default max_features values:	2888
not find max_cut_query_lenth, use default max_features values:	26
not find label_numbers, use default max_features values:	31


In [8]:
model = create_SMP2018_lstm_model(max_features, max_cut_query_lenth, label_numbers)

In [9]:
batch_size = 20
epochs = 300

In [None]:
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[TensorBoard(log_dir='../logs/{}'.format("SMP2018_lstm_{}".format(get_customization_time())))],
          validation_split=0.2
          )

Train...
Train on 1839 samples, validate on 460 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoc

# 评估模型

In [53]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test score:', score[0])
print('Test f1:', score[1])

Test score: 0.7415416103291821
Test f1: 0.8223602949798882


In [54]:
y_hat_test = model.predict(x_test)

In [55]:
print(y_hat_test.shape)

(770, 31)


## 将 one-hot 张量转换成对应的整数

In [56]:
y_pred = np.argmax(y_hat_test, axis=1).tolist()

In [57]:
y_true = np.argmax(y_test, axis=1).tolist()

## 查看多分类的 准确率、召回率、F1 值

In [58]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
           1       0.86      0.75      0.80         8
           2       1.00      0.95      0.98        21
           3       0.52      0.57      0.54        23
           4       0.91      0.91      0.91        11
           5       0.82      0.97      0.89        34
           6       0.25      0.17      0.20         6
           7       0.86      0.86      0.86        22
           8       1.00      0.88      0.93         8
           9       0.89      1.00      0.94         8
          10       0.95      0.95      0.95        21
          11       1.00      0.62      0.77         8
          12       0.62      0.70      0.66        60
          13       0.86      0.90      0.88        20
          14       0.55      0.58      0.56        19
          15       0.76      0.78      0.77        36
          16       0.87      0.90      0.89       154
          17       0.57    