# 语料加载

In [1]:
import jieba
import random
import pandas as pd

# Stopword list: one word per line, read as a single "stopword" column.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are treated as plain text
    index_col=False,
    sep="\t",
    names=["stopword"],
    encoding="utf-8",
)["stopword"].values

# One CSV per category; rows containing any missing value are discarded.
laogong_df = pd.read_csv("data/beilaogongda.csv", encoding="utf-8").dropna()
laopo_df = pd.read_csv("data/beilaopoda.csv", encoding="utf-8").dropna()
erzi_df = pd.read_csv("data/beierzida.csv", encoding="utf-8").dropna()
nver_df = pd.read_csv("data/beinverda.csv", encoding="utf-8").dropna()

# Raw text of each category as a plain Python list, taken from the "segment" column.
laogong = laogong_df["segment"].values.tolist()
laopo = laopo_df["segment"].values.tolist()
erzi = erzi_df["segment"].values.tolist()
nver = nver_df["segment"].values.tolist()

# 分词和去停用词

In [2]:
def preprocess_text(content_lines, sentences, category):
    """Tokenize lines with jieba, filter the tokens, and collect labelled sentences.

    A token is dropped when it is whitespace-only, consists solely of digits,
    is at most one character long, or appears in the module-level
    ``stopwords`` collection. The surviving tokens of a line are joined with
    single spaces and appended to ``sentences`` as ``(text, category)``.

    Args:
        content_lines: Iterable of raw text lines to segment.
        sentences: List extended in place with ``(text, category)`` tuples.
        category: Label stored with every sentence built from ``content_lines``.

    Returns:
        None. Results are accumulated in ``sentences``.
    """
    # Build the lookup once per call: testing membership against the raw
    # stopword array rescans every stopword for every single token.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            segs = [
                seg
                for seg in jieba.lcut(line)
                if seg.strip()  # drop whitespace-only tokens
                and not str(seg).isdigit()  # drop pure-digit tokens
                and len(seg) > 1  # drop single-character tokens
                and seg not in stopword_set  # drop stopwords
            ]
            sentences.append((" ".join(segs), category))
        except Exception as e:
            # Best effort: report the offending line and move on to the next one.
            print(line, str(e))
            continue

In [3]:
sentences = []
# The position of each corpus in this list is its category id (0, 1, 2, 3).
for _label, _corpus in enumerate([laogong, laopo, erzi, nver]):
    preprocess_text(_corpus, sentences, _label)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.869 seconds.
Prefix dict has been built succesfully.


In [4]:
# Shuffle with a fixed seed: the train/validation/test partition is later cut
# by position from this ordering, so an unseeded shuffle would produce a
# different split (and different metrics) on every fresh run.
random.Random(42).shuffle(sentences)
sentences[:5]

[('母亲 女儿 凳子 烟缸 人伤 在场 民警 到场 通知 民警 到场 确认 不需 通知', 3),
 ('老公 无需 救护 民警 到场', 0),
 ('老公 无需 救护 民警 到场 民警 携带 防护 装备', 0),
 ('女儿 报警 女儿 接听 老人 老年痴呆 民警 到场', 3),
 ('老公 皮带 人伤 无需 民警 到场', 0)]

In [5]:
# Split the (text, category) pairs into two parallel lists.
all_texts = [text for text, _ in sentences]
all_labels = [label for _, label in sentences]

# 使用 LSTM 对数据进行分类

In [11]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Embedding, GRU
from keras.models import Sequential

import numpy as np

## 预定义变量

In [8]:
# Length (in token ids) that every encoded text is padded or truncated to.
MAX_SEQUENCE_LENGTH = 100
# Dimensionality of each learned word-embedding vector.
EMBEDDING_DIM = 200
# Fraction of all samples used as the validation set during training.
VALIDATION_SPLIT = 0.16
# Fraction of all samples held out as the final test set.
TEST_SPLIT = 0.2

## keras的sequence模块文本序列填充

In [9]:
# Learn the vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
# Mapping from each word to its integer id.
word_index = tokenizer.word_index
# Every text encoded as a list of word ids.
sequences = tokenizer.texts_to_sequences(all_texts)

In [10]:
# Number of distinct words the tokenizer indexed.
len(word_index)

480

In [13]:
# One-hot encode the integer category ids.
labels = to_categorical(np.asarray(all_labels))
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

data.shape

(1674, 100)

In [14]:
# (number of samples, number of one-hot label columns)
labels.shape

(1674, 4)

## 数据切分

In [15]:
# Row indices where the training part ends (p1) and the validation part ends (p2).
p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(len(data) * (1 - TEST_SPLIT))

# Cut both arrays at the same two row indices: [0, p1) train, [p1, p2) validation, [p2, end) test.
x_train, x_val, x_test = np.split(data, [p1, p2])
y_train, y_val, y_test = np.split(labels, [p1, p2])

## LSTM训练模型

In [16]:
# Embedding -> LSTM -> dropout -> dense ReLU -> softmax over the label columns.
model = Sequential(
    [
        # +1 on the vocabulary size: the tokenizer's word ids start at 1, id 0 is the padding value.
        Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        LSTM(200, dropout=0.2, recurrent_dropout=0.2),
        Dropout(0.2),
        Dense(64, activation="relu"),
        Dense(labels.shape[1], activation="softmax"),
    ]
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          96200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                12864     
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
Total params: 430,124
Trainable params: 430,124
Non-trainable params: 0
_________________________________________________________________


## 模型编译

In [17]:
# Multi-class setup: categorical cross-entropy loss, RMSprop optimizer, accuracy metric.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["acc"],
)
model.metrics_names

['loss', 'acc']

In [19]:
# Train for 10 epochs, reporting validation metrics after each one.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)
# Persist the trained LSTM model to disk.
model.save("lstm.h5")

Train on 1071 samples, validate on 268 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Score the trained model on the held-out test split; with the compiled
# metrics the result is [loss, acc].
model.evaluate(x_test, y_test)



[0.062186394020247816, 0.9880597016704616]

# 使用GRU进行文本分类

In [22]:
# Embedding -> GRU -> dropout -> dense ReLU -> softmax over the label columns.
model = Sequential(
    [
        # +1 on the vocabulary size: the tokenizer's word ids start at 1, id 0 is the padding value.
        Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        GRU(200, dropout=0.2, recurrent_dropout=0.2),
        Dropout(0.2),
        Dense(64, activation="relu"),
        Dense(labels.shape[1], activation="softmax"),
    ]
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 200)          96200     
_________________________________________________________________
gru_1 (GRU)                  (None, 200)               240600    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                12864     
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
Total params: 349,924
Trainable params: 349,924
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Multi-class setup: categorical cross-entropy loss, RMSprop optimizer, accuracy metric.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["acc"],
)
model.metrics_names

['loss', 'acc']

In [24]:
# Train for 10 epochs, reporting validation metrics after each one.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)
# Persist the trained GRU model to disk.
model.save("gru.h5")

# Score on the held-out test split; the result is [loss, acc].
model.evaluate(x_test, y_test)

Train on 1071 samples, validate on 268 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.016858730530505305, 0.9940298507462687]

In [None]:
# Score the current model on the held-out test split. The previous argument,
# x_rt, is not defined anywhere in the notebook and no labels were passed.
model.evaluate(x_test, y_test)