In [275]:
import os
import random
import json
from collections import Counter
from math import sqrt
import gensim
import pandas as pd
import numpy as np
from keras.layers.merge import concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D,MaxPool1D,Flatten,Dropout,Dense,Input
from keras.models import Model
from sklearn import metrics
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import multiprocessing
import jieba


In [276]:
'''
Parameter settings used by the later cells.
'''
# Dataset paths
dataSource = "data/goods_zh.txt"
stopWordSource = "data/stopword.txt"

# After tokenization, keep only words whose frequency is >= this minimum
miniFreq=1

# Fixed length of every input sequence: longer reviews are truncated, shorter ones are padded with 0.
# The original note says this value was taken from the mean of all sequence lengths — TODO confirm.
sequenceLength = 200  
# NOTE(review): the model-building and training cells visible in this notebook pass their own
# literals (256 filters, kernel sizes 3/4/5, dropout 0.2, batch size 800, 10 epochs) instead of
# batchSize, epochs, numFilters, filterSizes, dropoutKeepProb and l2RegLambda — confirm whether
# these constants are still meant to drive the model.
batchSize=64
epochs=10

numClasses = 2
# Fraction of the data used for training
rate = 0.8  

# Dimension of the generated word embeddings
embeddingSize = 150

# Number of convolution filters
numFilters = 30

# Convolution kernel sizes
filterSizes = [2,3,4,5]
dropoutKeepProb = 0.5

# L2 regularization coefficient
l2RegLambda = 0.1

In [277]:
'''
Load the corpus and tokenize every review with jieba.
'''
# Each non-blank line is split on ',,': field 0 is the review text, field 1 is the label.
sentences=[]   # tokenized reviews: one list of tokens per review
labels=[]      # labels exactly as read from the file (strings), aligned with `sentences`
# `with` guarantees the handle is closed even if a line fails to parse;
# the path comes from the config cell instead of a second hard-coded copy.
with open(dataSource,encoding='utf-8') as file:
    for line in file:
        line=line.replace('\n','')
        if not line.strip():
            # A blank line has no label field; indexing temp[1] would raise IndexError.
            continue
        temp=line.split(',,')
        sentences.append(jieba.lcut(temp[0]))
        labels.append(temp[1])
# `reviews` is an alias of `sentences` (same list object), used by the later cells.
reviews=sentences
# Display the number of loaded reviews.
len(sentences)

28766

In [278]:
# Sanity check: the number of labels should equal the number of tokenized reviews shown above.
len(labels)

28766

In [279]:
# Train the word2vec model once and cache it on disk; later runs reload the cached file.
# NOTE(review): an existing cache file is reused as-is even if embeddingSize or miniFreq
# were changed since it was written — delete the file to force retraining.
if not os.path.exists("model/word2VecModel"):
    print("重新分成模型")
    # Create the target directory up front so a missing "model/" folder is caught
    # before the (slow) training rather than when saving afterwards.
    os.makedirs("model", exist_ok=True)
    # Skip-gram (sg=1) with a context window of 2, trained for 20 epochs on the tokenized reviews.
    model = word2vec.Word2Vec(sentences,vector_size=embeddingSize,
                         min_count=miniFreq,
                         window=2,
                         workers=multiprocessing.cpu_count(),sg=1,
                         epochs=20)
    model.save('model/word2VecModel')
else:
    print("直接调用模型")
    model = gensim.models.Word2Vec.load('model/word2VecModel')


直接调用模型


In [280]:
# Module-level placeholder for the stop-word lookup table
stopWordDict = {}

def readStopWord(stopWordPath):
    """
    Load stop words from a UTF-8 text file that lists one word per line.

    Returns a dict mapping each stop word to its line position in the file
    (for a repeated word the last position wins). A dict is used so that
    later membership tests are fast.
    """
    with open(stopWordPath, "r", encoding='utf-8') as source:
        entries = source.read().splitlines()
    return {entry: position for position, entry in enumerate(entries)}

In [281]:
# Load the stop-word table from the path defined in the config cell
# (previously a second hard-coded copy of the same path).
stopWordDict=readStopWord(stopWordPath=stopWordSource)
# Display the type of the loaded table.
type(stopWordDict)

dict

In [282]:
# Placeholders for the notebook-level state; each name is reassigned by a later cell.

# Training / evaluation splits (independent empty lists)
trainReviews, trainLabels = [], []
evalReviews, evalLabels = [], []

# Embedding matrix and vocabulary size
wordEmbedding = None
n_symbols = 0

# Vocabulary lookups in both directions (independent empty dicts)
wordToIndex, indexToWord = {}, {}


In [283]:

def getWordEmbedding(words, modelPath='model/word2VecModel'):
    """
    Look up the pretrained word2vec vector for every word of the dataset vocabulary.

    Parameters:
        words: iterable of vocabulary words, in the order their rows should
            appear after the two reserved entries.
        modelPath: path of the saved gensim Word2Vec model to load
            (defaults to the file written by the training cell).

    Returns:
        (vocab, wordEmbedding): `vocab` is a list starting with "pad_b"
        (padding, all-zero vector) and "UNK" (unseen words, random vector),
        followed by the kept words; `wordEmbedding` is a NumPy array whose
        i-th row is the vector of `vocab[i]`.

    Words for which the model has no vector, and words that collide with the
    reserved "pad_b"/"UNK" tokens, are left out of the vocabulary instead of
    raising or producing duplicate rows.
    """
    model = gensim.models.Word2Vec.load(modelPath)

    # Reserved entries: index 0 is the padding token, index 1 the unknown-word token.
    # The UNK vector is drawn from np.random.randn on every call, so it differs
    # between runs unless NumPy's global seed is fixed beforehand.
    reserved = ("pad_b", "UNK")
    vocab = ["pad_b", "UNK"]
    wordEmbedding = [np.zeros(embeddingSize), np.random.randn(embeddingSize)]
    print("for循环前的vocab",vocab)

    skipped = 0
    for word in words:
        if word in reserved:
            # A corpus token equal to a reserved name would duplicate a vocab row
            # and redirect the reserved index in the word -> index mapping.
            skipped += 1
            continue
        try:
            vector = model.wv[word]
        except KeyError:
            # The cached model may have been trained with a different corpus or
            # min_count; skip words it does not contain.
            skipped += 1
            continue
        vocab.append(word)
        wordEmbedding.append(vector)

    if skipped:
        print("getWordEmbedding: skipped %d word(s) without a usable vector" % skipped)
    print("函数内的vocab1",vocab[:10])
    return vocab, np.array(wordEmbedding)

In [284]:
"""
Build the word vectors and the word-to-index mapping; the whole dataset can be used for this.
"""

# Flatten the tokenized reviews into a single list of tokens
allWords = [token for tokens in reviews for token in tokens]


In [285]:
# Drop every token that appears in the stop-word table
subWords = list(filter(lambda token: token not in stopWordDict, allWords))


In [286]:
# Count word frequencies, then list (word, count) pairs from most to least frequent
wordCount = Counter(subWords)
sortWordCount = wordCount.most_common()

In [287]:
# Keep only the words whose frequency reaches the configured minimum
words = [token for token, frequency in sortWordCount if frequency >= miniFreq]

In [288]:
# Build the vocabulary list and the pretrained weight matrix whose rows follow the same order
vocab, wordEmbedding = getWordEmbedding(words)

for循环前的vocab ['pad_b', 'UNK']
函数内的vocab1 ['pad_b', 'UNK', ' ', '手机', '买', '不错', '京东', '说', '客服', '质量']


In [289]:
# Inspect the Python type of the vocabulary container.
type(vocab)

list

In [290]:
# Peek at the first ten vocabulary entries (the two reserved tokens come first).
vocab[:10]

['pad_b', 'UNK', ' ', '手机', '买', '不错', '京东', '说', '客服', '质量']

In [291]:
# Two-way lookup between vocabulary words and their row index in the embedding matrix
wordToIndex = {token: position for position, token in enumerate(vocab)}
indexToWord = dict(enumerate(vocab))
# Vocabulary size, including the reserved entries
n_symbols = len(vocab)

In [292]:
# Save both vocabulary mappings as JSON so inference code can load them directly.
# A mapping is only written when its file does not exist yet; an existing file is left untouched.
for jsonPath, mapping in (("data/wordToIndex.json", wordToIndex),
                          ("data/indexToWord.json", indexToWord)):
    if not os.path.exists(jsonPath):
        print("存储json文件")
        with open(jsonPath, "w", encoding="utf-8") as f:
            json.dump(mapping, f)

In [293]:
def reviewProcess(review, sequenceLength, wordToIndex):
    """
    Encode one tokenized review as a fixed-length array of vocabulary indices.

    The first `sequenceLength` tokens are mapped through `wordToIndex`; tokens
    missing from the mapping get the index stored under "UNK". Reviews shorter
    than `sequenceLength` keep 0 in the remaining positions.

    Returns a NumPy array of length `sequenceLength` (default float dtype of np.zeros).
    """
    encoded = np.zeros((sequenceLength))

    # Number of positions actually filled: the shorter of the review and the target length
    usable = min(len(review), sequenceLength)

    for position, token in enumerate(review[:usable]):
        # The "UNK" entry is only looked up when a token is really missing
        encoded[position] = wordToIndex[token] if token in wordToIndex else wordToIndex["UNK"]

    return encoded

In [294]:
"""
Build the training set and the evaluation set.
"""

# Encode every review as a fixed-length row of vocabulary indices
reviews_index = [reviewProcess(tokens, sequenceLength, wordToIndex) for tokens in reviews]
# Wrap each label in a one-element list so the label arrays end up with one column
labels_index = [[labels[position]] for position in range(len(reviews))]

# Positional split: the first `rate` share of the rows is used for training, the rest for evaluation.
# NOTE(review): nothing here shuffles the data, so this assumes the file order is already
# mixed across classes — TODO confirm.
trainIndex = int(len(reviews) * rate)

trainReviews = np.asarray(reviews_index[:trainIndex], dtype="int64")
evalReviews = np.asarray(reviews_index[trainIndex:], dtype="int64")

trainLabels = np.array(labels_index[:trainIndex], dtype="float32")
evalLabels = np.array(labels_index[trainIndex:], dtype="float32")

In [295]:
# Display the encoded training matrix (one row of vocabulary indices per review).
trainReviews

array([[  1,   1,   1, ...,   0,   0,   0],
       [  1,   1,  14, ...,   0,   0,   0],
       [152,   1,   1, ...,   0,   0,   0],
       ...,
       [ 25,   5,   1, ...,   0,   0,   0],
       [  1, 432,   1, ...,   0,   0,   0],
       [142,  86,  47, ...,   0,   0,   0]])

In [296]:
# Inspect the type of the vocabulary container once more before building the model.
type(vocab)
# Expected: list

list

In [297]:
from tensorflow import keras

# Build the text-CNN classifier.
# Architecture: pretrained embedding -> three parallel (Conv1D + MaxPool1D) branches
# -> concatenate -> flatten -> dropout -> softmax output.
# NOTE: `model` is rebound here; the word2vec model loaded earlier under the same
# name is no longer reachable through it after this cell runs.

# The input carries sequenceLength vocabulary indices per review.
# NOTE(review): the dtype is kept as 'float64' from the original; an integer dtype
# would be the conventional choice for index input — confirm before changing.
main_input = Input(shape=(sequenceLength,), dtype='float64')

# Embedding layer initialised from the pretrained word2vec matrix and frozen (trainable=False)
embedder = Embedding(len(vocab) , embeddingSize, input_length=sequenceLength, weights=[wordEmbedding], trainable=False)
embed = embedder(main_input)

# One (kernel size, pool size) pair per branch; kernel sizes are 3, 4 and 5.
# The branches are concatenated on the last axis, so every pool size must yield the
# same number of time steps; with sequenceLength = 200 each of 38/37/36 yields 5.
# NOTE(review): revisit the pool sizes if sequenceLength changes.
branchSpecs = [(3, 38), (4, 37), (5, 36)]
branches = []
for kernelSize, poolSize in branchSpecs:
    conv = Conv1D(256, kernelSize, padding='same', strides=1, activation='relu')(embed)
    branches.append(MaxPool1D(pool_size=poolSize)(conv))

# Merge the outputs of the three branches
cnn = concatenate(branches, axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
# The output width comes from the configured class count instead of a literal 2
main_output = Dense(numClasses, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)

# One-hot encode the training labels. num_classes is pinned so the label width always
# matches the output layer, even if a class is absent from the training slice.
one_hot_labels = keras.utils.to_categorical(trainLabels, num_classes=numClasses)

# Show the model structure
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 200)]        0           []                               
                                                                                                  
 embedding_9 (Embedding)        (None, 200, 150)     3663300     ['input_10[0][0]']               
                                                                                                  
 conv1d_27 (Conv1D)             (None, 200, 256)     115456      ['embedding_9[0][0]']            
                                                                                                  
 conv1d_28 (Conv1D)             (None, 200, 256)     153856      ['embedding_9[0][0]']            
                                                                                            

In [298]:
# Train the freshly built network unless a saved classifier already exists on disk,
# in which case that file is loaded instead.
# NOTE(review): nothing in this cell writes "model/Text-Categorization.h5"; the only
# save call in the notebook is commented out in the last cell.
if not os.path.exists("model/Text-Categorization.h5"):
    print("重新训练")
    # Compile, then fit on the encoded training reviews and their one-hot labels
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(trainReviews, one_hot_labels, batch_size=800, epochs=10)
else:
    print("调入模型")
    model = keras.models.load_model("model/Text-Categorization.h5")

重新训练
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [299]:
# Run the classifier on the evaluation reviews. Per the original note, each output row
# holds the predicted probability of every class.
result = model.predict(evalReviews)

In [300]:
# Pick, for every row, the class index with the highest predicted score
result_labels = result.argmax(axis=1)

In [301]:
# String form of the predicted labels
y_predict = [str(label) for label in result_labels]

In [302]:
# NOTE(review): `metrics` is already imported in the first cell of this notebook;
# this repeated import is redundant.
from sklearn import metrics
# Accuracy and class-weighted F1 of the predicted labels against the evaluation labels.
print('准确率', metrics.accuracy_score(evalLabels, result_labels))
print('平均f1-score:', metrics.f1_score(evalLabels, result_labels, average='weighted'))
# The string literal below is a pasted log of an earlier run (scores and per-epoch
# training output), kept for reference only; it is not executed as code.
'''
准确率 0.9214459506430309
平均f1-score: 0.9214503352775644
重新训练
Epoch 1/10
29/29 [==============================] - 46s 2s/step - loss: 0.6048 - accuracy: 0.7414
Epoch 2/10
29/29 [==============================] - 47s 2s/step - loss: 0.2569 - accuracy: 0.9008
Epoch 3/10
29/29 [==============================] - 43s 1s/step - loss: 0.2143 - accuracy: 0.9173
Epoch 4/10
29/29 [==============================] - 43s 1s/step - loss: 0.1990 - accuracy: 0.9242
Epoch 5/10
29/29 [==============================] - 46s 2s/step - loss: 0.1876 - accuracy: 0.9288
Epoch 6/10
29/29 [==============================] - 48s 2s/step - loss: 0.1818 - accuracy: 0.9319
Epoch 7/10
29/29 [==============================] - 62s 2s/step - loss: 0.1739 - accuracy: 0.9349
Epoch 8/10
29/29 [==============================] - 49s 2s/step - loss: 0.1671 - accuracy: 0.9375
Epoch 9/10
29/29 [==============================] - 48s 2s/step - loss: 0.1629 - accuracy: 0.9400
Epoch 10/10
29/29 [==============================] - 47s 2s/step - loss: 0.1582 - accuracy: 0.9414
'''

准确率 0.9214459506430309
平均f1-score: 0.9214503352775644


In [303]:
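# Uncomment to save the trained network. When this file exists, the load-or-train
# cell above loads it instead of retraining.
# model.save("model/Text-Categorization.h5")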