<a href="https://colab.research.google.com/github/edenlau/Text-Classification/blob/master/Beauty_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
% cd /gdrive/My Drive/LSTM

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
from itertools import accumulate

# 设置matplotlib绘图时的字体
from google.colab import files
#uploaded = files.upload()
my_font = font_manager.FontProperties(fname="./NotoSansSC-Regular.otf")

/gdrive/My Drive/LSTM


In [0]:
# 统计句子长度及长度出现的频数
filepath = "./beauty_corpus.csv"
df = pd.read_csv(filepath)
print(df.groupby('label')['label'].count())

df['length'] = df['evaluation'].apply(lambda x: len(x))
len_df = df.groupby('length').count()
sent_length = len_df.index.tolist()
sent_freq = len_df['evaluation'].tolist()

# 绘制句子长度及出现频数统计图
plt.bar(sent_length, sent_freq)
plt.title("句子长度及出现频数统计图", fontproperties=my_font)
plt.xlabel("句子长度", fontproperties=my_font)
plt.ylabel("句子长度出现的频数", fontproperties=my_font)
plt.savefig("句子长度及出现频数统计图.png")
#files.download("句子长度及出现频数统计图.png")
plt.close()

# 绘制句子长度累积分布函数(CDF)
sent_pentage_list = [(count/sum(sent_freq)) for count in accumulate(sent_freq)]

# 绘制CDF
plt.plot(sent_length, sent_pentage_list)

# 寻找分位点为quantile的句子长度
quantile = 0.91
#print(list(sent_pentage_list))
for length, per in zip(sent_length, sent_pentage_list):
    if round(per, 2) == quantile:
        index = length
        break
print("\n分位点为%s的句子长度:%d." % (quantile, index))

# 绘制句子长度累积分布函数图
plt.plot(sent_length, sent_pentage_list)
plt.hlines(quantile, 0, index, colors="c", linestyles="dashed")
plt.vlines(index, 0, quantile, colors="c", linestyles="dashed")
plt.text(0, quantile, str(quantile))
plt.text(index, 0, str(index))
plt.title("句子长度累积分布函数图", fontproperties=my_font)
plt.xlabel("句子长度", fontproperties=my_font)
plt.ylabel("句子长度累积频率", fontproperties=my_font)
plt.savefig("句子长度累积分布函数图.png")
#files.download("句子长度累积分布函数图.png") 
plt.close()

label
中性     1105
正面    15906
负面      823
Name: label, dtype: int64

分位点为0.91的句子长度:80.


In [0]:
# -*- coding: utf-8 -*-

import pickle
import numpy as np
import pandas as pd
from keras.utils import np_utils, plot_model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 导入数据
# 文件的数据中，特征为evaluation, 类别为label.
def load_data(filepath, input_shape=20):
    df = pd.read_csv(filepath)

    # 标签及词汇表
    labels, vocabulary = list(df['label'].unique()), list(df['evaluation'].unique())

    # 构造字符级别的特征
    string = ''
    for word in vocabulary:
        string += word

    vocabulary = set(string)

    # 字典列表
    word_dictionary = {word: i+1 for i, word in enumerate(vocabulary)}
    with open('word_dict.pk', 'wb') as f:
        pickle.dump(word_dictionary, f)
    inverse_word_dictionary = {i+1: word for i, word in enumerate(vocabulary)}
    label_dictionary = {label: i for i, label in enumerate(labels)}
    with open('label_dict.pk', 'wb') as f:
        pickle.dump(label_dictionary, f)
    output_dictionary = {i: labels for i, labels in enumerate(labels)}

    vocab_size = len(word_dictionary.keys()) # 词汇表大小
    label_size = len(label_dictionary.keys()) # 标签类别数量

    # 序列填充，按input_shape填充，长度不足的按0补充
    x = [[word_dictionary[word] for word in sent] for sent in df['evaluation']]
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)
    y = [[label_dictionary[sent]] for sent in df['label']]
    y = [np_utils.to_categorical(label, num_classes=label_size) for label in y]
    y = np.array([list(_[0]) for _ in y])

    return x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary

# 创建深度学习模型， Embedding + LSTM + Softmax.
def create_LSTM(n_units, input_shape, output_dim, filepath):
    x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary = load_data(filepath)
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size + 1, output_dim=output_dim,
                        input_length=input_shape, mask_zero=True))
    model.add(LSTM(n_units, input_shape=(x.shape[0], x.shape[1])))
    model.add(Dropout(0.2))
    model.add(Dense(label_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    plot_model(model, to_file='./model_lstm.png', show_shapes=True)
    model.summary()

    return model

# 模型训练
def model_train(input_shape, filepath, model_save_path):

    # 将数据集分为训练集和测试集，占比为9:1
    # input_shape = 100
    x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary = load_data(filepath, input_shape)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.1, random_state = 42)

    # 模型输入参数，需要自己根据需要调整
    n_units = 100
    batch_size = 32
    epochs = 5
    output_dim = 20

    # 模型训练
    lstm_model = create_LSTM(n_units, input_shape, output_dim, filepath)
    lstm_model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=1)

    # 模型保存
    lstm_model.save(model_save_path)

    N = test_x.shape[0]  # 测试的条数
    predict = []
    label = []
    for start, end in zip(range(0, N, 1), range(1, N+1, 1)):
        sentence = [inverse_word_dictionary[i] for i in test_x[start] if i != 0]
        y_predict = lstm_model.predict(test_x[start:end])
        label_predict = output_dictionary[np.argmax(y_predict[0])]
        label_true = output_dictionary[np.argmax(test_y[start:end])]
        print(''.join(sentence), label_true, label_predict) # 输出预测结果
        predict.append(label_predict)
        label.append(label_true)

    acc = accuracy_score(predict, label) # 预测准确率
    print('模型在测试集上的准确率为: %s.' % acc)

if __name__ == '__main__':
    filepath = './beauty_corpus.csv'
    input_shape = 180
    model_save_path = './beauty_corpus_model.h5'
    model_train(input_shape, filepath, model_save_path)

Using TensorFlow backend.
W0618 03:36:52.111222 140485073356672 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0618 03:36:52.142178 140485073356672 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0618 03:36:52.151009 140485073356672 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0618 03:36:52.393677 140485073356672 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 20)           67060     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               48400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 115,864
Trainable params: 115,864
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
 2816/16051 [====>.........................] - ETA: 2:23 - loss: 0.1943 - acc: 0.9315

KeyboardInterrupt: ignored

In [0]:
# -*- coding: utf-8 -*-

# Import the necessary modules
import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# 导入字典
with open('word_dict.pk', 'rb') as f:
    word_dictionary = pickle.load(f)
with open('label_dict.pk', 'rb') as f:
    output_dictionary = pickle.load(f)

try:
    # 数据预处理
    input_shape = 180
    sent = "作为一个男的，我是不懂口红怎么样的，买来送人的，但作为一个牌子的旗舰店包装都是这么简陋的吗"
    x = [[word_dictionary[word] for word in sent]]
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)

    # 载入模型
    model_save_path = './corpus_model.h5'
    lstm_model = load_model(model_save_path)

    # 模型预测
    y_predict = lstm_model.predict(x)
    label_dict = {v:k for k,v in output_dictionary.items()}
    print('输入语句: %s' % sent)
    print('情感预测结果: %s' % label_dict[np.argmax(y_predict)])

except KeyError as err:
    print("您输入的句子有汉字不在词汇表中，请重新输入！")
    print("不在词汇表中的单词为：%s." % err)

输入语句: 作为一个男的，我是不懂口红怎么样的，买来送人的，但作为一个牌子的旗舰店包装都是这么简陋的吗
情感预测结果: 正面


In [0]:
bdf = pd.read_csv("./ECData.csv")

In [0]:
# -*- coding: utf-8 -*-

# Import the necessary modules
import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# 导入字典
with open('word_dict.pk', 'rb') as f:
    word_dictionary = pickle.load(f)
with open('label_dict.pk', 'rb') as f:
    output_dictionary = pickle.load(f)

sbdf = bdf.sample(frac=0.01) # set sampling ratio for evaluation
    
for i, j in sbdf.iterrows(): 
    sent = sbdf.loc[i, "Comment"]
    try:
        # 数据预处理
        input_shape = 180
        #sent = "作为一个男的，我是不懂口红怎么样的，买来送人的，但作为一个牌子的旗舰店包装都是这么简陋的吗"
        x = [[word_dictionary[word] for word in sent]]
        x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)

        # 载入模型
        model_save_path = './corpus_model.h5'
        lstm_model = load_model(model_save_path)

        # 模型预测
        y_predict = lstm_model.predict(x)
        label_dict = {v:k for k,v in output_dictionary.items()}
        print('输入语句: %s' % sent)
        print('情感预测结果: %s' % label_dict[np.argmax(y_predict)])

    except KeyError as err:
        print("您输入的句子有汉字不在词汇表中，请重新输入！")
        print("不在词汇表中的单词为：%s." % err)

输入语句: 很快就收到，很喜欢味道香香的。颜色挺像mac的diva，不过ysl的唇釉我真的很喜欢不干而且不沾杯。上嘴我是无论什么色都会偏紫，一定涂手上就是中间那个颜色。
情感预测结果: 负面
输入语句: 相当满意的一次购物
情感预测结果: 正面
输入语句: 好看。喜欢。
情感预测结果: 负面
输入语句: 颜值真的超高，包装也很高大上，很喜欢，控油效果真的不错，但隐形毛孔感觉作用不大，也有可能个人不怎么会用，然后还有就可以定妆也不知道怎么用，总的还是比较满意
情感预测结果: 负面
输入语句: 好看，还有股悠悠的香味～
情感预测结果: 负面
输入语句: 双11购的这么快就收到了太美了也谢谢快递员，包装完美，颜色完美，还收到了好多赠品，非常愉快的购物！双11购的这么快就收到了太美了也谢谢快递员，包装完美，颜色完美，还收到了好多赠品，非常愉快的购物！
情感预测结果: 负面
输入语句: 味道很平淡，很好用，下次还会买的。好评，好评。
情感预测结果: 正面
输入语句: 回购的，之前是在免税店买的
情感预测结果: 负面
输入语句: 太美了把...
情感预测结果: 正面
输入语句: 很好看薄荷绿的壳很仙很喜欢代言人千玺宝宝的推荐💗
情感预测结果: 负面
输入语句: 还可以，等了好久，双十一下手的，蛮好用
情感预测结果: 负面
输入语句: 双十一买的，也很划算，赠品很喜欢，送的化妆包好看
情感预测结果: 负面
输入语句: 买给那个她的，很好！她非常喜欢╮(￣▽￣")╭
情感预测结果: 负面
输入语句: 虽然是个口红但是包装很走心了讲真的
情感预测结果: 负面
输入语句: 心动
情感预测结果: 负面
输入语句: 这颜色，棒的很嘛。很显白呢。
情感预测结果: 负面
输入语句: 好看的
情感预测结果: 正面
输入语句: 大牌就是大牌，真的很好，很服帖
情感预测结果: 负面
输入语句: 感觉还好吧。反正是送人的。女孩子收到的时候大叫了一声妈耶。
情感预测结果: 负面
输入语句: 女友拿到了很开心，颜色也超喜欢，上色很美
情感预测结果: 负面
输入语句: 超级好看超级好看涂在嘴巴上那好看的颜色真的拍不出来好看到爆炸
情感预测结果: 正面
输入语句: 刚开始上脸有些假白，一段时间之后会越来越自然，妆面很服帖，也不会干，很好
情感预测结果: 负面
输入语句: 此用户没有填写评论!
情感预测结果: 