In [1]:
import pandas as pd
import numpy as np
from keras.utils.np_utils import to_categorical
import jieba
import io

Using TensorFlow backend.


In [2]:
path = '../../data/comment-classification/'

In [3]:
# Read only the first 1000 rows of the training CSV (nrows=1000) to keep the run small.
df = pd.read_csv(path+'sentiment_analysis_trainingset.csv', nrows=1000)

## 处理文本数据

In [4]:
content = df['content']

In [5]:
def cut(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(text)
    return ' '.join(tokens)

In [6]:
# Replace every raw comment with its space-separated jieba tokens (see `cut`).
content = content.apply(cut)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DINGLI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.058 seconds.
Prefix dict has been built succesfully.


## 加载预训练词向量

In [19]:
def load_word_embeddings(fname):
    """Load word vectors from a text-format embedding file (e.g. fastText ``.vec``).

    Every data line holds a word followed by its vector components, separated
    by single spaces. fastText ``.vec`` files additionally start with a header
    line ``"<vocab_size> <dimension>"``; that header is detected and skipped so
    it is not stored as a bogus one-dimensional "word".

    Parameters
    ----------
    fname : str
        Path to the embedding file (UTF-8; undecodable bytes are ignored).

    Returns
    -------
    dict
        Maps each word (str) to a 1-D ``float32`` numpy array.

    Notes
    -----
    * Blank lines and lines without any vector component are skipped.
    * Lines are split on the ASCII space only, so a token made of other
      Unicode whitespace (e.g. U+3000) stays intact instead of shifting the
      columns.
    * A first line consisting of exactly two non-negative integers is always
      treated as the header.
    """
    vocab_and_vectors = {}
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as file:
        for line_no, line in enumerate(file):
            tokens = line.rstrip().split(' ')
            # fastText header: "<number of words> <vector dimension>"
            if line_no == 0 and len(tokens) == 2 and all(t.isdigit() for t in tokens):
                continue
            word = tokens[0]
            # drop empty strings produced by repeated spaces
            values = [t for t in tokens[1:] if t]
            if not word or not values:
                continue
            vocab_and_vectors[word] = np.asarray(values, dtype='float32')
    return vocab_and_vectors

In [20]:
# Load the pre-trained word vectors into a {word: vector} dict
# (the file name suggests fastText Chinese 300-d vectors).
word_embeddings = load_word_embeddings('../../data/fasttext/cc.zh.300.vec')

In [43]:
def vectorize_sequence(texts, max_encoder_seq_length, word_embeddings):
    """Turn space-tokenised texts into a fixed-length tensor of word vectors.

    Parameters
    ----------
    texts : sequence of str
        Texts whose tokens are separated by whitespace.
    max_encoder_seq_length : int
        Number of time steps in the output for every text.
    word_embeddings : dict
        Maps word -> 1-D vector; all vectors must share one length.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape
        ``(len(texts), max_encoder_seq_length, embedding_size)``.

    Behaviour
    ---------
    * Words missing from ``word_embeddings`` are dropped.
    * Shorter sequences are left-padded with zero vectors.
    * Longer sequences keep only their first ``max_encoder_seq_length`` words.

    Raises
    ------
    ValueError
        If ``word_embeddings`` is empty, so the vector size cannot be inferred.
    """
    if not word_embeddings:
        raise ValueError('word_embeddings must contain at least one vector')

    # Keep the historical probe word when it exists, but no longer require it:
    # any stored vector can tell us the embedding size.
    probe = word_embeddings.get('测试')
    if probe is None:
        probe = next(iter(word_embeddings.values()))
    embedding_size = np.asarray(probe).shape[0]

    # Starts as all zeros, so padding positions need no explicit write.
    results = np.zeros((len(texts), max_encoder_seq_length, embedding_size), dtype='float32')

    for i, text in enumerate(texts):
        vectors = [word_embeddings[w] for w in text.split() if w in word_embeddings]
        vectors = vectors[:max_encoder_seq_length]
        if vectors:
            # right-align the known words; leading rows stay zero (pre-padding)
            results[i, max_encoder_seq_length - len(vectors):] = vectors
    return results

In [44]:
# Encode every tokenised comment as 256 time steps of word vectors (encoder input).
x_train = vectorize_sequence(content, 256,word_embeddings)

In [46]:
x_train.shape

(1000, 256, 300)

## 处理目标数据

In [47]:
columns = df.columns.values.tolist()

In [48]:
# Every column from the third onward is treated as a label column
# (assumes the first two columns are id and content — TODO confirm against the CSV).
train_labels = df[columns[2:]]

In [49]:
# Shift the labels by +2 before one-hot encoding into 4 classes; `one_hot_decode`
# undoes this shift with -2 (presumably raw labels lie in -2..1 — verify against the data).
y_train = to_categorical(train_labels+2, num_classes=4)
print("y_train shape {}".format(y_train.shape))

y_train shape (1000, 20, 4)


In [59]:
# Teacher-forcing decoder input: for EACH sample, the target sequence shifted
# right by one time step, with an all-zero start vector at step 0. The shift
# must be along the time axis (axis=1); shifting along the sample axis would
# feed sample i the labels of sample i-1. This also mirrors inference, where
# decoding starts from a zero vector and then feeds back the previous step.
x2_train = np.concatenate([np.zeros_like(y_train[:, :1]), y_train[:, :-1]], axis=1)

In [61]:
x2_train.shape

(1000, 20, 4)

# 建立seq2seq模型

In [51]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.utils.vis_utils import plot_model

In [68]:
# returns train, inference_encoder and inference_decoder models
def define_models(n_input, n_output, n_units):
    """Build the training model and the two models used for step-wise inference.

    n_input: feature size of each encoder time step (word-vector size).
    n_output: feature size of each decoder time step (number of classes).
    n_units: number of LSTM units used by both the encoder and the decoder.

    Returns a tuple ``(training model, inference encoder, inference decoder)``.
    The inference models reuse the layers of the training model, so they share
    its weights.
    """
    # ---- training graph: encoder (only its final states are kept) ----
    enc_in = Input(shape=(None, n_input))
    enc_lstm = LSTM(n_units, return_state=True)
    _, enc_h, enc_c = enc_lstm(enc_in)
    enc_states = [enc_h, enc_c]

    # ---- training graph: decoder, initialised with the encoder states ----
    dec_in = Input(shape=(None, n_output))
    dec_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    dec_seq, _, _ = dec_lstm(dec_in, initial_state=enc_states)
    softmax_head = Dense(n_output, activation='softmax')
    train_model = Model([enc_in, dec_in], softmax_head(dec_seq))

    # ---- inference encoder: source sequence -> [h, c] ----
    inf_encoder = Model(enc_in, enc_states)

    # ---- inference decoder: (previous output, h, c) -> (probabilities, h, c) ----
    h_in = Input(shape=(n_units,))
    c_in = Input(shape=(n_units,))
    state_ins = [h_in, c_in]
    step_seq, step_h, step_c = dec_lstm(dec_in, initial_state=state_ins)
    step_probs = softmax_head(step_seq)
    inf_decoder = Model([dec_in] + state_ins, [step_probs, step_h, step_c])

    return train_model, inf_encoder, inf_decoder

In [69]:
# Configuration for the seq2seq model and its training run.
num_encoder_tokens = 300  # 300-dimensional word vectors
num_decoder_tokens = 4    # 4 sentiment levels

# Number of LSTM units passed to define_models / load_train_model.
latent_dim = 128

batch_size = 32
epochs = 2

# Aliases that name the arrays by their role in the model.
encoder_input_data = x_train
decoder_input_data = x2_train
decoder_target_data = y_train

In [70]:
# Build the training model together with the inference encoder/decoder that share its layers.
train, infenc, infdec = define_models(num_encoder_tokens, num_decoder_tokens, latent_dim)
# Softmax outputs are scored with categorical cross-entropy; accuracy is tracked as 'acc'.
train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# Fit on (encoder input, decoder input) -> decoder target.
train.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x23424835780>

## 验证模型

In [74]:
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
    """Decode ``n_steps`` outputs for one source sequence, one step at a time.

    infenc: inference encoder; ``predict(source)`` must return the state list.
    infdec: inference decoder; ``predict([step_input] + states)`` must return
        ``(output, h, c)``.
    source: encoded source sequence handed to the encoder.
    n_steps: number of time steps to generate.
    cardinality: number of features per output time step.

    Returns an array with one row per generated step (taken from
    ``output[0, 0, :]``).
    """
    # Encode the source once; the result seeds the decoder states.
    states = infenc.predict(source)
    # An all-zero vector acts as the start-of-sequence input.
    decoder_input = np.zeros((1, 1, cardinality))
    steps = []
    for _ in range(n_steps):
        probs, hidden, cell = infdec.predict([decoder_input] + states)
        steps.append(probs[0, 0, :])
        # Carry the new states forward and feed the raw output back in.
        states = [hidden, cell]
        decoder_input = probs
    return np.array(steps)


In [78]:
y_train_pred = predict_sequence(infenc, infdec, x_train[0:1], 20, 4)

In [79]:
y_train_pred

array([[0.77009726, 0.02088405, 0.01089752, 0.19812119],
       [0.78496355, 0.0287317 , 0.01616323, 0.17014155],
       [0.78814805, 0.03786965, 0.02382274, 0.15015954],
       [0.78157836, 0.04684557, 0.03336274, 0.13821335],
       [0.7688455 , 0.05472791, 0.04410048, 0.13232611],
       [0.7524493 , 0.0607225 , 0.05531617, 0.13151205],
       [0.73344266, 0.06421298, 0.06628405, 0.13606033],
       [0.7123251 , 0.06535235, 0.07643841, 0.14588408],
       [0.6895445 , 0.0648436 , 0.08538803, 0.16022395],
       [0.66568786, 0.06341065, 0.09286465, 0.17803682],
       [0.64149517, 0.06157431, 0.09875781, 0.1981727 ],
       [0.617762  , 0.05964941, 0.10313462, 0.21945389],
       [0.59521884, 0.05780599, 0.106194  , 0.2407812 ],
       [0.57443714, 0.05612673, 0.108195  , 0.2612411 ],
       [0.5557863 , 0.05464415, 0.1093986 , 0.28017092],
       [0.53943443, 0.05336286, 0.11003404, 0.29716864],
       [0.52538323, 0.05227229, 0.11028495, 0.31205955],
       [0.51351434, 0.05135427,

In [125]:
def one_hot_decode(encoded_seq):
    """Map each score vector to a label: index of its largest entry, minus 2.

    Returns an integer numpy array with one label per vector in ``encoded_seq``.
    """
    class_ids = np.fromiter((np.argmax(row) for row in encoded_seq), dtype=int)
    return class_ids - 2

In [88]:
y_train_pred_decode = one_hot_decode(y_train_pred)

In [90]:
y_train_pred_decode

array([-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
       -2, -2, -2], dtype=int64)

## 预测模型

### 加载测试数据

In [97]:
# Read the first 100 rows of the test CSV and tokenise the comments the same way as the training text.
df_test = pd.read_csv(path+'sentiment_analysis_testa.csv', nrows=100)
data_test = df_test['content']
data_test = data_test.apply(cut)

In [98]:
# Encode the test comments with the same sequence length (256) used for x_train.
x_test = vectorize_sequence(data_test, 256, word_embeddings)

In [107]:
from keras.models import load_model
def load_train_model(model_name, n_units):
    """Restore a saved training model and rebuild the inference models from it.

    model_name: path of the saved model file passed to ``load_model``.
    n_units: size of the decoder state inputs (LSTM units of the saved model).

    Returns ``(encoder_model, decoder_model)``.

    The layers are picked by position: ``layers[2]`` is used as the encoder
    LSTM, ``layers[3]`` as the decoder LSTM and ``layers[4]`` as the output
    layer. NOTE(review): this relies on the saved model having the layer order
    produced by ``define_models`` — confirm for other models.
    """
    trained = load_model(model_name)

    # Encoder: first model input -> final [h, c] of the encoder LSTM.
    enc_in = trained.input[0]  # input_1
    _, enc_h, enc_c = trained.layers[2].output  # lstm_1
    inf_encoder = Model(enc_in, [enc_h, enc_c])

    # Decoder: second model input plus explicit state inputs for step-wise decoding.
    dec_in = trained.input[1]  # input_2
    h_in = Input(shape=(n_units,), name='input_3')
    c_in = Input(shape=(n_units,), name='input_4')
    state_ins = [h_in, c_in]
    dec_lstm = trained.layers[3]
    dec_seq, dec_h, dec_c = dec_lstm(dec_in, initial_state=state_ins)
    dec_dense = trained.layers[4]
    inf_decoder = Model([dec_in] + state_ins, [dec_dense(dec_seq), dec_h, dec_c])

    return inf_encoder, inf_decoder

In [108]:
# Rebind infenc/infdec to inference models rebuilt from a saved model file.
# NOTE(review): no cell in view writes './seq/s2s_train.h5' — confirm where it is saved,
# otherwise this fails on a fresh kernel.
infenc, infdec = load_train_model('./seq/s2s_train.h5', latent_dim)

In [126]:
# Decode the test samples one at a time (the i:i+1 slice keeps the batch axis)
# and write the 20 decoded labels into the columns from the third onward of df_test, in place.
for i, _ in enumerate(x_test):
    encoded_seq = predict_sequence(infenc, infdec, x_test[i:i+1], 20, 4)
    seq = one_hot_decode(encoded_seq)
    df_test.iloc[i, 2:] = seq

In [127]:
df_test

Unnamed: 0,id,content,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,...,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
0,0,"""我想说他们家的优惠活动好持久啊，我预售的时候买的券，前两天心血来潮去吃的活动还在继续\n首...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,1,"""终于开到心心念念的LAB loft。第一次来就随便点也一些～【香辣虾意面】蛮辣的，但其实一...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,2,"""地理位置好，交通方便，就在124车站对面交通方便，很好，我晚上7点多去买的了，已经没有什么...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,3,"""运气很好，抽中了大众点评的霸王餐。这家主题餐厅心仪已久了，种种原因一直未能成行，没想到抽中...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,4,"""幸运随点评团体验霸王餐，心情好~蜀九香刚进驻泉州不久，招牌大名气响，以至于刚到店门口的我被...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,"""先是办好了会员卡，因为用会员卡可以打8折买披萨，只限披萨，其他的不能打折。用会员卡买好披萨...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
96,96,"""环境看着挺整洁温馨的。里面有很多小楠很喜欢玩具和模型。一进去就和楠爹研究上了。我团购的，选...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
97,97,"""平心而论，这家店味道还OK，至少足够吸引武汉人民的新鲜感，然而对于这家店对深圳乐凯撒各方面...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
98,98,"""位于梅川路步行街 很好找 5点到的 不需要排队 坐下来把锅底和菜品点好 点的都是招牌 比如...",-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
