### 1. 模块加载


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import jieba
import glob
import copy
import time

from keras.models import Model
from keras.layers import multiply
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint,TensorBoard
from keras.layers.wrappers import TimeDistributed


import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Cap this process at 30% of each GPU's memory and register the resulting
# session as the one the Keras TensorFlow backend uses.
config = tf.ConfigProto()
gpu_opts = config.gpu_options
gpu_opts.per_process_gpu_memory_fraction = 0.3
session = tf.Session(config=config)
set_session(session)

Using TensorFlow backend.


### 2. 目录定义查看


In [2]:
from subprocess import check_output
# Root of the working tree and the per-project sub-directories below it.
path = '../nocode/'
inp, oup, logp, modp = (
    'input/testproject/',
    'output/testproject/',
    'logs/testproject/',
    'models/testproject/',
)
input_dir = path + inp

# To list the input directory:
# print(check_output(["ls", path + inp]).decode("utf8"))

# Pre-trained word vectors; 'wordvector/wiki.zh.vec' is the alternative file.
EMBEDDING_FILE = path + 'wordvector/crawl-300d-2M.vec'

# Complete labelled data set.
data_all_file = input_dir + 'sematic_label_train.csv'

# Training split.
train_file = input_dir + 'train.csv'
# Test split ('test.csv' is the alternative file).
test_file = input_dir + 'test_alll.csv'

# Raw corpus in its three storage formats.
TXT_DATA_FILE = input_dir + 'sematic_train.txt'
XLSX_DATA_FILE = input_dir + 'sematic_train.xlsx'
CSV_DATA_FILE = input_dir + 'sematic_train.csv'

# Prediction result file (note: it lives in the input directory).
res_file = input_dir + 'baseline.csv'
# Model weights, output file and TensorBoard log location.
model_path = path + modp + 'model_t.h5'
out_path = path + oup + 'baseline.csv'
tensor_path = path + logp + 'baseline.csv'

# Per-epoch checkpoint naming alternative:
# model_path = './model/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'


### 4. 数据集 训练转化

In [3]:
# Load the complete labelled set and keep only the rows that have comment text.
data_all = pd.read_csv(data_all_file)
data_all = data_all[data_all["评论内容"].notnull()]

# Expand the numeric label column into three 0/1 indicator columns:
# 1 -> 'postive', 0 -> 'neutral', -1 -> 'negative'.
label_columns = ((1, 'postive'), (0, 'neutral'), (-1, 'negative'))
for _, indicator in label_columns:
    data_all[indicator] = 0
for label_value, indicator in label_columns:
    data_all.loc[data_all['分类'] == label_value, indicator] = 1


### 5. 开始训练

In [4]:
# Hyper-parameters shared by the tokenizer and the model.
# max_features = 999999
max_features = 200000  # vocabulary size handed to the tokenizer / embedding
maxlen = 300           # every comment is padded or cut to this many tokens
memn = 100             # LSTM units per direction and hidden dense width
dropn = 0.5            # dropout rate


def _segment_comment(comment):
    """Split one comment into words with jieba and re-join them with spaces."""
    return " ".join(jieba.cut(str(comment)))


train = pd.read_csv(train_file)
print(train.shape)
test = pd.read_csv(test_file)
print(test.shape)

# Segment the whole column in one pass instead of writing cell by cell with
# .loc.  Missing comments are skipped here (jieba cannot cut NaN) and receive
# the placeholder below, so a NaN row no longer aborts the cell.
train["评论内容"] = train["评论内容"].map(_segment_comment, na_action="ignore")
test["评论内容"] = test["评论内容"].map(_segment_comment, na_action="ignore")

list_sentences_train = train["评论内容"].fillna("CVxTz").values
list_sentences_test = test["评论内容"].fillna("CVxTz").values

# Target columns, in the order used for the model outputs.
list_classes = ["negative","neutral","postive"]

y = train[list_classes].values

(737, 7)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


(218296, 3)


Loading model cost 0.686 seconds.
Prefix dict has been built succesfully.


In [5]:
# The vocabulary is learned from the training comments only; num_words is
# set to max_features.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training comments -> integer sequences -> fixed-length matrix.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# Same transformation for the test comments, using the training vocabulary.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


In [6]:

def get_model(memn,dropn):
    """Build and compile the bidirectional-LSTM comment classifier.

    The input length and vocabulary size come from the module-level
    ``maxlen`` and ``max_features``.

    Args:
        memn: number of LSTM units per direction; also the width of the
            hidden dense layer.
        dropn: dropout rate applied before and after the hidden dense layer.

    Returns:
        A compiled Keras ``Model`` with three sigmoid outputs.

    NOTE(review): sigmoid outputs with binary_crossentropy score the three
    classes independently. If the labels are meant to be mutually exclusive,
    softmax with categorical_crossentropy would be the usual choice -- confirm
    before retraining, since saved weights were produced with this setup.
    """
    embed_size = 300
    # One token id per time step; the number of time steps equals maxlen.
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size)(tokens)
    # return_sequences=True keeps every time step so they can be max-pooled.
    encoded = Bidirectional(LSTM(memn, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    hidden = Dropout(dropn)(pooled)
    hidden = Dense(memn, activation="relu")(hidden)
    hidden = Dropout(dropn)(hidden)
    scores = Dense(3, activation="sigmoid")(hidden)
    classifier = Model(inputs=tokens, outputs=scores)
    classifier.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
    return classifier

In [7]:
batch_size = 32
epochs = 100

# Keep only the weights with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# TensorBoard logs are written under tensor_path.
tensorb = TensorBoard(log_dir=tensor_path, histogram_freq=10, write_graph=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
# Stop once the validation loss has not improved for 20 epochs.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early, tensorb]

model = get_model(memn, dropn)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line to the output).
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 300)          60000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 200)          320800    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________

In [8]:
# start = time.time()
# model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbacks_list)
# end = time.time()
# print(end-start)

In [9]:
# Restore the checkpointed weights and score every test comment.
model.load_weights(model_path)
y_test = model.predict(X_te)

# Re-read the test file so the result keeps the original, unsegmented text.
sample_submission = pd.read_csv(test_file)
for placeholder in ["分类", "postive", "neutral", "negative"]:
    sample_submission[placeholder] = 0
print(train.head())
print(sample_submission.head())

# One score column per class, plus the highest score of each row.
sample_submission[list_classes] = y_test
sample_submission["max"] = sample_submission[list_classes].max(axis=1)

# Name of the highest-scoring class for each row, computed in one vectorised
# call instead of a Python loop over every row and class.  idxmax returns the
# first maximum, so it is applied to the reversed class order: on an exact
# tie the class listed last in list_classes wins, as it did with the loop.
sample_submission["predict"] = sample_submission[list_classes[::-1]].idxmax(axis=1)

# Prefix the score columns with "pred_" and write the result file.
sample_submission = sample_submission.rename(
    columns={name: "pred_" + name for name in list_classes}
)
sample_submission.to_csv(res_file, index=False)

                                                评论内容   评论数    点赞数  分类  \
0                                  肖杰   实力派   有 资本 狂   1.0   73.0   1   
1                                 鹿晗 鹿晗 ， 一鹿 伴晗 ， 加油  26.0  174.0   1   
2    冯 老板 的 这段 舞 真的 精彩 ， 那个 慢动作 太帅 了 ， 看 了 N 遍 了 ， 赞   3.0   36.0   1   
3                        感觉 有 黑幕 ， 陈伟霆队 跳 的 更好 ， ， ，   0.0    7.0  -1   
4  白羊座 本来 就 很 冲动 一点 的 ， 所以 请 任何人 不要 这样 说 鹿晗 ， 换个 ...   0.0    3.0  -1   

   postive  neutral  negative  
0        1        0         0  
1        1        0         0  
2        1        0         0  
3        0        0         1  
4        0        0         1  
                                                评论内容  评论数   点赞数  分类  postive  \
0                    商人会关心科学？如果是，那应该也是如何科学赚钱的方法吧[捂脸]  NaN  23.0   0        0   
1     川普:盖兄到我这来当差如何？\n盖茨:去你妹的，滚！你有我有钱？我捐出去的钱都比你有的钱多！  NaN  19.0   0        0   
2                     比尔盖茨干这个确实屈才了，这个也不是盖茨强项，而且耽误干大事  NaN   8.0   0        0   
3  特朗普在没当选总统之前，在盖茨眼里都不算什么，财富是一方面，格局是另一方面，现在让盖茨屈尊伺

In [28]:
# Accuracy evaluation.
# NOTE(review): this scores the model on X_t / y, i.e. the training matrix,
# not on held-out data -- confirm that this is intended.
score = model.evaluate(X_t, y, batch_size=batch_size)
print(score)
print(list_classes)

res_pd = pd.read_csv(res_file)
# Number of rows whose predicted label is each class.
type_ori_num = [len(res_pd[res_pd["predict"] == label]) for label in list_classes]
print(type_ori_num)
# Sum of the predicted scores of each class over all rows.
type_num = [res_pd["pred_" + label].sum() for label in list_classes]
print(type_num)

# 3. Energy conversion
# NOTE(review): 219268 is hard-coded and presumably a total comment count;
# the recorded result file holds 218296 rows -- confirm which is meant.
k = 10
Nall = 219268
Nall *= k
Et = 1
# Alternative formulas that were tried:
# Ep = k * (Et - 1 / np.log(resnp / Nall))
# Ep = -Et / np.log(resnp / Nall)
energy = [-Et / np.log(total / Nall) for total in type_num]
print(energy)

[0.2558635689692226, 0.9081863470608053]
['negative', 'neutral', 'postive']
[104474, 0, 113822]
[115009.34027261566, 19083.862225225195, 89660.32110563666]
[0.3392283906962068, 0.2107909471513053, 0.3128077736362239]
