## Constants and libraries

In [None]:
# File-system locations used by the notebook (all relative to the working directory)
DATASET_DIR = './data/'
GLOVE_DIR = './model_weights/glove.6B/'
SAVE_DIR = './model_weights/'

import os
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

import nltk
nltk.download('stopwords')# Download the stop-word lists (words carrying little content, e.g. 'ourselves', 'between', 'but', 'again', 'there')
nltk.download('punkt')# Download the Punkt tokenizer data used for sentence splitting


  from collections import Mapping, defaultdict
Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/limin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/limin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing the Data

我们将预处理所有文章并将其转换为特征向量，以便将其输入到RNN中。

这些都是用于整理文章数据的辅助函数。

In [None]:

def essay_to_wordlist(essay_v, remove_stopwords):
    """Clean a sentence or essay and return it as a list of lowercase words.

    Args:
        essay_v: Raw text of the sentence/essay.
        remove_stopwords: When truthy, English stop words (from NLTK) are
            filtered out of the result.

    Returns:
        A list of lowercase alphabetic tokens in their original order.
    """
    # Every character that is not an ASCII letter becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]


def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and clean each one into a word list.

    Args:
        essay_v: Raw essay text.
        remove_stopwords: Forwarded to ``essay_to_wordlist`` for every sentence.

    Returns:
        A list with one word list per non-empty sentence.
    """
    # Load NLTK's English Punkt sentence tokenizer resource.
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    # Leading/trailing whitespace is stripped before splitting into sentences.
    return [
        essay_to_wordlist(raw, remove_stopwords)
        for raw in sentence_splitter.tokenize(essay_v.strip())
        if len(raw) > 0
    ]



def makeFeatureVec(words, model, num_features):
    """Build one feature vector for an essay by averaging its word vectors.

    Args:
        words: Iterable of word tokens for a single essay.
        model: Trained word-embedding model exposing ``model.wv.index2word``
            (the vocabulary) and ``model.wv[word]`` (the vector lookup).
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean vector
        of all in-vocabulary words. If no word is in the vocabulary, the
        zero vector is returned.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    word_vectors = model.wv
    index2word_set = set(word_vectors.index2word)  # vocabulary known to the model

    for word in words:
        if word in index2word_set:
            num_words += 1
            # Look the vector up through `wv`, consistent with the vocabulary access above.
            featureVec = np.add(featureVec, word_vectors[word])

    # Guard against 0/0: an essay with no known word would otherwise become a
    # NaN vector and contaminate everything downstream.
    if num_words == 0:
        return featureVec

    return np.divide(featureVec, num_words)


def getAvgFeatureVecs(essays, model, num_features):
    """Turn a collection of tokenised essays into a matrix of feature vectors.

    Args:
        essays: Sized sequence of word lists, one per essay.
        model: Word-embedding model forwarded to ``makeFeatureVec``.
        num_features: Dimensionality of each feature vector.

    Returns:
        A float32 array of shape ``(len(essays), num_features)``; row ``i`` is
        the feature vector of ``essays[i]``.
    """
    vectors = np.zeros((len(essays), num_features), dtype="float32")
    for row, word_list in enumerate(essays):
        vectors[row] = makeFeatureVec(word_list, model, num_features)
    return vectors


## Defining the model 

在这里，我们定义了一个2层LSTM模型。

因为我们没有对训练标签进行标准化，所以输出层使用 ReLU 激活函数，而不是 Sigmoid。

In [None]:

def get_model():
    """Build and compile the recurrent regression network.

    The network stacks two LSTM layers (300 and 64 units), a dropout layer and
    a single-unit dense output with ReLU activation. It takes input of shape
    ``[1, 300]`` per sample, i.e. one time step of 300 features.

    Returns:
        The compiled Keras ``Sequential`` model; its summary is printed as a
        side effect.
    """
    layers = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ]
    network = Sequential(layers)
    # Loss: mean squared error; reported metric: mean absolute error.
    network.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    network.summary()  # print the per-layer parameter overview
    return network


## Importing the Data

In [None]:

# Read the tab-separated essay table.
X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
X = X.dropna(axis=1)  # drop every column that contains a missing value
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])  # drop the individual raters' scores

[r, c] = X.shape
# Training target: the raw (un-normalised) domain1_score.
# Taken as an explicit copy so that normalising the column inside X below can
# never leak into the labels, whatever copy/view semantics pandas applies.
y = X['domain1_score'].copy()
max_score = [12, 6, 3, 3, 4, 4, 30, 60]  # max_score[j] is the divisor for essay_set j + 1

# Divide each essay's score by the divisor of its essay set, in one vectorised
# step addressed by column name rather than by column position.
# Rows whose essay_set is not 1..8 get a divisor of 1, i.e. their score is unchanged.
score_divisor = X['essay_set'].map({j + 1: max_score[j] for j in range(8)}).fillna(1)
X['domain1_score'] = X['domain1_score'] / score_divisor
           
            

In [None]:

# Preview the first rows of X
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",0.666667
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",0.75
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",0.583333
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",0.833333
4,5,1,"Dear @LOCATION1, I know having computers has a...",0.666667


## Training Phase

现在我们在数据集上训练模型。

我们将使用5折交叉验证，并测量每折的二次加权Kappa。 然后，我们将计算所有折的平均kappa值。

In [None]:

cv = KFold(n_splits=5, shuffle=True)  # 5-fold cross-validation
# NOTE(review): shuffle=True without random_state means the folds differ on every run.
results = []      # quadratic-weighted kappa of each fold
y_pred_list = []  # not populated in this cell
count = 1         # 1-based number of the current fold

# Word2Vec hyper-parameters: identical for every fold, so defined once up front.
num_features = 300    # dimensionality of the word vectors
min_word_count = 40   # passed as min_count: rarer words are dropped from the vocabulary
num_workers = 4       # number of worker threads used for training
context = 10          # passed as window: max distance between current and predicted word
downsampling = 1e-3   # passed as sample: down-sampling threshold for frequent words

for traincv, testcv in cv.split(X):  # one (train indices, test indices) pair per fold

    print("\n--------Fold {}--------\n".format(count))

    # Split features and labels into this fold's training and test parts.
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    train_essays = X_train['essay']
    test_essays = X_test['essay']

    # ---- Word2Vec model ----
    # Collect the tokenised sentences of the training essays only.
    sentences = []
    for essay in train_essays:
        sentences += essay_to_sentences(essay, remove_stopwords=True)

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count, window=context,
                     sample=downsampling)
    # L2-normalise the vectors in place and discard the originals to save memory;
    # the model cannot be trained further after this call.
    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)  # overwritten by every fold

    # ---- LSTM model ----
    # Vectorise the training and test essays with this fold's Word2Vec model.
    clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

    clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

    # Reshape to 3-D (samples, 1 time step, features), the layout the LSTM input expects.
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    # Train a fresh LSTM for this fold.
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=40)

    # Predict scores for the held-out essays.
    y_pred = lstm_model.predict(testDataVecs)

    # Persist only the model trained on the last fold, under the configured SAVE_DIR.
    if count == cv.get_n_splits():
        lstm_model.save(os.path.join(SAVE_DIR, 'final_lstm.h5'))

    # Evaluate: round predictions to the nearest integer score, then compute
    # the quadratic-weighted kappa against the true scores.
    y_pred = np.around(y_pred)
    result = cohen_kappa_score(y_test.values, y_pred.ravel(), weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1

In [41]:
# Report the mean kappa over all folds, rounded to four decimals
mean_kappa = np.array(results).mean()
print("Average Kappa score after a 5-fold cross validation: ", np.around(mean_kappa, decimals=4))

Average Kappa score after a 5-fold cross validation:  0.9591
