In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import time
import tensorflow as tf
import tensorflow_core as tfc
import csv
import json
import re
import sys
import count_data as cd
import load_data as ld
import gensim
import nltk

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow_core.python.keras.callbacks import LearningRateScheduler
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from gensim import corpora
from nltk import pos_tag
from nltk.corpus import wordnet
from collections import defaultdict

unable to import 'smart_open.gcs', disabling that module


In [2]:
def single_f1(y_pred, y_label, activation='softmax', threshold=0.5):
    """Compute F1, precision and recall for one binary classifier.

    Args:
        y_pred: Model outputs, one row per sample. With ``'softmax'`` each row
            holds (at least) two scores and class 0 is predicted only when the
            first score is strictly greater than the second. With ``'sigmoid'``
            each row holds a single score and class 1 is predicted unless the
            score is below ``threshold``.
        y_label: Ground-truth 0/1 labels, one per sample. Either an object
            exposing ``.numpy()`` (e.g. an eager tensor) or any array-like.
        activation: ``'softmax'`` or ``'sigmoid'``.
        threshold: Decision threshold; only used for ``'sigmoid'``.

    Returns:
        ``[f1, precision, recall, tp, fp, fn]`` as float32 scalars.

    Raises:
        ValueError: If ``activation`` is neither ``'softmax'`` nor ``'sigmoid'``.
    """
    if activation not in ('softmax', 'sigmoid'):
        raise ValueError(
            "activation must be 'softmax' or 'sigmoid', got {!r}".format(activation))

    def _as_array(values):
        # Accept eager tensors (via .numpy()) as well as plain array-likes.
        return np.asarray(values.numpy() if hasattr(values, 'numpy') else values)

    y_true = _as_array(y_label).astype(np.float32).reshape(-1)
    scores = _as_array(y_pred)

    if activation == 'softmax':
        # Ties go to class 1, exactly like the strict ">" comparison they replace.
        predicted_negative = scores[:, 0] > scores[:, 1]
    else:
        predicted_negative = scores.reshape(-1) < threshold
    y_hat = np.where(predicted_negative, 0.0, 1.0).astype(np.float32)

    # Keep every operand float32 so the arithmetic does not get promoted.
    epsilon = np.float32(1e-7)

    tp = np.dot(y_true, y_hat)
    fp = np.dot(1 - y_true, y_hat)
    fn = np.dot(y_true, 1 - y_hat)

    p = tp / (tp + fp + epsilon)
    r = tp / (tp + fn + epsilon)

    # epsilon keeps the ratios finite when there are no positive predictions/labels.
    f1 = np.float32(2) * p * r / (p + r + epsilon)

    return [f1, p, r, tp, fp, fn]

计算F1得分<br>对于sigmoid激活函数的输出可设置判定阈值<br>返回F1，precision，recall，tp，fp，fn

In [3]:
def prepare_train(max_length = 100, data_path='data/clean_data_0.5_no_repeat.csv'):
    """Load the dataset and turn the texts into fixed-length index sequences.

    Args:
        max_length: Length every sequence is padded/truncated to.
        data_path: CSV file handed to ``ld.load_data``. Defaults to the path
            that used to be hard-coded here.

    Returns:
        ``((x_train, y_train), (x_test, y_test), word_index)`` where the ``x``
        values are the padded sequences, the ``y`` values are the result of
        ``ld.transform_to_multi_class`` and ``word_index`` is the vocabulary
        of the tokenizer fitted on the training texts.
    """
    (x_train, y_train), (x_test, y_test) = ld.load_data(data_path)
    y_train = ld.transform_to_multi_class(y_train)
    y_test = ld.transform_to_multi_class(y_test)

    # The vocabulary is built from the training texts only; words that occur
    # only in the test texts map to the '<OOV>' token.
    tokenizer = Tokenizer(oov_token='<OOV>')
    tokenizer.fit_on_texts(x_train)

    def _vectorize(texts):
        # Text -> word indices -> sequences padded at the end up to max_length.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, padding='post', maxlen=max_length)

    x_train = _vectorize(x_train)
    x_test = _vectorize(x_test)
    return (x_train, y_train), (x_test, y_test), tokenizer.word_index

生成训练测试集以及词表

In [4]:
def get_model(model_type,model_name, embedding_dim=100):
    """Load one of the supported embedding models.

    Args:
        model_type: ``'word2vec'`` (loads ``data/<model_name>.w2v`` and returns
            its ``.wv`` keyed vectors), ``'doc2vec'`` (loads
            ``data/<model_name>.d2v``) or ``'google'`` (loads the GoogleNews
            binary from ``../input``; ``model_name`` is ignored).
        model_name: File stem of the saved gensim model under ``data/``.
        embedding_dim: Unused; kept so existing callers keep working.

    Returns:
        The loaded gensim object.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously this surfaced as an unbound-variable error).
    """
    if model_type == 'word2vec':
        return gensim.models.word2vec.Word2Vec.load("data/{}.w2v".format(model_name)).wv
    if model_type == 'doc2vec':
        return gensim.models.doc2vec.Doc2Vec.load("data/{}.d2v".format(model_name))
    if model_type == 'google':
        return gensim.models.KeyedVectors.load_word2vec_format(
            '../input/GoogleNews-vectors-negative300.bin', binary=True)
    raise ValueError(
        "model_type must be 'word2vec', 'doc2vec' or 'google', got {!r}".format(model_type))

加载3种不同模型，分别为word2vec，doc2vec以及google-news预训练模型

In [5]:
def get_embedding_matrix(model, vocab=None):
    """Build the initial embedding weight matrix for a vocabulary.

    Args:
        model: Embedding lookup exposing ``vector_size`` and ``model[word]``;
            a missing word must raise ``KeyError``.
        vocab: Mapping of word -> integer index. When omitted, the
            notebook-level ``word_index`` global is used, which is what this
            function always did before the parameter existed.

    Returns:
        Array of shape ``(len(vocab) + 1, model.vector_size)``. Row ``i`` holds
        the vector of the word with index ``i``; rows for words the model does
        not know (and any index not present in ``vocab``) stay all-zero.
    """
    if vocab is None:
        # Backward-compatible fallback to the global set by the driver cell.
        vocab = word_index

    embedding_matrix = np.zeros((len(vocab) + 1, model.vector_size))
    for word, index in vocab.items():
        try:
            embedding_matrix[int(index)] = model[str(word)]
        except KeyError:
            # Word unknown to the embedding model: keep its row at zero.
            continue
    return embedding_matrix

生成词向量矩阵

In [6]:
def bi_LSTM_attention_model(word_index,embedding_dim,max_length, embedding_matrix,weight):
    """Build and compile the BiLSTM + self-attention binary classifier.

    The network embeds the token ids, runs a bidirectional LSTM (128 units per
    direction, full sequence returned), applies ``Attention`` with the LSTM
    output as both query and value, max-pools the LSTM output and the attention
    output over time, concatenates the two pooled vectors and feeds them
    through a 64-unit ReLU layer into a single sigmoid unit.

    Args:
        word_index: Vocabulary mapping; the embedding has ``len(word_index) + 1`` rows.
        embedding_dim: Width of the embedding vectors.
        max_length: Length of the input sequences.
        embedding_matrix: Initial embedding weights (still trainable).
        weight: Passed to ``get_weight`` to build the loss.

    Returns:
        The compiled Keras model (Adam optimizer, accuracy metric).
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    token_ids = layers.Input(shape=(max_length,))
    embedding_layer = layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=True,
    )
    embedded = embedding_layer(token_ids)

    encoded = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(embedded)
    attended = layers.Attention()([encoded, encoded])

    # Pool both views of the sequence over the time axis, then join them.
    pooled_encoded = layers.GlobalMaxPooling1D()(encoded)
    pooled_attended = layers.GlobalMaxPooling1D()(attended)
    features = layers.Concatenate()([pooled_encoded, pooled_attended])

    hidden = layers.Dense(units=64, activation='relu')(features)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = tf.keras.models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss=get_weight(weight), optimizer='adam', metrics=['accuracy'])
    return classifier

In [7]:
def TextCNN_model(word_index,embedding_dim,max_length, embedding_matrix,weight):
    """Build and compile the single-branch TextCNN binary classifier.

    The network embeds the token ids, applies one 1-D convolution (128 filters,
    kernel size 3, 'same' padding, ReLU), max-pools over time, applies dropout
    of 0.5 and feeds a 32-unit ReLU layer into a single sigmoid unit.

    Args:
        word_index: Vocabulary mapping; the embedding has ``len(word_index) + 1`` rows.
        embedding_dim: Width of the embedding vectors.
        max_length: Length of the input sequences.
        embedding_matrix: Initial embedding weights (still trainable).
        weight: Passed to ``get_weight`` to build the loss.

    Returns:
        The compiled Keras model (Adam optimizer, accuracy metric).
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    token_ids = layers.Input(shape=(max_length,))
    embedding_layer = layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=True,
    )
    embedded = embedding_layer(token_ids)

    convolved = layers.Conv1D(128, 3, padding='same', strides=1, activation='relu')(embedded)
    pooled = layers.GlobalMaxPooling1D()(convolved)
    regularized = layers.Dropout(0.5)(pooled)

    hidden = layers.Dense(units=32, activation='relu')(regularized)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = tf.keras.models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss=get_weight(weight), optimizer='adam', metrics=['accuracy'])
    return classifier

In [8]:
# Settings read by scheduler() each time it is called.
num_epochs = 20
learning_rate = 0.0005


def scheduler(epoch):
    """Return the step-decayed learning rate for ``epoch``.

    The base ``learning_rate`` is used while ``epoch`` is below 30% of
    ``num_epochs``, half of it while below 60%, and a tenth of it afterwards.
    """
    first_boundary = num_epochs * 0.3
    second_boundary = num_epochs * 0.6

    if epoch < first_boundary:
        rate = learning_rate
    elif epoch < second_boundary:
        rate = learning_rate * 0.5
    else:
        rate = learning_rate * 0.1
    return rate

自定义学习率下降策略，实际没有使用

In [10]:
def get_weight_value(y_true):
    """Return inverse class frequencies for a binary label vector.

    Args:
        y_true: Labels exposing ``.numpy()`` and ``len()``; an entry equal to 0
            counts as class 0 and every other entry as class 1.

    Returns:
        ``[share_of_class_1, share_of_class_0]`` - i.e. each class is weighted
        by the fraction of samples belonging to the *other* class.
    """
    zero_count = sum(1 for label in y_true.numpy() if label == 0)
    total = len(y_true)
    one_count = total - zero_count
    return [one_count / total, zero_count / total]

def get_weight(weight):
    """Create the weighted binary cross-entropy loss used by the classifiers.

    Args:
        weight: Scalar mixing factor; ``1 - weight`` scales the first term of
            the loss and ``weight`` scales the second.

    Returns:
        A ``loss(y_true, y_pred)`` function. Per element it evaluates to
        ``(1 - weight) * bce * a + weight * bce * b`` where ``bce`` is
        ``K.binary_crossentropy(y_true, y_pred)``, ``a`` is ``y_pred`` where the
        label equals 1 (0 elsewhere) and ``b`` is ``y_pred`` where the label
        equals 0 (1 elsewhere).
    """
    def mycrossentropy(y_true, y_pred):
        bce = K.binary_crossentropy(y_true, y_pred)

        # Prediction where the label is 1, zero everywhere else.
        pred_on_positives = tf.where(tf.equal(y_true, 1), y_pred, tf.zeros_like(y_pred))
        # Prediction where the label is 0, one everywhere else.
        pred_on_negatives = tf.where(tf.equal(y_true, 0), y_pred, tf.ones_like(y_pred))

        first_term = (1-weight)*bce*pred_on_positives
        second_term = (weight)*bce*pred_on_negatives
        return first_term+second_term
    return mycrossentropy

定义损失函数权重

In [11]:
def run_model(x_train,y_train,
              x_test,y_test,
              word_index,
              embedding_dim,
              max_length,
              embedding_matrix,
              weight,
              model_name='TextCNN_model',
              if_scheduler=False,
              num_classifiers=12,
              epochs=50):
    """Train one binary classifier per category and evaluate each on the test set.

    Args:
        x_train, x_test: Model inputs shared by all classifiers.
        y_train, y_test: Indexable collections; entry ``i`` holds the binary
            labels used to fit/evaluate classifier ``i``.
        word_index, embedding_dim, max_length, embedding_matrix, weight:
            Forwarded unchanged to the model builder.
        model_name: ``'TextCNN_model'`` or ``'bi_LSTM_attention_model'``.
        if_scheduler: When true, also attach a ``LearningRateScheduler`` driven
            by the notebook-level ``scheduler`` function.
        num_classifiers: How many classifiers to train (default 12, the value
            that used to be hard-coded).
        epochs: Upper bound on training epochs per classifier (default 50, as
            before); early stopping on ``val_loss`` with patience 2 still applies.

    Returns:
        ``(models, histories)``: the trained models and the objects returned by
        ``fit``, both in classifier order.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture
            (previously this failed later with an unbound ``model``).
    """
    # Validate first so a typo fails immediately instead of inside the loop.
    if model_name not in ('TextCNN_model', 'bi_LSTM_attention_model'):
        raise ValueError(
            "model_name must be 'TextCNN_model' or 'bi_LSTM_attention_model', "
            "got {!r}".format(model_name))
    build_model = TextCNN_model if model_name == 'TextCNN_model' else bi_LSTM_attention_model

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]
    if if_scheduler:
        # Public alias of the class previously reached through tensorflow_core.
        callbacks.append(tf.keras.callbacks.LearningRateScheduler(scheduler))

    models = []
    histories = []
    for class_idx in range(num_classifiers):
        # A fresh model per category so no weights leak between classifiers.
        model = build_model(word_index,embedding_dim,max_length,embedding_matrix,weight)
        histories.append(model.fit(x_train,
                                   y_train[class_idx],
                                   epochs=epochs,
                                   validation_split=0.2,
                                   callbacks=callbacks,
                                   verbose=2))
        model.evaluate(x_test, y_test[class_idx], verbose=2)
        models.append(model)
    return (models,histories)

训练分类器，一次性训练12个针对各类别的分类器

In [12]:
def get_score(models,x_test,y_test):
    """Score every per-category classifier on the test set.

    Args:
        models: Trained classifiers, one per category, in category order.
        x_test: Test inputs shared by all classifiers.
        y_test: Indexable collection; entry ``i`` holds the labels for
            classifier ``i``.

    Returns:
        A list with one ``[f1, precision, recall]`` entry per category, taken
        from the first three values returned by ``single_f1`` (sigmoid outputs,
        threshold 0.5).
    """
    # Category names in classifier order; only their count drives the loop.
    category_names = (
        'Introductory/Generic', 'Practice not covered',
        'Privacy contact information', 'User Access, Edit and Deletion',
        'Data Security', 'International and Specific Audiences',
        'Do Not Track', 'User Choice/Control',
        'Data Retention', 'Policy Change',
        'First Party Collection/Use', 'Third Party Sharing/Collection',
    )

    def _score(position):
        metrics = single_f1(models[position].predict(x_test), y_test[position], 'sigmoid', 0.5)
        return [metrics[0], metrics[1], metrics[2]]

    return [_score(position) for position in range(len(category_names))]

在测试集测试，获取得分

In [13]:
def save_result(result,file_name,run_num):
    """Average the scores over all runs, print them and write them to a CSV file.

    Args:
        result: ``result[run][category]`` is a sequence whose first three
            entries are the scores to average (12 categories are read).
        file_name: Output path without extension; ``.csv`` is appended.
        run_num: Number of runs in ``result`` to average over.

    The output has 13 single-cell rows: one ``"a/b/c"`` string (two decimals
    each) per category, followed by the mean of the 12 category rows.
    """
    def _run_average(class_idx, metric_idx):
        # Plain left-to-right accumulation starting from 0.
        total = 0
        for run_idx in range(run_num):
            total = total+result[run_idx][class_idx][metric_idx]
        return total/run_num

    def _format_row(values):
        return ['/'.join([str('%.2f' % value) for value in values])]

    class_means = [[_run_average(class_idx, metric_idx) for metric_idx in range(3)]
                   for class_idx in range(12)]

    table_result = [_format_row(means) for means in class_means]

    # Final row: mean over the 12 categories of each averaged metric.
    overall = [0, 0, 0]
    for means in class_means:
        for metric_idx in range(3):
            overall[metric_idx] = overall[metric_idx]+means[metric_idx]
    table_result.append(_format_row([total/12 for total in overall]))

    print(table_result)
    with open(r'{}.csv'.format(file_name), 'w', encoding='gbk', newline='') as out_file:
        csv.writer(out_file, dialect=csv.excel, delimiter=',').writerows(table_result)

计算平均值，将数据导出保存

In [42]:
# Driver cell: train the 12 per-category classifiers run_num times, score each
# run on the test set and save the averaged scores to a CSV file.
result = []  # one get_score() table per run
run_num = 10  # number of repeated runs to average over
max_length = 200  # fixed sequence length used for padding and the model input
embedding_dim = 100  # embedding width passed to the model builders
# NOTE(review): embedding_dim is passed separately from embedding_matrix;
# confirm it matches the vector_size of the loaded embedding model.
weight = 0.2  # loss weight handed to get_weight() through run_model()
(x_train, y_train), (x_test, y_test),word_index = prepare_train(max_length)
# get_embedding_matrix() reads the global word_index assigned on the line above.
embedding_matrix = get_embedding_matrix(get_model(model_type='word2vec', model_name='word2vec'))
#  Choose word2vec or doc2vec via model_type above
for i in range(run_num):
    (models, histories) = run_model(x_train,y_train,
                                    x_test,y_test, 
                                    word_index=word_index,
                                    embedding_dim=embedding_dim,
                                    max_length=max_length, 
                                    embedding_matrix=embedding_matrix,
                                    weight = weight,
                                    model_name='TextCNN_model') #  architecture to train
    result.append(get_score(models=models,x_test=x_test,y_test=y_test))
save_result(result=result,file_name='TextCNN_word2vec_embed100_len200_10run',run_num=run_num) #  write the averaged scores to the given file

[0.17559322033898306, 0.8244067796610169]
Train on 2360 samples, validate on 590 samples
Epoch 1/50
2360/2360 - 2s - loss: 0.1126 - accuracy: 0.6072 - val_loss: 0.0836 - val_accuracy: 0.8271
Epoch 2/50
2360/2360 - 1s - loss: 0.0907 - accuracy: 0.6725 - val_loss: 0.0802 - val_accuracy: 0.7763
Epoch 3/50
2360/2360 - 1s - loss: 0.0847 - accuracy: 0.7055 - val_loss: 0.0787 - val_accuracy: 0.8068
Epoch 4/50
2360/2360 - 1s - loss: 0.0835 - accuracy: 0.7364 - val_loss: 0.0769 - val_accuracy: 0.8220
Epoch 5/50
2360/2360 - 1s - loss: 0.0785 - accuracy: 0.7305 - val_loss: 0.0748 - val_accuracy: 0.7780
Epoch 6/50
2360/2360 - 1s - loss: 0.0736 - accuracy: 0.7695 - val_loss: 0.0742 - val_accuracy: 0.7542
Epoch 7/50
2360/2360 - 1s - loss: 0.0699 - accuracy: 0.7928 - val_loss: 0.0775 - val_accuracy: 0.8525
Epoch 8/50
2360/2360 - 1s - loss: 0.0604 - accuracy: 0.8237 - val_loss: 0.0809 - val_accuracy: 0.8593
738/1 - 0s - loss: 0.0478 - accuracy: 0.8509
[0.1511864406779661, 0.848813559322034]
Train on 2

Epoch 4/50
2360/2360 - 1s - loss: 0.0156 - accuracy: 0.8708 - val_loss: 0.0078 - val_accuracy: 0.9034
Epoch 5/50
2360/2360 - 1s - loss: 0.0116 - accuracy: 0.8958 - val_loss: 0.0049 - val_accuracy: 0.9746
Epoch 6/50
2360/2360 - 1s - loss: 0.0113 - accuracy: 0.9157 - val_loss: 0.0055 - val_accuracy: 0.9492
Epoch 7/50
2360/2360 - 1s - loss: 0.0097 - accuracy: 0.9292 - val_loss: 0.0055 - val_accuracy: 0.9525
738/1 - 0s - loss: 0.0087 - accuracy: 0.9553
[0.40779661016949154, 0.5922033898305085]
Train on 2360 samples, validate on 590 samples
Epoch 1/50
2360/2360 - 2s - loss: 0.1804 - accuracy: 0.6280 - val_loss: 0.1327 - val_accuracy: 0.7237
Epoch 2/50
2360/2360 - 1s - loss: 0.1370 - accuracy: 0.7017 - val_loss: 0.1310 - val_accuracy: 0.7390
Epoch 3/50
2360/2360 - 1s - loss: 0.1251 - accuracy: 0.7492 - val_loss: 0.1288 - val_accuracy: 0.7797
Epoch 4/50
2360/2360 - 1s - loss: 0.1109 - accuracy: 0.7831 - val_loss: 0.1166 - val_accuracy: 0.7678
Epoch 5/50
2360/2360 - 1s - loss: 0.1037 - accurac

Epoch 4/50
2360/2360 - 1s - loss: 0.0243 - accuracy: 0.8970 - val_loss: 0.0371 - val_accuracy: 0.7254
Epoch 5/50
2360/2360 - 1s - loss: 0.0210 - accuracy: 0.9021 - val_loss: 0.0336 - val_accuracy: 0.7644
Epoch 6/50
2360/2360 - 1s - loss: 0.0169 - accuracy: 0.9233 - val_loss: 0.0298 - val_accuracy: 0.8356
Epoch 7/50
2360/2360 - 1s - loss: 0.0156 - accuracy: 0.9390 - val_loss: 0.0292 - val_accuracy: 0.8881
Epoch 8/50
2360/2360 - 1s - loss: 0.0117 - accuracy: 0.9589 - val_loss: 0.0285 - val_accuracy: 0.8797
Epoch 9/50
2360/2360 - 1s - loss: 0.0104 - accuracy: 0.9682 - val_loss: 0.0291 - val_accuracy: 0.8305
Epoch 10/50
2360/2360 - 1s - loss: 0.0104 - accuracy: 0.9648 - val_loss: 0.0280 - val_accuracy: 0.9203
Epoch 11/50
2360/2360 - 1s - loss: 0.0088 - accuracy: 0.9725 - val_loss: 0.0293 - val_accuracy: 0.9102
Epoch 12/50
2360/2360 - 1s - loss: 0.0078 - accuracy: 0.9763 - val_loss: 0.0324 - val_accuracy: 0.9458
738/1 - 0s - loss: 0.0275 - accuracy: 0.9634
[0.009152542372881356, 0.990847457

2360/2360 - 1s - loss: 0.0318 - accuracy: 0.9462 - val_loss: 0.0748 - val_accuracy: 0.8729
738/1 - 0s - loss: 0.0473 - accuracy: 0.8401
[0.17559322033898306, 0.8244067796610169]
Train on 2360 samples, validate on 590 samples
Epoch 1/50
2360/2360 - 2s - loss: 0.1196 - accuracy: 0.5890 - val_loss: 0.0824 - val_accuracy: 0.7542
Epoch 2/50
2360/2360 - 1s - loss: 0.0931 - accuracy: 0.6559 - val_loss: 0.0796 - val_accuracy: 0.7814
Epoch 3/50
2360/2360 - 1s - loss: 0.0846 - accuracy: 0.7089 - val_loss: 0.0773 - val_accuracy: 0.7559
Epoch 4/50
2360/2360 - 1s - loss: 0.0814 - accuracy: 0.7220 - val_loss: 0.0756 - val_accuracy: 0.7356
Epoch 5/50
2360/2360 - 1s - loss: 0.0749 - accuracy: 0.7653 - val_loss: 0.0784 - val_accuracy: 0.8169
Epoch 6/50
2360/2360 - 1s - loss: 0.0722 - accuracy: 0.7542 - val_loss: 0.0736 - val_accuracy: 0.7983
Epoch 7/50
2360/2360 - 1s - loss: 0.0717 - accuracy: 0.7695 - val_loss: 0.0776 - val_accuracy: 0.8339
Epoch 8/50
2360/2360 - 1s - loss: 0.0636 - accuracy: 0.8034 -

738/1 - 0s - loss: 0.0591 - accuracy: 0.8442
[0.04135593220338983, 0.9586440677966102]
Train on 2360 samples, validate on 590 samples
Epoch 1/50
2360/2360 - 2s - loss: 0.0331 - accuracy: 0.5754 - val_loss: 0.0293 - val_accuracy: 0.2102
Epoch 2/50
2360/2360 - 1s - loss: 0.0266 - accuracy: 0.5695 - val_loss: 0.0277 - val_accuracy: 0.5898
Epoch 3/50
2360/2360 - 1s - loss: 0.0269 - accuracy: 0.6089 - val_loss: 0.0289 - val_accuracy: 0.7729
Epoch 4/50
2360/2360 - 1s - loss: 0.0242 - accuracy: 0.6826 - val_loss: 0.0280 - val_accuracy: 0.3339
738/1 - 0s - loss: 0.0243 - accuracy: 0.3591
[0.05322033898305085, 0.9467796610169491]
Train on 2360 samples, validate on 590 samples
Epoch 1/50
2360/2360 - 2s - loss: 0.0353 - accuracy: 0.7288 - val_loss: 0.0169 - val_accuracy: 0.7271
Epoch 2/50
2360/2360 - 1s - loss: 0.0250 - accuracy: 0.7962 - val_loss: 0.0187 - val_accuracy: 0.6441
Epoch 3/50
2360/2360 - 1s - loss: 0.0180 - accuracy: 0.8182 - val_loss: 0.0107 - val_accuracy: 0.8729
Epoch 4/50
2360/23

训练分类器，每个类别训练10个分类器用于计算平均值，总共训练120次<br>
根据对应参数选择训练方法：<br>

run_num = 10 -> 训练多少轮求均值<br>
max_length = 200 -> 文本定长<br>
embedding_dim = 100 -> embedding_dim 词向量维数<br>
weight = 0.2 -> 损失函数权重<br>
