In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import time
import tensorflow as tf
import csv
import json
import re
import sys
import load_data as ld
import count_data as cd
import gensim
import nltk

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


unable to import 'smart_open.gcs', disabling that module


In [2]:
# Load the cleaned dataset through the project-local loader. The call returns
# a (train, test) pair, each itself a (features, labels) pair.
(x_train, y_train), (x_test, y_test) = ld.load_data('data/clean_data_0.5_no_repeat.csv')

In [3]:
# Re-encode the labels with the project helper. Later cells index the result
# as y_train[i] / y_test[i] for i in range(12), i.e. one target set per label.
# Note: this overwrites the raw labels, so the cell is not idempotent.
y_train = ld.transform_to_multi_class(y_train)
y_test = ld.transform_to_multi_class(y_test)

In [4]:
# For each of the 12 labels, record [positive share, negative share] of the
# training targets. A target is counted as negative only when it equals 0;
# every other value is counted as positive.
loss_weight_matrix = []
for i in range(12):
    num_all = 0
    num_of_negative = 0
    for j in y_train[i]:
        num_all += 1
        num_of_negative += 1 if j == 0 else 0
    num_of_positive = num_all - num_of_negative
    pos_ratio = num_of_positive / num_all
    neg_ratio = num_of_negative / num_all
    loss_weight_matrix.append([pos_ratio, neg_ratio])

In [7]:
# Load the saved word2vec model and keep only its word-vector table (`.wv`).
word2vec = gensim.models.word2vec.Word2Vec.load("data/word2vec_300.w2v").wv

In [5]:
# Load the saved doc2vec model (the full model object is kept, not just the
# vectors); it is the source of the embedding matrix built further down.
doc2vec = gensim.models.doc2vec.Doc2Vec.load("data/doc2vec_300.d2v")

In [6]:
# Fit the vocabulary on the training texts only, then turn both splits into
# fixed-length integer sequences (padded at the end to `max_length`).
max_length = 200
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

# The raw texts are replaced by their padded sequences, exactly as before;
# re-running this cell therefore requires re-running the data-loading cell.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts),
                  padding='post',
                  maxlen=max_length)
    for texts in (x_train, x_test)
)

In [14]:
# Build the initial weights for the Keras Embedding layers: row k holds the
# pre-trained vector of the word whose tokenizer index is k. Rows for words
# that are missing from the pre-trained vocabulary (and row 0, which no word
# in `word_index` is expected to map to) stay all-zero.
#
# Word vectors are read from `doc2vec.wv` explicitly instead of calling the
# model's `__getitem__` dunder: on gensim 3.x that method falls back to
# *document-tag* vectors for unknown words, and newer gensim releases no
# longer provide it on the model at all.
word_vectors = doc2vec.wv
embedding_matrix = np.zeros((len(word_index) + 1, doc2vec.vector_size))
for word, index in word_index.items():
    try:
        embedding_matrix[int(index)] = word_vectors[str(word)]
    except KeyError:
        # Out-of-vocabulary word: keep the zero row.
        continue

In [15]:
def bi_LSTM_attention_model(word_index,embedding_dim,max_length, embedding_matrix, weight):
    """Build and compile a BiLSTM + self-attention binary classifier.

    Args:
        word_index: Tokenizer vocabulary; the embedding table gets
            ``len(word_index) + 1`` rows.
        embedding_dim: Width of each embedding vector.
        max_length: Length of the (padded) input sequences.
        embedding_matrix: Initial embedding weights.
        weight: Value forwarded to ``get_weight`` to build the loss.

    Returns:
        A compiled Keras model with a single sigmoid output unit.
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    main_input = layers.Input(shape=(max_length,))
    # Word embedding initialised from the pre-trained vectors and fine-tuned.
    embed = layers.Embedding(vocab_size,
                             embedding_dim,
                             input_length=max_length,
                             weights=[embedding_matrix],
                             trainable=True)(main_input)
    bilstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(embed)
    # Self-attention: the BiLSTM states serve as both query and value.
    attention = layers.Attention()([bilstm, bilstm])
    pooled_states = layers.GlobalMaxPooling1D()(bilstm)
    pooled_attention = layers.GlobalMaxPooling1D()(attention)
    merged = layers.Concatenate()([pooled_states, pooled_attention])
    hidden = layers.Dense(units=64, activation='relu')(merged)
    main_output = layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=main_input, outputs=main_output)
    model.compile(loss=get_weight(weight), optimizer='adam', metrics=['accuracy'])
    return model

In [16]:
def LSTM_model(word_index,embedding_dim,max_length, embedding_matrix):
    """Build and compile a bidirectional-LSTM classifier with a 2-way softmax.

    Args:
        word_index: Tokenizer vocabulary; the embedding table gets
            ``len(word_index) + 1`` rows.
        embedding_dim: Width of each embedding vector.
        max_length: Length of the (padded) input sequences.
        embedding_matrix: Initial embedding weights.

    Returns:
        A Keras model compiled with sparse categorical cross-entropy.
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    main_input = layers.Input(shape=(max_length,))
    embed = layers.Embedding(vocab_size,
                             embedding_dim,
                             input_length=max_length,
                             weights=[embedding_matrix],
                             trainable=True)(main_input)
    # Only the final state of each direction is kept (return_sequences=False).
    encoded = layers.Bidirectional(
        layers.LSTM(units=128, return_sequences=False))(embed)
    flat = layers.Flatten()(encoded)
    dropped = layers.Dropout(0.5)(flat)
    main_output = layers.Dense(2, activation='softmax')(dropped)

    model = tf.keras.models.Model(inputs=main_input, outputs=main_output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [17]:
def TextCNN_model(word_index,embedding_dim,max_length, embedding_matrix, weight):
    """Build and compile a single-branch TextCNN binary classifier.

    Args:
        word_index: Tokenizer vocabulary; the embedding table gets
            ``len(word_index) + 1`` rows.
        embedding_dim: Width of each embedding vector.
        max_length: Length of the (padded) input sequences.
        embedding_matrix: Initial embedding weights.
        weight: Value forwarded to ``get_weight`` to build the loss.

    Returns:
        A compiled Keras model with a single sigmoid output unit.
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    main_input = layers.Input(shape=(max_length,))
    embed = layers.Embedding(vocab_size,
                             embedding_dim,
                             input_length=max_length,
                             weights=[embedding_matrix],
                             trainable=True)(main_input)
    # One width-3 convolution followed by max-over-time pooling.
    features = layers.Conv1D(128, 3, padding='same', strides=1, activation='relu')(embed)
    pooled = layers.GlobalMaxPooling1D()(features)
    dropped = layers.Dropout(0.5)(pooled)
    hidden = layers.Dense(units=32, activation='relu')(dropped)
    main_output = layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=main_input, outputs=main_output)
    model.compile(loss=get_weight(weight), optimizer='adam', metrics=['accuracy'])
    return model

In [18]:
def standard_TextCNN_model(word_index,embedding_dim,max_length, embedding_matrix, weight):
    """Build and compile a three-branch TextCNN binary classifier.

    Three parallel convolutions (kernel widths 3, 4 and 5) are each max-pooled,
    concatenated along the channel axis, flattened and fed to a sigmoid unit.

    Args:
        word_index: Tokenizer vocabulary; the embedding table gets
            ``len(word_index) + 1`` rows.
        embedding_dim: Width of each embedding vector.
        max_length: Length of the (padded) input sequences.
        embedding_matrix: Initial embedding weights.
        weight: Value forwarded to ``get_weight`` to build the loss.

    Returns:
        A compiled Keras model with a single sigmoid output unit.
    """
    layers = tf.keras.layers
    vocab_size = len(word_index) + 1

    main_input = layers.Input(shape=(max_length,))
    embed = layers.Embedding(vocab_size,
                             embedding_dim,
                             input_length=max_length,
                             weights=[embedding_matrix],
                             trainable=True)(main_input)

    # NOTE(review): the pool sizes are fixed while max_length is a parameter;
    # the branches can only be concatenated when all three pooled lengths
    # match (they do for max_length=200) - confirm before using other lengths.
    branches = []
    for kernel_size, pool_size in ((3, 48), (4, 47), (5, 46)):
        conv = layers.Conv1D(256, kernel_size, padding='same', strides=1, activation='relu')(embed)
        branches.append(layers.MaxPooling1D(pool_size=pool_size)(conv))

    # Merge the output tensors of the three branches.
    cnn = layers.concatenate(branches, axis=-1)
    flat = layers.Flatten()(cnn)
    dropped = layers.Dropout(0.2)(flat)
    main_output = layers.Dense(1, activation='sigmoid')(dropped)

    model = tf.keras.models.Model(inputs=main_input, outputs=main_output)
    model.compile(loss=get_weight(weight), optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
def save_result(result,file_name,run_num,num_labels=12):
    """Average per-label scores over several runs, print them and save to CSV.

    Args:
        result: Nested sequence indexed as ``result[run][label][k]`` where
            ``k`` is 0 for F1, 1 for precision and 2 for recall.
        file_name: Output path without extension; ``.csv`` is appended.
        run_num: Number of runs (leading entries of ``result``) to average.
        num_labels: Number of labels to report. Defaults to 12, the value
            that used to be hard-coded.

    Side effects:
        Prints the table and writes ``<file_name>.csv`` (GBK-encoded). The
        file has one single-cell row per label formatted ``F1/P/R`` with two
        decimals, followed by one row holding the mean over all labels.
    """
    def _run_mean(label, field):
        # Mean of one metric of one label across the first `run_num` runs.
        return sum(result[run][label][field] for run in range(run_num)) / run_num

    per_label = [[_run_mean(label, field) for field in range(3)]
                 for label in range(num_labels)]
    overall = [sum(scores[field] for scores in per_label) / num_labels
               for field in range(3)]

    table_result = [['/'.join('%.2f' % value for value in scores)]
                    for scores in per_label + [overall]]
    print(table_result)

    with open(r'{}.csv'.format(file_name), 'w', encoding='gbk', newline='') as csv_file:
        writer = csv.writer(csv_file, dialect=csv.excel, delimiter=',')
        writer.writerows(table_result)

In [20]:
import tensorflow_core as tfc
from tensorflow_core.python.keras.callbacks import LearningRateScheduler
num_epochs = 20
learning_rate = 0.0005


# Learning-rate decay schedule.
def scheduler(epoch):
    """Return the learning rate to use at ``epoch`` (step-wise decay).

    The base rate is used for the first 30% of ``num_epochs``, half of it up
    to 60%, and a tenth of it afterwards.
    """
    if epoch < num_epochs * 0.3:
        return learning_rate
    decay = 0.5 if epoch < num_epochs * 0.6 else 0.1
    return learning_rate * decay



def get_weight(weights):
    """Return a Keras loss function that re-weights binary cross-entropy.

    Args:
        weights: Scalar mixing factor; ``1 - weights`` scales the first term
            and ``weights`` scales the second term of the loss.

    Returns:
        A function ``mycrossentropy(y_true, y_pred)`` giving the element-wise
        loss tensor.
    """
    def mycrossentropy(y_true, y_pred):
        bce = K.binary_crossentropy(y_true, y_pred)
        # pt_1: the prediction where the target is 1, otherwise 0.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.zeros_like(y_pred))
        # pt_0: the prediction where the target is 0, otherwise 1.
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.ones_like(y_pred))
        # NOTE(review): both terms scale the loss by the raw prediction rather
        # than by (1 - p_t) as in focal loss - confirm this is intended.
        positive_term = (1-weights)*bce*pt_1
        negative_term = weights*bce*pt_0
        return positive_term + negative_term
    return mycrossentropy


def customized_sparse_categorical_crossentrophy(y_true, y_pred):
    """Class-ratio-weighted cross-entropy for a two-way softmax output.

    Fixes over the previous version: the negative counter is now spelled
    consistently (the old code incremented ``num_of_negtive`` but read
    ``num_of_negative``, silently picking up an unrelated notebook global),
    ``tf.math.log`` replaces ``tf.log`` (which TensorFlow 2.x does not
    provide), and the computed loss is actually returned.

    Args:
        y_true: Iterable of integer class labels; 0 is the negative class and
            any other value is treated as positive. It is iterated in Python,
            so this only works on concrete (eager) values.
        y_pred: Predicted class probabilities, one ``[p_neg, p_pos]`` pair per
            label in ``y_true``.

    Returns:
        A scalar tensor holding the weighted loss.
    """
    y_true_one_hot = []
    num_of_positive = 0
    num_of_negative = 0
    for label in y_true:
        if label == 0:
            y_true_one_hot.append([1., 0.])
            num_of_negative += 1
        else:
            y_true_one_hot.append([0., 1.])
            num_of_positive += 1
    num_all = num_of_positive + num_of_negative
    y_true_one_hot = tf.constant(y_true_one_hot)
    pos_ratio = num_of_positive/num_all
    neg_ratio = num_of_negative/num_all
    # NOTE(review): the grouping of the two weighted terms is kept exactly as
    # originally written - confirm it matches the intended weighting scheme.
    cross_entropy = tf.reduce_mean(-neg_ratio*tf.reduce_sum(y_true_one_hot * tf.math.log(y_pred)+pos_ratio*(y_true_one_hot-1)* tf.math.log(1-y_pred)))
    return cross_entropy

In [None]:
# Sweep the loss weight and the decision threshold for every label with the
# BiLSTM-attention model and keep, per label, the best test-set F1 together
# with its precision and recall.
embedding_dim = 300
logdir = '../output/cnn_multi_label_callbacks'
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]
best_score = []
for i in range(12):
    # [F1, precision, recall] for every (loss weight, threshold) pair tried.
    F1 = []
    for weight in np.linspace(0.1,0.6,10,endpoint=False):
        model = bi_LSTM_attention_model(word_index,embedding_dim,max_length, embedding_matrix, weight)
        model.summary()
        history = model.fit(x_train,
                            y_train[i],
                            epochs=50,
                            validation_split=0.2,
                            callbacks=callbacks,
                            verbose=2)
        model.evaluate(x_test, y_test[i], verbose=2)
        # The predictions do not depend on the threshold, so compute them once
        # per model instead of once per threshold (was 100 predict calls).
        test_predictions = model.predict(x_test)
        for threshold in np.linspace(0.2,0.8,100,endpoint=False):
            result = single_f1(test_predictions, y_test[i], 'sigmoid', threshold)
            F1.append([result[0], result[1], result[2]])
    # Keep the first entry with the highest F1; stays at zeros if none is > 0.
    max_F1, max_p, max_r = 0, 0, 0
    for score in F1:
        if score[0] > max_F1:
            max_F1, max_p, max_r = score
    best_score.append([max_F1, max_p, max_r])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 300)     1275900     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 200, 256)     439296      embedding[0][0]                  
__________________________________________________________________________________________________
attention (Attention)           (None, 200, 256)     0           bidirectional[0][0]              
                                                                 bidirectional[0][0]          

2360/2360 - 7s - loss: 0.1050 - accuracy: 0.8297 - val_loss: 0.0775 - val_accuracy: 0.8847
Epoch 2/50
2360/2360 - 3s - loss: 0.0736 - accuracy: 0.8801 - val_loss: 0.0763 - val_accuracy: 0.8610
Epoch 3/50
2360/2360 - 3s - loss: 0.0587 - accuracy: 0.9030 - val_loss: 0.0774 - val_accuracy: 0.8508
Epoch 4/50
2360/2360 - 3s - loss: 0.0453 - accuracy: 0.9254 - val_loss: 0.1004 - val_accuracy: 0.8254
738/1 - 0s - loss: 0.0485 - accuracy: 0.8374
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 300)     1275900     input_4[0][0]                    
______________________________________________

2360/2360 - 6s - loss: 0.1400 - accuracy: 0.8233 - val_loss: 0.1008 - val_accuracy: 0.8763
Epoch 2/50
2360/2360 - 3s - loss: 0.0945 - accuracy: 0.8886 - val_loss: 0.0933 - val_accuracy: 0.8932
Epoch 3/50
2360/2360 - 3s - loss: 0.0651 - accuracy: 0.9275 - val_loss: 0.1054 - val_accuracy: 0.8831
Epoch 4/50
2360/2360 - 3s - loss: 0.0554 - accuracy: 0.9386 - val_loss: 0.1212 - val_accuracy: 0.8559
738/1 - 0s - loss: 0.0574 - accuracy: 0.8672
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 200, 300)     1275900     input_7[0][0]                    
______________________________________________

2360/2360 - 8s - loss: 0.1735 - accuracy: 0.8381 - val_loss: 0.1295 - val_accuracy: 0.8797
Epoch 2/50
2360/2360 - 3s - loss: 0.1157 - accuracy: 0.8852 - val_loss: 0.1239 - val_accuracy: 0.8780
Epoch 3/50
2360/2360 - 3s - loss: 0.0800 - accuracy: 0.9271 - val_loss: 0.1379 - val_accuracy: 0.8864
Epoch 4/50
2360/2360 - 3s - loss: 0.0717 - accuracy: 0.9347 - val_loss: 0.1482 - val_accuracy: 0.8695
738/1 - 0s - loss: 0.0648 - accuracy: 0.8821
Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 200, 300)     1275900     input_10[0][0]                   
______________________________________________

2360/2360 - 7s - loss: 0.0806 - accuracy: 0.8517 - val_loss: 0.0982 - val_accuracy: 0.8169
Epoch 2/50
2360/2360 - 3s - loss: 0.0705 - accuracy: 0.8606 - val_loss: 0.0880 - val_accuracy: 0.8119
Epoch 3/50
2360/2360 - 3s - loss: 0.0583 - accuracy: 0.8869 - val_loss: 0.0874 - val_accuracy: 0.8322
Epoch 4/50
2360/2360 - 3s - loss: 0.0474 - accuracy: 0.9068 - val_loss: 0.0921 - val_accuracy: 0.8305
Epoch 5/50
2360/2360 - 3s - loss: 0.0379 - accuracy: 0.9258 - val_loss: 0.0918 - val_accuracy: 0.8102
738/1 - 0s - loss: 0.0417 - accuracy: 0.8238
Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 200

2360/2360 - 9s - loss: 0.1128 - accuracy: 0.8462 - val_loss: 0.1384 - val_accuracy: 0.8373
Epoch 2/50
2360/2360 - 3s - loss: 0.0922 - accuracy: 0.8780 - val_loss: 0.1229 - val_accuracy: 0.8068
Epoch 3/50
2360/2360 - 3s - loss: 0.0745 - accuracy: 0.9034 - val_loss: 0.1222 - val_accuracy: 0.8356
Epoch 4/50
2360/2360 - 3s - loss: 0.0543 - accuracy: 0.9301 - val_loss: 0.1442 - val_accuracy: 0.7746
Epoch 5/50
2360/2360 - 3s - loss: 0.0427 - accuracy: 0.9445 - val_loss: 0.1700 - val_accuracy: 0.8169
738/1 - 0s - loss: 0.0705 - accuracy: 0.8482
Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 200

2360/2360 - 6s - loss: 0.1445 - accuracy: 0.8508 - val_loss: 0.1602 - val_accuracy: 0.8271
Epoch 2/50
2360/2360 - 3s - loss: 0.1185 - accuracy: 0.8780 - val_loss: 0.1624 - val_accuracy: 0.8288
Epoch 3/50
2360/2360 - 3s - loss: 0.0934 - accuracy: 0.9089 - val_loss: 0.1899 - val_accuracy: 0.8322
738/1 - 0s - loss: 0.0841 - accuracy: 0.8333
Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 200, 300)     1275900     input_19[0][0]                   
__________________________________________________________________________________________________
bidirectional_18 (Bidirectional (None, 200, 256)

2360/2360 - 6s - loss: 0.0376 - accuracy: 0.9157 - val_loss: 0.0414 - val_accuracy: 0.9051
Epoch 2/50
2360/2360 - 3s - loss: 0.0344 - accuracy: 0.9212 - val_loss: 0.0414 - val_accuracy: 0.9051
Epoch 3/50
2360/2360 - 3s - loss: 0.0343 - accuracy: 0.9212 - val_loss: 0.0414 - val_accuracy: 0.9051
Epoch 4/50
2360/2360 - 3s - loss: 0.0343 - accuracy: 0.9212 - val_loss: 0.0414 - val_accuracy: 0.9051
Epoch 5/50
2360/2360 - 3s - loss: 0.0343 - accuracy: 0.9212 - val_loss: 0.0415 - val_accuracy: 0.9051
738/1 - 0s - loss: 0.2368 - accuracy: 0.9092
Model: "model_21"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, 200

2360/2360 - 11s - loss: 0.0609 - accuracy: 0.9153 - val_loss: 0.0393 - val_accuracy: 0.9576
Epoch 2/50
2360/2360 - 3s - loss: 0.0264 - accuracy: 0.9597 - val_loss: 0.0305 - val_accuracy: 0.9627
Epoch 3/50
2360/2360 - 3s - loss: 0.0221 - accuracy: 0.9746 - val_loss: 0.0314 - val_accuracy: 0.9644
Epoch 4/50
2360/2360 - 3s - loss: 0.0158 - accuracy: 0.9822 - val_loss: 0.0416 - val_accuracy: 0.9475
738/1 - 0s - loss: 0.0186 - accuracy: 0.9661
Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 200, 300)     1275900     input_25[0][0]                   
____________________________________________

2360/2360 - 6s - loss: 0.0771 - accuracy: 0.9157 - val_loss: 0.0436 - val_accuracy: 0.9610
Epoch 2/50
2360/2360 - 3s - loss: 0.0350 - accuracy: 0.9682 - val_loss: 0.0401 - val_accuracy: 0.9610
Epoch 3/50
2360/2360 - 3s - loss: 0.0219 - accuracy: 0.9809 - val_loss: 0.0410 - val_accuracy: 0.9593
Epoch 4/50
2360/2360 - 3s - loss: 0.0191 - accuracy: 0.9831 - val_loss: 0.0492 - val_accuracy: 0.9559
738/1 - 0s - loss: 0.0527 - accuracy: 0.9743
Model: "model_27"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_27 (Embedding)        (None, 200, 300)     1275900     input_28[0][0]                   
_____________________________________________

2360/2360 - 6s - loss: 0.0972 - accuracy: 0.9203 - val_loss: 0.0533 - val_accuracy: 0.9576
Epoch 2/50
2360/2360 - 3s - loss: 0.0409 - accuracy: 0.9682 - val_loss: 0.0592 - val_accuracy: 0.9542
Epoch 3/50
2360/2360 - 3s - loss: 0.0267 - accuracy: 0.9814 - val_loss: 0.0680 - val_accuracy: 0.9559
738/1 - 0s - loss: 0.0736 - accuracy: 0.9661
Model: "model_30"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 200, 300)     1275900     input_31[0][0]                   
__________________________________________________________________________________________________
bidirectional_30 (Bidirectional (None, 200, 256)

2360/2360 - 7s - loss: 0.0457 - accuracy: 0.9263 - val_loss: 0.0465 - val_accuracy: 0.9271
Epoch 2/50
2360/2360 - 3s - loss: 0.0250 - accuracy: 0.9644 - val_loss: 0.0353 - val_accuracy: 0.9508
Epoch 3/50
2360/2360 - 3s - loss: 0.0157 - accuracy: 0.9742 - val_loss: 0.0359 - val_accuracy: 0.9525
Epoch 4/50
2360/2360 - 3s - loss: 0.0073 - accuracy: 0.9894 - val_loss: 0.0346 - val_accuracy: 0.9508
Epoch 5/50
2360/2360 - 3s - loss: 0.0062 - accuracy: 0.9915 - val_loss: 0.0376 - val_accuracy: 0.9475
Epoch 6/50
2360/2360 - 3s - loss: 0.0053 - accuracy: 0.9919 - val_loss: 0.0375 - val_accuracy: 0.9441
738/1 - 0s - loss: 0.0110 - accuracy: 0.9526
Model: "model_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           [(None, 200)]        0                                            
_______________________________________

2360/2360 - 6s - loss: 0.0620 - accuracy: 0.9343 - val_loss: 0.0563 - val_accuracy: 0.9373
Epoch 2/50
2360/2360 - 3s - loss: 0.0275 - accuracy: 0.9720 - val_loss: 0.0565 - val_accuracy: 0.9407
Epoch 3/50
2360/2360 - 3s - loss: 0.0189 - accuracy: 0.9818 - val_loss: 0.0494 - val_accuracy: 0.9492
Epoch 4/50
2360/2360 - 3s - loss: 0.0106 - accuracy: 0.9881 - val_loss: 0.0635 - val_accuracy: 0.9424
Epoch 5/50
2360/2360 - 3s - loss: 0.0086 - accuracy: 0.9928 - val_loss: 0.0689 - val_accuracy: 0.9305
738/1 - 0s - loss: 0.0188 - accuracy: 0.9675
Model: "model_36"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_36 (Embedding)        (None, 200

2360/2360 - 6s - loss: 0.0806 - accuracy: 0.9424 - val_loss: 0.0680 - val_accuracy: 0.9373
Epoch 2/50
2360/2360 - 3s - loss: 0.0360 - accuracy: 0.9746 - val_loss: 0.0791 - val_accuracy: 0.9356
Epoch 3/50
2360/2360 - 3s - loss: 0.0258 - accuracy: 0.9826 - val_loss: 0.0674 - val_accuracy: 0.9458
Epoch 4/50
2360/2360 - 3s - loss: 0.0161 - accuracy: 0.9898 - val_loss: 0.0778 - val_accuracy: 0.9424
Epoch 5/50
2360/2360 - 3s - loss: 0.0101 - accuracy: 0.9941 - val_loss: 0.0859 - val_accuracy: 0.9492
738/1 - 0s - loss: 0.0242 - accuracy: 0.9702
Model: "model_39"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_40 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_39 (Embedding)        (None, 200

2360/2360 - 7s - loss: 0.0551 - accuracy: 0.8996 - val_loss: 0.0344 - val_accuracy: 0.9492
Epoch 2/50
2360/2360 - 3s - loss: 0.0314 - accuracy: 0.9390 - val_loss: 0.0332 - val_accuracy: 0.9407
Epoch 3/50
2360/2360 - 3s - loss: 0.0268 - accuracy: 0.9525 - val_loss: 0.0279 - val_accuracy: 0.9322
Epoch 4/50
2360/2360 - 3s - loss: 0.0174 - accuracy: 0.9682 - val_loss: 0.0286 - val_accuracy: 0.9407
Epoch 5/50
2360/2360 - 3s - loss: 0.0113 - accuracy: 0.9784 - val_loss: 0.0357 - val_accuracy: 0.9356
738/1 - 0s - loss: 0.0165 - accuracy: 0.9363
Model: "model_42"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_43 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_42 (Embedding)        (None, 200

2360/2360 - 6s - loss: 0.0779 - accuracy: 0.9038 - val_loss: 0.0420 - val_accuracy: 0.9373
Epoch 2/50
2360/2360 - 3s - loss: 0.0363 - accuracy: 0.9538 - val_loss: 0.0401 - val_accuracy: 0.9441
Epoch 3/50
2360/2360 - 3s - loss: 0.0241 - accuracy: 0.9665 - val_loss: 0.0451 - val_accuracy: 0.9475
Epoch 4/50
2360/2360 - 3s - loss: 0.0126 - accuracy: 0.9856 - val_loss: 0.0457 - val_accuracy: 0.9475
738/1 - 0s - loss: 0.0254 - accuracy: 0.9485
Model: "model_45"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_45 (Embedding)        (None, 200, 300)     1275900     input_46[0][0]                   
_____________________________________________

2360/2360 - 6s - loss: 0.1017 - accuracy: 0.9093 - val_loss: 0.0573 - val_accuracy: 0.9441
Epoch 2/50
2360/2360 - 3s - loss: 0.0441 - accuracy: 0.9593 - val_loss: 0.0520 - val_accuracy: 0.9559
Epoch 3/50
2360/2360 - 3s - loss: 0.0235 - accuracy: 0.9792 - val_loss: 0.0631 - val_accuracy: 0.9559
Epoch 4/50
2360/2360 - 3s - loss: 0.0131 - accuracy: 0.9886 - val_loss: 0.0802 - val_accuracy: 0.9407
738/1 - 0s - loss: 0.0368 - accuracy: 0.9485
Model: "model_48"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_48 (Embedding)        (None, 200, 300)     1275900     input_49[0][0]                   
_____________________________________________

2360/2360 - 16s - loss: 0.0443 - accuracy: 0.9051 - val_loss: 0.0444 - val_accuracy: 0.8983
Epoch 2/50
2360/2360 - 3s - loss: 0.0414 - accuracy: 0.9051 - val_loss: 0.0443 - val_accuracy: 0.8983
Epoch 3/50
2360/2360 - 3s - loss: 0.0413 - accuracy: 0.9051 - val_loss: 0.0443 - val_accuracy: 0.8983
Epoch 4/50
2360/2360 - 3s - loss: 0.0413 - accuracy: 0.9051 - val_loss: 0.0443 - val_accuracy: 0.8983
Epoch 5/50
2360/2360 - 3s - loss: 0.0412 - accuracy: 0.9051 - val_loss: 0.0443 - val_accuracy: 0.8983
Epoch 6/50
2360/2360 - 3s - loss: 0.0412 - accuracy: 0.9051 - val_loss: 0.0444 - val_accuracy: 0.8983
738/1 - 0s - loss: 0.0205 - accuracy: 0.9065
Model: "model_51"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_52 (InputLayer)           [(None, 200)]        0                                            
______________________________________

2360/2360 - 7s - loss: 0.0647 - accuracy: 0.9089 - val_loss: 0.0281 - val_accuracy: 0.9627
Epoch 2/50
2360/2360 - 3s - loss: 0.0161 - accuracy: 0.9801 - val_loss: 0.0419 - val_accuracy: 0.9203
Epoch 3/50
2360/2360 - 3s - loss: 0.0100 - accuracy: 0.9873 - val_loss: 0.0211 - val_accuracy: 0.9644
Epoch 4/50
2360/2360 - 3s - loss: 0.0037 - accuracy: 0.9970 - val_loss: 0.0368 - val_accuracy: 0.9542
Epoch 5/50
2360/2360 - 3s - loss: 0.0040 - accuracy: 0.9949 - val_loss: 0.0434 - val_accuracy: 0.9508
738/1 - 0s - loss: 0.0176 - accuracy: 0.9593
Model: "model_54"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_55 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_54 (Embedding)        (None, 200

2360/2360 - 6s - loss: 0.0886 - accuracy: 0.9199 - val_loss: 0.0423 - val_accuracy: 0.9695
Epoch 2/50
2360/2360 - 3s - loss: 0.0247 - accuracy: 0.9780 - val_loss: 0.0373 - val_accuracy: 0.9610
Epoch 3/50
2360/2360 - 3s - loss: 0.0132 - accuracy: 0.9886 - val_loss: 0.0580 - val_accuracy: 0.8983
Epoch 4/50
2360/2360 - 3s - loss: 0.0101 - accuracy: 0.9911 - val_loss: 0.0769 - val_accuracy: 0.9373
738/1 - 0s - loss: 0.0170 - accuracy: 0.9634
Model: "model_57"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_58 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_57 (Embedding)        (None, 200, 300)     1275900     input_58[0][0]                   
_____________________________________________

2360/2360 - 7s - loss: 0.1069 - accuracy: 0.9136 - val_loss: 0.0516 - val_accuracy: 0.9678
Epoch 2/50
2360/2360 - 3s - loss: 0.0282 - accuracy: 0.9788 - val_loss: 0.0420 - val_accuracy: 0.9678
Epoch 3/50
2360/2360 - 3s - loss: 0.0126 - accuracy: 0.9919 - val_loss: 0.0989 - val_accuracy: 0.9576
Epoch 4/50
2360/2360 - 3s - loss: 0.0091 - accuracy: 0.9953 - val_loss: 0.0630 - val_accuracy: 0.9627
738/1 - 0s - loss: 0.0264 - accuracy: 0.9729
Model: "model_60"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_61 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_60 (Embedding)        (None, 200, 300)     1275900     input_61[0][0]                   
_____________________________________________

2360/2360 - 7s - loss: 0.0124 - accuracy: 0.9903 - val_loss: 0.0076 - val_accuracy: 0.9898
Epoch 2/50
2360/2360 - 3s - loss: 0.0045 - accuracy: 0.9936 - val_loss: 0.0020 - val_accuracy: 1.0000
Epoch 3/50
2360/2360 - 3s - loss: 0.0010 - accuracy: 0.9987 - val_loss: 7.7270e-05 - val_accuracy: 1.0000
Epoch 4/50
2360/2360 - 3s - loss: 5.9333e-04 - accuracy: 0.9987 - val_loss: 0.0039 - val_accuracy: 0.9966
Epoch 5/50
2360/2360 - 3s - loss: 7.4235e-04 - accuracy: 0.9996 - val_loss: 0.0032 - val_accuracy: 0.9949
738/1 - 0s - loss: 2.1222e-04 - accuracy: 0.9986
Model: "model_63"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_64 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_63 (Embedding)  

2360/2360 - 7s - loss: 0.0241 - accuracy: 0.9911 - val_loss: 0.0121 - val_accuracy: 0.9898
Epoch 2/50
2360/2360 - 3s - loss: 0.0069 - accuracy: 0.9941 - val_loss: 7.7888e-04 - val_accuracy: 1.0000
Epoch 3/50
2360/2360 - 3s - loss: 0.0037 - accuracy: 0.9966 - val_loss: 8.4005e-04 - val_accuracy: 1.0000
Epoch 4/50
2360/2360 - 3s - loss: 0.0018 - accuracy: 0.9992 - val_loss: 0.0030 - val_accuracy: 0.9966
738/1 - 0s - loss: 3.8085e-04 - accuracy: 1.0000
Model: "model_66"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_67 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_66 (Embedding)        (None, 200, 300)     1275900     input_67[0][0]                   
_________________________________

In [None]:
try:
    result = []
    for i in range(12):
        result.append(best_score[i])
    save_result([result],'bilstm_doc2vec_embed300_len200_best_score',run_num=1)
except:
    !sh /root/shutdown.sh

In [None]:
try:  
    embedding_dim = 300
    logdir = '../output/cnn_multi_label_callbacks'
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)#,
                #tfc.python.keras.callbacks.LearningRateScheduler(scheduler)
                ]
    best_score = []
    for i in range(12):
        F1 = []
        p = []
        r = []
        for weight in np.linspace(0.1,0.6,10,endpoint=False):
            model = TextCNN_model(word_index,embedding_dim,max_length, embedding_matrix, weight)
            model.summary()
            history = model.fit(x_train,
                                y_train[i],
                                epochs=50,
                                validation_split=0.2,
                                callbacks=callbacks,
                                verbose=2)
            model.evaluate(x_test, y_test[i], verbose=2)
            for threshold in np.linspace(0.2,0.8,100,endpoint=False):
                result = single_f1(model.predict(x_test), y_test[i], 'sigmoid', threshold)
                F1.append([result[0], result[1], result[2]])
        max_F1 = 0
        max_p = 0
        max_r = 0
        for score in F1:
            if score[0] > max_F1:
                max_F1 = score[0]
                max_p = score[1]
                max_r = score[2]
        best_score.append([max_F1, max_p, max_r])  
    result = []
    for i in range(12):
        result.append(best_score[i])
    save_result([result],'textcnn_doc2vec_embed300_len200_best_score',run_num=1)
except:
    !sh /root/shutdown.sh

In [13]:
def single_f1(y_pred, y_label, activation='softmax', threshold=0.5):
    """Compute F1, precision and recall for one binary label.

    Args:
        y_pred: Model outputs. For ``'softmax'`` each row holds two scores and
            the row is predicted positive unless column 0 is strictly greater
            than column 1. For ``'sigmoid'`` each entry is one score and it is
            predicted positive unless it is strictly below ``threshold``.
        y_label: Ground-truth 0/1 labels, one per sample. Either an object
            exposing ``.numpy()`` (e.g. a TensorFlow tensor) or any array-like.
        activation: ``'softmax'`` or ``'sigmoid'``.
        threshold: Decision threshold, used by ``'sigmoid'`` only.

    Returns:
        ``[f1, precision, recall, tp, fp, fn]`` as float32 scalars.

    Raises:
        ValueError: If ``activation`` is not supported, or if the number of
            predictions differs from the number of labels.
    """
    if activation not in ('softmax', 'sigmoid'):
        raise ValueError(
            "activation must be 'softmax' or 'sigmoid', got %r" % (activation,))

    labels = y_label.numpy() if hasattr(y_label, 'numpy') else y_label
    y_true = np.asarray(labels, dtype=np.float32).reshape(-1)

    scores = np.asarray(y_pred)
    if scores.size == 0:
        is_negative = np.zeros(0, dtype=bool)
    elif activation == 'softmax':
        is_negative = scores[:, 0] > scores[:, 1]
    else:
        is_negative = scores.reshape(-1) < threshold
    # Anything not flagged negative counts as positive (ties and NaN included).
    y_hat = np.where(is_negative, 0.0, 1.0).astype(np.float32)

    if y_hat.shape != y_true.shape:
        raise ValueError(
            'got %d predictions for %d labels' % (y_hat.size, y_true.size))

    # Keep every operand in float32 so the result type is always float32.
    one = np.float32(1.0)
    # epsilon keeps the denominators non-zero when a count is zero.
    epsilon = np.float32(1e-7)

    tp = np.dot(y_true, y_hat)
    fp = np.dot(one - y_true, y_hat)
    fn = np.dot(y_true, one - y_hat)

    p = tp / (tp + fp + epsilon)
    r = tp / (tp + fn + epsilon)
    f1 = np.float32(2.0) * p * r / (p + r + epsilon)

    return [f1, p, r, tp, fp, fn]

In [59]:
# Per-label evaluation report: one PrettyTable row per label plus a CSV with
# "F1/Precision/Recall" strings, followed by the unweighted mean of the three.
# NOTE(review): `pt` (presumably `prettytable`) and `models` are not defined in
# the visible part of the notebook -- confirm they exist on a fresh kernel.
type_dic = {'Introductory/Generic': 0, 'Practice not covered': 1,
            'Privacy contact information': 2, 'User Access, Edit and Deletion': 3,
            'Data Security': 4, 'International and Specific Audiences': 5,
            'Do Not Track': 6, 'User Choice/Control': 7,
            'Data Retention': 8, 'Policy Change': 9,
            'First Party Collection/Use': 10, 'Third Party Sharing/Collection': 11}
tb = pt.PrettyTable()
tb.field_names = ["label", "F1", "Precision", "recall", "TP", "FP", "FN"]
table_result = []
all_F1 = 0
all_P = 0
all_R = 0
# Labels are visited in insertion order, which matches their 0..11 values, so
# the running position doubles as the index into `models` and `y_test`.
for index, key in enumerate(type_dic):
    result = single_f1(models[index].predict(x_test), y_test[index], 'sigmoid', 0.5)
    tb.add_row([key, result[0], result[1], result[2], result[3], result[4], result[5]])
    table_result.append(['/'.join('%.2f' % e for e in result[:3])])
    all_F1 = all_F1+result[0]
    all_P = all_P+result[1]
    all_R = all_R+result[2]
print(tb)
# Macro average over however many labels the table defines.
num_labels = len(type_dic)
table_result.append(['/'.join('%.2f' % e for e in
                              [all_F1/num_labels, all_P/num_labels, all_R/num_labels])])
# Build the path with os.path.join: the original backslash literal only names a
# sub-directory on Windows; elsewhere it creates a file called "result\...".
out_path = os.path.join('result', 'cnn_word2vec_opp_100_unnormal.csv')
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'w', encoding='gbk', newline='') as f:
    writer = csv.writer(f, dialect=csv.excel, delimiter=',')
    writer.writerows(table_result)

+--------------------------------------+------------+-----------+------------+-------+-------+------+
|                label                 |     F1     | Precision |   recall   |   TP  |   FP  |  FN  |
+--------------------------------------+------------+-----------+------------+-------+-------+------+
|         Introductory/Generic         | 0.48888886 | 0.6197183 | 0.40366971 |  44.0 |  27.0 | 65.0 |
|         Practice not covered         |  0.402439  | 0.7173913 | 0.27966103 |  33.0 |  13.0 | 85.0 |
|     Privacy contact information      | 0.7480915  |  0.765625 | 0.73134327 |  49.0 |  15.0 | 18.0 |
|    User Access, Edit and Deletion    | 0.6595744  | 0.6458333 | 0.67391306 |  31.0 |  17.0 | 15.0 |
|            Data Security             | 0.64596266 |    0.65   | 0.6419753  |  52.0 |  28.0 | 29.0 |
| International and Specific Audiences | 0.79699236 |  0.828125 | 0.76811594 |  53.0 |  11.0 | 16.0 |
|             Do Not Track             |    1.0     |    1.0    |    1.0     |  5.