In [2]:
import pandas as pd
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Conv1D, SpatialDropout1D
from keras.layers import add, Dropout, PReLU, BatchNormalization, GlobalMaxPooling1D
from keras import optimizers
from keras import initializers, regularizers, constraints, callbacks

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data set. Only the first 100 rows of every file are kept.
# The 'ChatGPT回答' column supplies the input texts; the y_* frames are used as targets.
X_train = pd.read_csv('data/X_train_minmaxscaler.csv')['ChatGPT回答'].head(100)
X_test = pd.read_csv('data/X_test_minmaxscaler.csv')['ChatGPT回答'].head(100)
y_train = pd.read_csv('data/y_train_minmaxscaler.csv').head(100)
y_test = pd.read_csv('data/y_test_minmaxscaler.csv').head(100)

In [4]:
# Preprocess the text: word segmentation -> integer ids -> fixed-length sequences.

def cw(x):
    """Segment one text with jieba and return its tokens as a list."""
    return list(jieba.cut(x))


X_train_text = X_train.apply(cw)
X_test_text = X_test.apply(cw)

tokenizer = Tokenizer()
# Build the vocabulary from the TRAINING texts only. Fitting on the test texts as
# well would let test-set vocabulary and word frequencies leak into preprocessing.
# No oov_token is configured, so test words that never occur in the training
# texts are dropped by texts_to_sequences.
tokenizer.fit_on_texts(X_train_text)
vocab = tokenizer.word_index  # word -> integer id; more frequent words get smaller ids

# Replace every token by its integer id.
X_train_word_ids = tokenizer.texts_to_sequences(X_train_text)
X_test_word_ids = tokenizer.texts_to_sequences(X_test_text)

# Samples differ in length, so force every sequence to exactly 256 ids: longer
# ones are truncated and shorter ones are zero-padded (with pad_sequences'
# defaults both happen at the front). 256 must match the model's input length.
X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=256)
X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=256)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\mi\AppData\Local\Temp\jieba.cache
Loading model cost 1.264 seconds.
Prefix dict has been built successfully.


In [None]:
#Smote不均衡采样
from imblearn.over_sampling import SMOTE
def smote(X_t, y_t):
    """Oversample (X_t, y_t) with SMOTE and return the resampled pair.

    The sampler is created with a fixed ``random_state=10`` on every call.
    """
    sampler = SMOTE(random_state=10)
    resampled_X, resampled_y = sampler.fit_resample(X_t, y_t)
    return resampled_X, resampled_y

In [None]:
#smote采样
# X_train_padded_seqs, y_train = smote(X_train_padded_seqs, y_train)
# X_test_padded_seqs, y_test = smote(X_test_padded_seqs, y_test)

In [19]:
# DPCNN-style model: embedding -> three residual convolution blocks ->
# global max pooling -> dense head with a single sigmoid output.
filter_nr = 64          # number of filters in every Conv1D
filter_size = 3         # kernel size of the convolutions inside the blocks
max_pool_size = 3
max_pool_strides = 2
spatial_dropout = 0
dense_dropout = 0.3
train_embed = False     # whether the embedding weights are updated during training
conv_kern_reg = regularizers.l2(0.00001)
conv_bias_reg = regularizers.l2(0.00001)


def _conv_bn_prelu(x):
    """Apply Conv1D (linear activation, 'same' padding) -> BatchNormalization -> PReLU."""
    x = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear',
               kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(x)
    x = BatchNormalization()(x)
    return PReLU()(x)


def _residual_branch(x):
    """Stack two Conv1D -> BatchNormalization -> PReLU units (the body of one block)."""
    return _conv_bn_prelu(_conv_bn_prelu(x))


def _downsample(x):
    """Max-pool along the sequence axis with the configured pool size and stride."""
    return MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(x)


# The input carries integer token ids for the Embedding lookup, so declare an
# integer dtype instead of float64.
comment = Input(shape=(256,), dtype='int32')
# NOTE(review): no pretrained weights are passed to the Embedding, so with
# train_embed = False it stays at its random initialisation - confirm this is intended.
emb_comment = Embedding(len(vocab) + 1, 300, input_length=256, trainable=train_embed)(comment)
emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)

block1 = _residual_branch(emb_comment)

# The embedded input goes through a kernel-size-1 convolution so that it has the
# same number of channels as the block output and the two can be added. With
# filter_nr equal to the embedding size (300) this projection would be unnecessary.
resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear',
                    kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
resize_emb = PReLU()(resize_emb)

block1_output = add([block1, resize_emb])
block1_output = _downsample(block1_output)

block2 = _residual_branch(block1_output)
block2_output = add([block2, block1_output])
block2_output = _downsample(block2_output)

block3 = _residual_branch(block2_output)
# The last block is not pooled; it feeds the global max pooling directly.
# To deepen the network, pool block3_output with _downsample and append further
# residual blocks following the block2/block3 pattern above.
block3_output = add([block3, block2_output])

output = GlobalMaxPooling1D()(block3_output)

# Dense head: 128 units -> BatchNormalization -> PReLU -> dropout -> sigmoid score.
output = Dense(128, activation='linear')(output)
output = BatchNormalization()(output)
output = PReLU()(output)
output = Dropout(dense_dropout)(output)
output = Dense(1, activation='sigmoid')(output)

model = Model(comment, output)

In [20]:
# Compile with Adam at its default settings, mean squared error as the loss and
# accuracy as the reported metric.
# NOTE(review): the model ends in a single sigmoid unit; if the targets are 0/1
# labels, 'binary_crossentropy' is the usual loss - confirm that MSE is intended.
model.compile(
    optimizer=optimizers.Adam(),
    loss='mean_squared_error',
    metrics=['accuracy'],
)
            
# Xtrain, Xval, ytrain, yval = train_test_split(x_train, y_train, train_size=0.95, random_state=233)

In [21]:
def on_epoch_end(epoch, logs):
    """Print a one-line summary of the metrics reported at the end of an epoch.

    Args:
        epoch: Zero-based epoch index; it is displayed as ``epoch + 1``.
        logs: Mapping from metric name to value. The keys ``loss``, ``val_loss``,
            ``accuracy`` and ``val_accuracy`` are read. ``logs`` may be ``None``
            and any key may be absent (e.g. no validation data was supplied);
            a missing value is printed as ``n/a`` instead of raising.

    Returns:
        None. The summary line and a blank separator are written to stdout.
    """
    logs = logs or {}

    def fmt(key):
        # Formatting None with ':.4f' raises TypeError, so missing metrics get a placeholder.
        value = logs.get(key)
        return 'n/a' if value is None else f'{value:.4f}'

    print(f'Epoch {epoch + 1}, Loss: {fmt("loss")}, Val Loss: {fmt("val_loss")}, '
          f'Acc: {fmt("accuracy")}, Val Acc: {fmt("val_accuracy")}')
    print("\n")

In [22]:
# Import from the public Keras namespace; `keras.src` is an internal package path
# that is not part of the supported API.
from keras.callbacks import LambdaCallback

# lr = callbacks.LearningRateScheduler(schedule)
# NOTE(review): `lr` is assigned here but nothing in this cell passes it to the
# optimizer or to a callback - confirm whether a 1e-4 learning rate was intended.
lr=0.0001
# Print the per-epoch summary produced by on_epoch_end after every epoch.
callback = LambdaCallback(on_epoch_end=on_epoch_end)
# ra_val = RocAucEvaluation(validation_data=(Xval, yval), interval = 1)
# validation_split=0.2 holds out 20% of the training samples for validation.
history=model.fit(X_train_padded_seqs, y_train, batch_size=32, epochs=32, validation_split=0.2, callbacks = [callback] ,verbose=1)

Epoch 1/32
Epoch 1, Loss: 0.3498, Val Loss: 0.2532, Acc: 0.4000, Val Acc: 0.6500


Epoch 2/32
Epoch 2, Loss: 0.2322, Val Loss: 0.2488, Acc: 0.6500, Val Acc: 0.6500


Epoch 3/32
Epoch 3, Loss: 0.1549, Val Loss: 0.2405, Acc: 0.7875, Val Acc: 0.6500


Epoch 4/32
Epoch 4, Loss: 0.0914, Val Loss: 0.2334, Acc: 0.9625, Val Acc: 0.6500


Epoch 5/32
Epoch 5, Loss: 0.0613, Val Loss: 0.2318, Acc: 0.9875, Val Acc: 0.6500


Epoch 6/32
Epoch 6, Loss: 0.0527, Val Loss: 0.2382, Acc: 0.9750, Val Acc: 0.6500


Epoch 7/32
Epoch 7, Loss: 0.0410, Val Loss: 0.2516, Acc: 0.9875, Val Acc: 0.6500


Epoch 8/32
Epoch 8, Loss: 0.0280, Val Loss: 0.2693, Acc: 1.0000, Val Acc: 0.6500


Epoch 9/32
Epoch 9, Loss: 0.0192, Val Loss: 0.2869, Acc: 1.0000, Val Acc: 0.6500


Epoch 10/32
Epoch 10, Loss: 0.0202, Val Loss: 0.3032, Acc: 1.0000, Val Acc: 0.6500


Epoch 11/32
Epoch 11, Loss: 0.0169, Val Loss: 0.3168, Acc: 1.0000, Val Acc: 0.6500


Epoch 12/32
Epoch 12, Loss: 0.0150, Val Loss: 0.3275, Acc: 1.0000, Val Acc: 0.6500


In [None]:
# 保存整个模型到一个HDF5文件  
# model.save('DPCNN_model.h5')  

In [23]:
# Evaluate the model on the test set; `score` is indexed as [loss, accuracy].
score = model.evaluate(X_test_padded_seqs, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.2153167724609375
Test accuracy: 0.7900000214576721


In [None]:
 # history.history 字典将包含每个epoch的loss和val_loss值  
# history.history maps each metric name to its list of per-epoch values.
epoch_metrics = history.history
loss = epoch_metrics['loss']
val_loss = epoch_metrics['val_loss']

# Draw the training and validation loss curves on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(loss, label='Training Loss')
ax.plot(val_loss, label='Validation Loss')
ax.set_title('Loss Curve')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
# Save the loss curve
# plt.savefig('DPCNN_loss_curve.png')
plt.show()

In [None]:
# Plot the training and validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[metric_key])
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='lower right')
# plt.savefig('DPCNN_accuracy.png')
plt.show()

In [None]:
#DPCNN模型的评价
def evaluate_DPCNN_model(model, X_test, y_test, threshold=0.5):
    """Score a fitted binary classifier on a test set.

    Args:
        model: Object whose ``predict(X_test)`` returns one positive-class score
            per sample.
        X_test: Model inputs; forwarded unchanged to ``model.predict``.
        y_test: True binary labels, one per sample (array-like or a
            single-column DataFrame).
        threshold: Scores strictly greater than this value are labelled 1, all
            others 0. Defaults to 0.5, the previously hard-coded cut-off.

    Returns:
        list: ``[accuracy, precision, recall, f1, auc, fpr, tpr]`` where ``fpr``
        and ``tpr`` are the ROC curve coordinates converted to plain lists.
    """
    # Flatten labels and scores to 1-D so every metric receives arrays of the
    # same shape instead of relying on implicit squeezing of (n, 1) inputs.
    y_true = np.asarray(y_test).ravel()
    y_pred_proba = np.asarray(model.predict(X_test)).ravel()

    # Hard labels from the scores.
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    # Threshold-dependent metrics use the hard labels; AUC and the ROC curve
    # use the raw scores.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)

    # The thresholds returned by roc_curve are not needed.
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)

    return [accuracy, precision, recall, f1, auc, fpr.tolist(), tpr.tolist()]

In [None]:
import json
# # 加载模型
# def load_model(model_name):
#     model = tf.keras.models.load_model(model_name)
#     return model

# Names for the values returned by evaluate_DPCNN_model, in the order it returns them.
metrics_name = ['accuracy', 'precision', 'recall', 'f1-score','auc','fpr-score','tpr-score']

# Compute the metric values on the test set and store them in a dictionary
# keyed by metric name.
metrics = evaluate_DPCNN_model(model, tf.convert_to_tensor(X_test_padded_seqs), y_test)
DPCNN_metrics_dict = dict(zip(metrics_name, metrics))

In [None]:
# Persist the metrics dictionary as a JSON file.
with open('DPCNN_metrics_dict.json', 'w') as out_file:
    json.dump(DPCNN_metrics_dict, out_file)