# 导库

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

from tqdm import tqdm

import pandas as pd
import numpy as np
import random

import os

from sklearn.metrics import roc_auc_score

# 导入自己的库
from Util.utils import get_device, seed_everything
from Util.SST2_data import load_sst2
from ModelHandler import *


In [2]:
# get_device() is a project helper (Util.utils); it returns an indexable
# collection of devices plus a GPU count.
device_all, n_gpu=get_device()

# Select the entry at index 1 of device_all (the recorded output shows cuda:1).
# NOTE(review): the original comment said "use the first GPU", but index 1 is the
# second entry of the list - confirm which device is actually intended.
device = device_all[1]
print(device, n_gpu)

device is cuda, # cuda is:  4
cuda:1 4


In [3]:
# Set the random seeds (seed_everything is a project helper from Util.utils).
seed_everything()
# Bare expression: the notebook displays one draw from `random`, which serves as
# a quick visual check after seeding.
random.randint(1,10)

3

# 加载 SST2 数据

In [4]:
# SST-2 data preparation

# Text field: spaCy tokenization, lower-casing, sequences fixed to 40 tokens and
# batch-first layout. include_lengths=True makes each text batch a pair of tensors
# (token ids, lengths) instead of a single tensor.
text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True,fix_length=40, batch_first=True)
# Label field: labels are stored as torch.long tensors.
label_field = data.LabelField(dtype=torch.long)

In [5]:
# Directory that holds the SST-2 TSV splits.
BASE_PATH = "sst2_data/"

# Read the three tab-separated splits in train / dev / test order.
train_pd, dev_pd, test_pd = (
    pd.read_csv(BASE_PATH + split_file, sep='\t')
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
)

# Report (rows, columns) of each split, one per line.
for _frame in (train_pd, dev_pd, test_pd):
    print(_frame.shape)

(67349, 2)
(872, 2)
(1821, 2)


In [6]:
# Build the train / dev / test iterators through the project helper load_sst2
# (Util.SST2_data), handing it the fields defined above, the batch size and the
# locations of the GloVe vectors and their cache directory.
batch_size = 128
# NOTE(review): both paths below are absolute and specific to one machine; the
# notebook will not run elsewhere until they are adjusted.
embedding_file = '/media/HDD5/ddz/androidvuldetection/.vector_cache/glove.6B.100d.txt'
cache_file = '/media/HDD5/ddz/androidvuldetection/.vector_cache/'
train_iter, dev_iter, test_iter = load_sst2(BASE_PATH, text_field, label_field, batch_size, 
                                            embedding_file,cache_file)


the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1


In [7]:
# Inspect the data: look at the first training batch only.

# Each batch unpacks into (X_train_var, y_train_var); X_train_var is itself
# indexable and its first element holds the token-id tensor whose shape is printed.
for batch_idx, (X_train_var, y_train_var) in enumerate(train_iter):
    print(batch_idx, X_train_var[0].shape, y_train_var.shape)
    print(X_train_var)
    # Stop after the first batch - this cell is only a sanity check.
    break

0 torch.Size([128, 40]) torch.Size([128])
(tensor([[  32,   30, 3736,  ...,    1,    1,    1],
        [  29,  437,    9,  ...,    1,    1,    1],
        [   8, 1999,  215,  ...,    1,    1,    1],
        ...,
        [   2,   21,   10,  ...,    1,    1,    1],
        [  11,   28,   15,  ...,    1,    1,    1],
        [  92,   64, 8054,  ...,    1,    1,    1]]), tensor([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7]))


# 网络结构

In [None]:
# Notes:
# 1. Shape-manipulation helpers: view(), permute(), size(), torch.squeeze() / torch.unsqueeze().
# 2. Two ways to load pretrained vectors into an Embedding layer: 1) copy, 2) from_pretrained.

class Enet(nn.Module):
    """Embedding -> 3-layer unidirectional LSTM -> linear layer -> softmax.

    Args:
        pretrained_embeddings: 2-D float tensor of pretrained word vectors handed
            to ``nn.Embedding.from_pretrained``. Its second dimension must be 100
            to match the LSTM input size. The embeddings stay trainable
            (``freeze=False``).
    """

    def __init__(self,pretrained_embeddings):
        super(Enet, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # nn.LSTM(input_size, hidden_size, num_layers):
        #   input_size  - number of features per input step (embedding size, 100)
        #   hidden_size - number of features in the hidden state (64)
        #   num_layers  - number of stacked LSTM layers (3)
        self.lstm = nn.LSTM(100,64,3,batch_first=True)
        self.linear = nn.Linear(64,2)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: pair ``(text, lengths)``. ``text`` is a ``(batch, seq_len)`` integer
                tensor of token ids; ``lengths`` holds the number of real
                (non-padding) tokens per row, or ``None`` to use the final time
                step of every row.

        Returns:
            ``(batch, 2)`` tensor of softmax probabilities.
        """
        text, lengths = x
        vec = self.embedding(text)       # (batch, seq_len, 100)
        out, _ = self.lstm(vec)          # (batch, seq_len, 64)

        if lengths is None:
            last = out[:, -1, :]
        else:
            # Read the LSTM output at the last *real* token of each row. Taking
            # out[:, -1, :] unconditionally would return the state reached after
            # feeding the padding tokens, so the prediction would depend on how
            # much padding a sentence received. Because the LSTM is
            # unidirectional, the output at position lengths-1 only depends on
            # the real tokens.
            last_idx = torch.as_tensor(lengths, device=out.device).long()
            # Clamp so empty rows and over-long lengths still index a valid step.
            last_idx = last_idx.clamp(min=1, max=out.size(1)) - 1
            rows = torch.arange(out.size(0), device=out.device)
            last = out[rows, last_idx]   # (batch, 64)

        logits = self.linear(last)       # (batch, 2)
        return F.softmax(logits, -1)
    

In [None]:
# Smoke-test the model on its own: push a single training batch through Enet.
pretrained_embeddings = text_field.vocab.vectors
net = Enet(pretrained_embeddings)
for i in train_iter:
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    net(i.text)
    break


# # Inspect the network structure; input_size=(channels, H, W)
# from torchsummary import summary
# summary(net,input_size=(40,100))

## TextCNN 模型

In [None]:
from model.TextCNN import TextCNN


In [None]:
# Smoke-test the model on its own: push a single training batch through TextCNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
n_filters = 64 # number of convolution kernels
filter_sizes=[2,3,4]
dropout=0.7
model = TextCNN(embedding_dim, n_filters, filter_sizes, output_dim, pretrained_embeddings,dropout)
for i in train_iter:
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    model(i.text)
    break


## TextRNN 模型

In [None]:
from model.TextRNN import TextRNN


In [None]:
# Smoke-test the model on its own: push a single training batch through TextRNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 64
num_layers=5
bidirectional = True
dropout = 0.5
model = TextRNN(embedding_dim,output_dim,hidden_size,num_layers,bidirectional,pretrained_embeddings,dropout)
for i in train_iter:
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    model(i.text)
    break


## TextRNN_Attention 模型

In [None]:
from model.TextRNN_Attention import TextRNN_Attention


In [None]:

# Smoke-test the model on its own: push a single training batch through
# TextRNN_Attention on the selected device and print its "ws" parameter.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers=5
bidirectional = True
dropout=0.5
model = TextRNN_Attention(embedding_dim,output_dim,hidden_size,num_layers,
                          bidirectional,pretrained_embeddings, device,dropout).to(device)

for i in train_iter:
    x = i.text
    y = i.label
    # Move every tensor of the text batch to the device. A tuple is used instead
    # of a generator expression: a generator can be consumed only once and cannot
    # be indexed, so it would break as soon as the model reads its input twice.
    x = tuple(j.to(device) for j in x)
    # Call the module itself instead of .forward() so registered hooks run.
    model(x)

    # Snapshot every parameter as a NumPy array, keyed by parameter name.
    parm = {name: parameters.detach().cpu().numpy()
            for name, parameters in model.named_parameters()}
    # Show the parameter named "ws" when the model has one.
    if "ws" in parm:
        print("ws", parm["ws"])
    break


## TextRCNN 模型

In [None]:
from model.TextRCNN import TextRCNN


In [None]:

# Smoke-test the model on its own: push a single training batch through TextRCNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 64
num_layers=5
bidirectional = True
dropout=0.5
model = TextRCNN(embedding_dim,output_dim,hidden_size,num_layers,bidirectional,pretrained_embeddings,dropout)
for i in train_iter:
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    model(i.text)
    break

## Transformer 模型

In [8]:
from model.Text_Transformer import Text_Transformer


In [9]:
# Smoke-test the model on its own: push a single training batch through
# Text_Transformer. The `model` built here is the one the training cell below
# picks up.
pretrained_embeddings = text_field.vocab.vectors
head = 8
n_layer = 6
emd_dim =100
d_model = 512
d_ff = 1024
output_dim = 2
dropout=0.1

model = Text_Transformer(head,n_layer,emd_dim,d_model,d_ff,output_dim,dropout,pretrained_embeddings)
for i in train_iter:
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    model(i.text)
    break



# 训练验证

In [10]:
%%time
# seed_everything()

# Both loaders use batches of 128 examples.
train_batch_size = val_batch_size = 2**7

pretrained_embeddings = text_field.vocab.vectors

# Settings handed to the project's ModelHandler. `model` is whichever model the
# most recently executed cell assigned to that name.
modelHandlerParams = {
    'epoch_num': 1000000,
    'train_batch_size': train_batch_size,
    'val_batch_size': val_batch_size,
    'device': device,
    'model': model,
}
modelHandler = ModelHandler(modelHandlerParams)

model.to(device)
# Binary cross-entropy computed on logits.
loss_fn = nn.BCEWithLogitsLoss().to(device)
# Tuning note: learning rates 0.1, 0.01 and 0.001 were tried; 0.01 worked best.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.01,              # learning rate of the optimizer
    weight_decay=0.0001,
)

# Train with validation on the dev iterator.
modelHandler.fit(
    train_iter=train_iter,
    val_iter=dev_iter,
    loss_fn=loss_fn,
    optimizer=optimizer,
    early_stopping_rounds=10,
    verbose=2,
)

************************* epoch: 0 *************************
train auc: 0.7742723919702158
train loss: 0.5673074571864655
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.7527926278764164
val_loss: 0.7208619969231742
************************* epoch: 1 *************************
train auc: 0.852260686439005
train loss: 0.47837434926793065
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.7293584061885429
val_loss: 0.6211611543382917
************************* epoch: 2 *************************
train auc: 0.8800520734112616
train loss: 0.43105642932981425
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.77902163166687
val_loss: 0.6573755059923444
************************* epoch: 3 *************************
train auc: 0.8916047769591671
train loss: 0.41090026774940275
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.772788585539259
val_loss: 0.633748744215284
************************* epoch: 4 *******************

val_auc: 0.7729971240086112
val_loss: 0.6194078751972744
************************* epoch: 35 *************************
train auc: 0.8969894502654823
train loss: 0.41730954250642427
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.7817560945283406
val_loss: 0.6836584040096828
************************* epoch: 36 *************************
train auc: 0.9150062400304086
train loss: 0.3751266386070107
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.8243873882844632
val_loss: 0.5413956429277148
************************* epoch: 37 *************************
train auc: 0.924150334410145
train loss: 0.3559653623280534
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.8196669151707902
val_loss: 0.6070291016783033
************************* epoch: 38 *************************
train auc: 0.9227525906097996
train loss: 0.35629059812601876
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.8224789882211915
val_loss: 0.5208652