# 导库

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

from tqdm import tqdm

import pandas as pd
import numpy as np
import random

import os

from sklearn.metrics import roc_auc_score

# 导入自己的库
from Util.utils import get_device, seed_everything
from Util.SST2_data import load_sst2
from ModelHandler import *


In [2]:
# Ask the project helper (Util.utils.get_device) for the compute device and the
# GPU count, then echo both so the notebook records what this run used.
device, n_gpu=get_device()
print(device, n_gpu)


device is cpu, not recommend
cpu 0


In [3]:

# Call the project seeding helper with its default arguments.
# NOTE(review): presumably this seeds Python's `random` (and likely NumPy/torch) --
# confirm in Util.utils.
seed_everything()
# Draw one value as the cell's displayed result; a stable number across kernel
# restarts is a quick visual check that seeding took effect.
random.randint(1,10)

3

# 加载 SST2 数据

In [4]:
# SST-2 data preparation: declare how the text and label columns are processed.

# Text column: spaCy tokenisation, lower-cased, every example padded/truncated to
# 40 tokens, batch-first tensors, and the sequence lengths returned next to the ids.
text_field = data.Field(
    tokenize='spacy',
    lower=True,
    include_lengths=True,
    fix_length=40,
    batch_first=True,
)

# Label column: class indices stored as int64 tensors.
label_field = data.LabelField(dtype=torch.long)

In [5]:
BASE_PATH = "sst2_data/"

# Read the three tab-separated SST-2 splits in train / dev / test order.
train_pd, dev_pd, test_pd = [
    pd.read_csv(BASE_PATH + file_name, sep='\t')
    for file_name in ('train.tsv', 'dev.tsv', 'test.tsv')
]

# Report (rows, columns) for each split as a quick sanity check.
for split_frame in (train_pd, dev_pd, test_pd):
    print(split_frame.shape)

(67349, 2)
(872, 2)
(1821, 2)


In [6]:
# Batch size handed to the project loader below.
batch_size = 128
# Build the train / dev / test iterators with the project helper
# (Util.SST2_data.load_sst2), using the fields declared above and the
# device selected earlier.
train_iter, dev_iter, test_iter = load_sst2(BASE_PATH, text_field, label_field, batch_size, device)


the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1


In [7]:
# Inspect the data: look at the first training batch only.
for batch_idx, first_batch in enumerate(train_iter):
    # A batch unpacks into (inputs, targets); because the text field was built with
    # include_lengths=True, the inputs are themselves a (token_ids, lengths) pair.
    X_train_var, y_train_var = first_batch
    print(batch_idx, X_train_var[0].shape, y_train_var.shape)
    print(X_train_var)
    break

0 torch.Size([128, 40]) torch.Size([128])
(tensor([[  32,   30, 3736,  ...,    1,    1,    1],
        [  29,  437,    9,  ...,    1,    1,    1],
        [   8, 1999,  215,  ...,    1,    1,    1],
        ...,
        [   2,   21,   10,  ...,    1,    1,    1],
        [  11,   28,   15,  ...,    1,    1,    1],
        [  92,   64, 8054,  ...,    1,    1,    1]]), tensor([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7]))


# 网络结构

In [None]:
# 1.与维度变换相关函数 view()，permute()，size(), torch.squeeze() / torch.unsqueeze()
# 2.Embedding层加载预训练模型的方式：1）copy，2）from_pretrained。

class Enet(nn.Module):
    """LSTM sentence classifier built on pretrained word embeddings.

    Pipeline: embedding lookup -> 3-layer unidirectional LSTM (hidden size 64)
    -> linear projection of the last time step -> softmax over 2 classes.

    Args:
        pretrained_embeddings: 2-D float tensor ``(vocab_size, emb_dim)`` used to
            initialise the embedding table. The table is fine-tuned during
            training (``freeze=False``).
    """

    def __init__(self, pretrained_embeddings):
        super(Enet, self).__init__()
        # Pretrained vectors can be loaded either by copying them into an existing
        # nn.Embedding weight or via ``from_pretrained``; the latter is used here.
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # nn.LSTM(input_size, hidden_size, num_layers):
        #   input_size  - number of features per time step
        #   hidden_size - number of features in the hidden state
        #   num_layers  - number of stacked LSTM layers
        # input_size is read from the embedding table rather than hard-coded to 100,
        # so embedding tables of any width work.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, 64, 3, batch_first=True)
        self.linear = nn.Linear(64, 2)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: pair ``(text, lengths)``. ``text`` is a ``(batch, sent_len)`` tensor
                of token ids; ``lengths`` is accepted but not used.

        Returns:
            Tensor of shape ``(batch, 2)`` holding class probabilities
            (each row sums to 1).
        """
        text, _ = x
        vec = self.embedding(text)            # (batch, sent_len, emb_dim)
        out, _ = self.lstm(vec)               # (batch, sent_len, 64)
        # NOTE(review): the last position is used regardless of the true sequence
        # length, so for padded inputs this may be a padding step -- confirm intended.
        out = self.linear(out[:, -1, :])      # (batch, 2)
        # NOTE(review): the output is already softmax-normalised; pairing it with a
        # loss that applies its own softmax/sigmoid would normalise twice -- confirm
        # against the training loop.
        return F.softmax(out, -1)
    

In [None]:
# Stand-alone smoke test: push a single training batch through the network.
pretrained_embeddings = text_field.vocab.vectors
net = Enet(pretrained_embeddings)
for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    net(batch.text)
    break


# # 查看网络结构 input_size=(channels, H, W)
# from torchsummary import summary
# summary(net,input_size=(40,100))

## TextCNN 模型

In [None]:
from model.TextCNN import TextCNN


In [None]:
# Stand-alone smoke test: push a single training batch through TextCNN.
pretrained_embeddings = text_field.vocab.vectors
# NOTE(review): positional arguments are kept as-is; their meaning is defined by
# model.TextCNN (presumably embedding dim, filter count, kernel sizes, classes) -- confirm.
model = TextCNN(100, 100, [3, 4, 5], 2, pretrained_embeddings)
for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    model(batch.text)
    break


## TextRNN 模型

In [None]:
from model.TextRNN import TextRNN


In [None]:
# Stand-alone smoke test: push a single training batch through TextRNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers = 5
bidirectional = True
model = TextRNN(embedding_dim, output_dim, hidden_size, num_layers, bidirectional, pretrained_embeddings)
for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    model(batch.text)
    break


## TextRNN_Attention 模型

In [None]:
from model.TextRNN_Attention import TextRNN_Attention

In [None]:

# Stand-alone smoke test: push a single training batch through TextRNN_Attention,
# then snapshot every parameter as a NumPy array.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers = 5
bidirectional = True
model = TextRNN_Attention(embedding_dim, output_dim, hidden_size, num_layers, bidirectional, pretrained_embeddings, device)

for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    model(batch.text)

    parm = {}
    for name, parameters in model.named_parameters():
        # .cpu() is a no-op for CPU tensors and keeps the NumPy conversion from
        # raising when the parameters live on a GPU.
        values = parameters.detach().cpu().numpy()
        parm[name] = values
        # NOTE(review): "ws" is presumably the attention weight parameter of
        # TextRNN_Attention -- confirm against the model definition.
        if name == "ws":
            print(name, values)
    break


## TextRCNN 模型

In [None]:
from model.TextRCNN import TextRCNN


In [None]:

# Stand-alone smoke test: push a single training batch through TextRCNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers = 5
bidirectional = True
model = TextRCNN(embedding_dim, output_dim, hidden_size, num_layers, bidirectional, pretrained_embeddings, device)
for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    model(batch.text)
    break

## Transformer 模型

In [8]:
from torch import nn

from model.transformer.Embeddings import Embeddings
from model.transformer.Encoder import Encoder
from model.transformer.EncoderLayer import EncoderLayer
from model.transformer.MultiHeadedAttention import MultiHeadedAttention
from model.transformer.PositionalEncoding import PositionalEncoding
from model.transformer.PositionwiseFeedForward import PositionwiseFeedForward


class Transformer_TextClassifier(nn.Module):
    """Text classifier that uses a Transformer encoder stack as its feature extractor.

    Args:
        head: number of attention heads passed to MultiHeadedAttention.
        n_layer: number of encoder layers passed to Encoder.
        emd_dim: width of the word embeddings.
        d_model: model width used inside the encoder.
        d_ff: hidden width passed to PositionwiseFeedForward.
        output_dim: number of output classes.
        dropout: dropout value forwarded to the positional encoding, the
            feed-forward block and the encoder layer.
        pretrained_embeddings: pretrained vectors forwarded to Embeddings.
    """

    def __init__(self, head, n_layer, emd_dim, d_model, d_ff, output_dim, dropout, pretrained_embeddings):
        super().__init__()

        self.word_embedding = Embeddings(pretrained_embeddings, emd_dim)
        self.position_embedding = PositionalEncoding(emd_dim, dropout)

        # Projects emd_dim -> d_model so the encoder sees its expected width; this
        # adjustment could equally live in the final fully connected layer.
        self.trans_linear = nn.Linear(emd_dim, d_model)

        encoder_layer = EncoderLayer(
            d_model,
            MultiHeadedAttention(head, d_model),
            PositionwiseFeedForward(d_model, d_ff, dropout),
            dropout,
        )
        self.encoder = Encoder(encoder_layer, n_layer)
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, x):
        """Compute class logits for a batch.

        Args:
            x: pair ``(text, text_lens)``. ``text`` holds token ids laid out as
                [batch_size, sent_len]; ``text_lens`` ([batch_size]) is accepted
                but not used.

        Returns:
            Tensor [batch_size, output_dim] of unnormalised scores.
        """
        tokens, _ = x

        # [batch_size, sent_len] -> [batch_size, sent_len, emd_dim]
        hidden = self.position_embedding(self.word_embedding(tokens))

        # [batch_size, sent_len, emd_dim] -> [batch_size, sent_len, d_model]
        hidden = self.encoder(self.trans_linear(hidden))

        # The final position's vector serves as the sentence feature: [batch_size, d_model]
        return self.fc(hidden[:, -1, :])



In [9]:
# Stand-alone smoke test: push a single training batch through the Transformer classifier.
pretrained_embeddings = text_field.vocab.vectors
head = 8
n_layer = 6
emd_dim = 100
d_model = 512
d_ff = 1024
output_dim = 2
dropout = 0.1

model = Transformer_TextClassifier(head, n_layer, emd_dim, d_model, d_ff, output_dim, dropout, pretrained_embeddings)
for batch in train_iter:
    # Call the module instance instead of .forward(): nn.Module.__call__ runs the
    # registered hooks around forward(), which a direct .forward() call skips.
    model(batch.text)
    break



  requires_grad=False)


# 训练验证

In [11]:
%%time
# seed_everything()

train_batch_size, val_batch_size = 2**7, 2**7

pretrained_embeddings = text_field.vocab.vectors

modelHandlerParams = {}
modelHandlerParams['epoch_num'] = 1000000
modelHandlerParams['train_batch_size'] = train_batch_size
modelHandlerParams['val_batch_size'] = val_batch_size
modelHandlerParams['device'] = device

modelHandlerParams['model'] = model
modelHandler = ModelHandler(modelHandlerParams)

# 二分类交叉熵
loss_fn = nn.BCEWithLogitsLoss().to(device)
# 调参地方，分别调整为0.1,0.01,0.001，最优为0.01
optimizer = optim.Adam(model.parameters(), lr=0.01,
                       weight_decay=0.00001) # lr sets the learning rate of the optimizer

modelHandler.fit(train_iter=train_iter, val_iter=dev_iter,loss_fn=loss_fn,optimizer=optimizer,
                 early_stopping_rounds=10, verbose=2)

************************* epoch: 0 *************************


  requires_grad=False)


KeyboardInterrupt: 