# 导库

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

from tqdm import tqdm

import pandas as pd
import numpy as np
import random

import os

from sklearn.metrics import roc_auc_score

# 导入自己的库
from Util.utils import seed_everything,get_device
from Util.SST2_data import load_sst2
from ModelHandler import *


device is cpu, not recommend
cpu
(67349, 2)
(872, 2)
(1821, 2)
the size of train: 67349, dev:872, test:1821
['contains', 'no', 'wit', ',', 'only', 'labored', 'gags'] 0
['unflinchingly', 'bleak', 'and', 'desperate'] 0
['this', 'film', "'s", 'relationship', 'to', 'actual', 'tension', 'is', 'the', 'same', 'as', 'what', 'christmas', '-', 'tree', 'flocking', 'in', 'a', 'spray', 'can', 'is', 'to', 'actual', 'snow', ':', 'a', 'poor', '--', 'if', 'durable', '--', 'imitation', '.']
16292
1821
the size of train_iter: 527, dev_iter:7, test_iter:1
0 torch.Size([128, 40]) torch.Size([128])
the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1
the shape of train_x: torch.Size([128, 40]), train_y:torch.Size([128])


In [3]:
# Ask the project helper for the compute device (and what is presumably the
# GPU count), then echo both values on one line.
device, n_gpu = get_device()
print(f"{device} {n_gpu}")

device is cpu, not recommend
cpu 0


# 加载 SST2 数据

In [4]:
# SST2 数据准备

# Text field: spaCy tokenisation, lower-casing, fixed length of 40 tokens,
# batch-first tensors. Label field: class ids stored as int64.
text_field = data.Field(
    tokenize='spacy',
    lower=True,
    fix_length=40,
    batch_first=True,
)
label_field = data.LabelField(
    dtype=torch.long,
)

In [5]:
# Directory holding the SST-2 TSV files. Set the SST2_DIR environment variable
# to point somewhere else without editing the notebook; the original location
# is kept as the fallback. Joining with "" guarantees a trailing separator, so
# BASE_PATH still works with code that appends file names by concatenation.
BASE_PATH = os.path.join(
    os.environ.get("SST2_DIR", "/home/dudaizhong/Downloads/SST-2/"), "")

# Raw splits as DataFrames (tab-separated files).
train_pd = pd.read_csv(os.path.join(BASE_PATH, 'train.tsv'), sep='\t')
dev_pd = pd.read_csv(os.path.join(BASE_PATH, 'dev.tsv'), sep='\t')
test_pd = pd.read_csv(os.path.join(BASE_PATH, 'test.tsv'), sep='\t')

# Quick sanity check of the split sizes: (rows, columns) per split.
print(train_pd.shape)
print(dev_pd.shape)
print(test_pd.shape)

(67349, 2)
(872, 2)
(1821, 2)


In [6]:
# Batch size handed to the project loader.
batch_size = 128

# Build the train / dev / test iterators from the files under BASE_PATH.
train_iter, dev_iter, test_iter = load_sst2(
    BASE_PATH,
    text_field,
    label_field,
    batch_size,
    device,
)

the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1
the shape of train_x: torch.Size([128, 40]), train_y:torch.Size([128])


# 网络结构

In [7]:
# 1.与维度变换相关函数 view()，permute()，size(), torch.squeeze() / torch.unsqueeze()
# 2.Embedding层加载预训练模型的方式：1）copy，2）from_pretrained。

class Enet(nn.Module):
    """Baseline classifier: embedding -> 3-layer LSTM -> linear -> softmax.

    Args:
        pretrained_embeddings: 2-D float tensor of shape (vocab_size, emb_dim)
            used to initialise the embedding layer. The embeddings are
            fine-tuned during training (freeze=False).

    NOTE(review): forward() returns softmax probabilities, whereas TextCNN and
    TextRNN in this notebook return raw logits. Pairing this model with a
    logits-based loss would apply the normalisation twice - confirm which
    output the training code expects.
    """

    def __init__(self,pretrained_embeddings):
        super(Enet, self).__init__()
        # Two ways to load pretrained vectors into nn.Embedding:
        # 1) copy them into an existing layer, 2) from_pretrained (used here).
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # nn.LSTM(input_size, hidden_size, num_layers):
        #   input_size  - number of input features per token
        #   hidden_size - number of hidden features
        #   num_layers  - number of stacked layers
        # The input size is taken from the embedding matrix instead of being
        # hard-coded to 100, so any embedding width works.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, 64, 3,
                            batch_first=True)
        self.linear = nn.Linear(64, 2)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, (batch_size, sent_len).

        Returns:
            Tensor of shape (batch_size, 2) whose rows sum to 1.
        """
        vec = self.embedding(x)           # (batch_size, sent_len, emb_dim)
        out, _ = self.lstm(vec)           # (batch_size, sent_len, 64)
        # Use the top layer's output at the last time step as sentence summary.
        out = self.linear(out[:, -1, :])  # (batch_size, 2)
        return F.softmax(out, -1)
    

## TextCNN 模型

In [10]:
# 卷积网络一般情况
class Conv1d(nn.Module):
    """Bank of parallel 1-D convolutions, one per kernel size, each followed by ReLU.

    Args:
        in_channels: number of input channels for every convolution.
        out_channels: number of output channels (filters) per convolution.
        filter_sizes: iterable of kernel sizes; one nn.Conv1d is built for each.
    """

    def __init__(self, in_channels, out_channels, filter_sizes):
        super(Conv1d, self).__init__()
        branches = []
        for kernel_size in filter_sizes:
            branches.append(
                nn.Conv1d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=kernel_size))
        self.convs = nn.ModuleList(branches)

        self.init_params()

    def init_params(self):
        """Xavier-uniform weights and a constant 0.1 bias for every convolution."""
        for conv in self.convs:
            nn.init.xavier_uniform_(conv.weight.data)
            nn.init.constant_(conv.bias.data, 0.1)

    def forward(self, x):
        """Apply every convolution to ``x`` and return the ReLU outputs as a list."""
        activations = []
        for conv in self.convs:
            activations.append(F.relu(conv(x)))
        return activations

In [26]:

class TextCNN(nn.Module):
    """CNN text classifier: embedding -> parallel convolutions -> max-over-time -> linear.

    Args:
        embedding_dim: width of the token embeddings (input channels of the convs).
        n_filters: number of filters per kernel size.
        filter_sizes: kernel sizes of the parallel convolutions.
        output_dim: number of output units of the final linear layer.
        pretrained_embeddings: tensor used to initialise the embedding layer;
            it is fine-tuned during training (freeze=False).
    """

    def __init__(self,embedding_dim, n_filters, filter_sizes, output_dim,
                  pretrained_embeddings):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)

        self.convs = Conv1d(embedding_dim, n_filters, filter_sizes)

        # One pooled feature vector of n_filters values per kernel size.
        n_features = len(filter_sizes) * n_filters
        self.fc = nn.Linear(n_features, output_dim)

    def forward(self, x):
        """Return the linear layer's raw outputs, shape (batch_size, output_dim).

        Shape examples below are for batch 128, length 40, emb 100,
        100 filters and kernel sizes [3, 4, 5].
        """
        # Unpacking also rejects inputs that are not 2-D.
        _n_rows, _n_tokens = x.shape                 # (128, 40)

        embedded = self.embedding(x)                 # (128, 40, 100)
        # Conv1d wants channels before length: (batch, emb_dim, sent_len).
        channels_first = embedded.permute(0, 2, 1)   # (128, 100, 40)

        # One feature map per kernel size, each of shape
        # (batch, n_filters, sent_len - kernel_size + 1):
        # (128, 100, 38), (128, 100, 37), (128, 100, 36).
        feature_maps = self.convs(channels_first)

        # Max-over-time pooling collapses the length axis -> (batch, n_filters).
        pooled = []
        for feature_map in feature_maps:
            span = feature_map.shape[2]
            pooled.append(F.max_pool1d(feature_map, span).squeeze(2))

        # torch.cat with dim=1 joins the pooled vectors column-wise.
        features = torch.cat(pooled, dim=1)          # (128, 300)
        return self.fc(features)                     # (128, 2)
    

In [27]:
# 模型单独测试
# Smoke test: push one training batch through an untrained TextCNN.
pretrained_embeddings = text_field.vocab.vectors
textCNN = TextCNN(100,100,[3,4,5],2,pretrained_embeddings)
for batch in train_iter:
    print(batch.text, batch.text.shape)
    # Call the module itself rather than .forward() so hooks are honoured,
    # and skip autograd bookkeeping because the result is discarded.
    with torch.no_grad():
        textCNN(batch.text)
    break



tensor([[ 139,  562,   16,  ...,    1,    1,    1],
        [  31,    6,    2,  ...,    1,    1,    1],
        [  13,  358,    6,  ...,    1,    1,    1],
        ...,
        [  52, 2509,    3,  ...,    1,    1,    1],
        [  10,   35,  665,  ...,    1,    1,    1],
        [4908,    3, 5795,  ...,    1,    1,    1]]) torch.Size([128, 40])


## TextRNN 模型

In [129]:
import torch.nn as nn
import torch.nn.functional as F
import torch


class LSTM(nn.Module):
    """nn.LSTM wrapper with custom initialisation that runs on packed sequences.

    NOTE(review): ``batch_first`` is forwarded to nn.LSTM, but forward() packs
    ``x`` without ``batch_first``, so ``x`` is always read as
    [seq_len, batch_size, input_size] regardless of this flag. Confirm what
    callers pass before relying on ``batch_first``.
    """

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, batch_first=True,dropout=0):
        """
        Args:
            input_size: feature dimension of x
            hidden_size: feature dimension of the hidden state
            num_layers: number of stacked LSTM layers
            bidirectional: whether to run a second, reversed direction
            batch_first: forwarded to nn.LSTM (see the class note)
            dropout: forwarded to nn.LSTM
        """
        super(LSTM, self).__init__()

        self.rnn = nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional,
            batch_first=batch_first, dropout=dropout
        )

        self.init_params()

    def init_params(self):
        """Orthogonal hidden weights, Kaiming input weights, zero biases,
        and a forget-gate bias of 1 in ``bias_hh``."""
        suffixes = ['', '_reverse'] if self.rnn.bidirectional else ['']
        hidden_size = self.rnn.hidden_size

        for i in range(self.rnn.num_layers):
            for suffix in suffixes:
                bias_hh = getattr(self.rnn, f'bias_hh_l{i}{suffix}')
                bias_ih = getattr(self.rnn, f'bias_ih_l{i}{suffix}')

                nn.init.orthogonal_(getattr(self.rnn, f'weight_hh_l{i}{suffix}'))
                nn.init.kaiming_normal_(getattr(self.rnn, f'weight_ih_l{i}{suffix}'))
                nn.init.constant_(bias_hh, val=0)
                nn.init.constant_(bias_ih, val=0)

                # The bias is four equal chunks; the second one (what
                # chunk(4)[1] selected) is set to 1. The write happens under
                # no_grad because an in-place op on a view of a parameter
                # that requires grad raises otherwise.
                with torch.no_grad():
                    bias_hh[hidden_size:2 * hidden_size].fill_(1)

    def forward(self, x, lengths):
        """Run the LSTM over a padded batch, skipping the padding.

        Args:
            x: [seq_len, batch_size, input_size]
            lengths: [batch_size] true sequence lengths, in any order

        Returns:
            (hidden, output) where
            hidden: [num_layers * num_directions, batch_size, hidden_size]
            output: [max(lengths), batch_size, hidden_size * num_directions]
        """
        # pack_padded_sequence expects the lengths on the CPU.
        if isinstance(lengths, torch.Tensor):
            lengths = lengths.cpu()

        # enforce_sorted=False lets callers pass batches that are not sorted by
        # length; results come back in the caller's batch order.
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x, lengths, enforce_sorted=False)

        # packed_x, packed_output: PackedSequence objects
        # hidden / cell: [num_layers * num_directions, batch_size, hidden_size]
        packed_output, (hidden, cell) = self.rnn(packed_x)

        # output_lengths: [batch_size]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        return hidden, output

In [175]:
# TextRNN: LSTM-based text classifier
class TextRNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> final hidden state -> linear.

    Args:
        embedding_dim: width of the token embeddings (LSTM input size).
        output_dim: number of output units of the final linear layer.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also runs a reversed direction.
        pretrained_embeddings: tensor used to initialise the embedding layer;
            it is fine-tuned during training (freeze=False).
    """

    def __init__(self,embedding_dim, output_dim, hidden_size, num_layers, bidirectional, pretrained_embeddings):
        super(TextRNN, self).__init__()

        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # nn.LSTM(input_size, hidden_size, num_layers):
        #   input_size  - number of input features per token
        #   hidden_size - number of hidden features
        #   num_layers  - number of stacked layers
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            bidirectional=bidirectional)

        # The classifier consumes one final hidden state per direction, so its
        # input width must follow the bidirectional flag (it used to be fixed
        # at hidden_size * 2).
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_dim)

    def forward(self, x):
        """Return the linear layer's raw outputs, shape (batch_size, output_dim).

        Args:
            x: integer tensor of token ids, (batch_size, sent_len).
        """
        vec = self.embedding(x)       # (batch_size, sent_len, emb_dim)
        # The LSTM is built without batch_first, so it wants sequence-first input.
        vec = vec.permute(1, 0, 2)    # (sent_len, batch_size, emb_dim)
        lstm_out, (hn, cn) = self.lstm(vec)
        # lstm_out: (sent_len, batch_size, hidden_size * num_directions)

        if self.lstm.bidirectional:
            # The last two rows of hn are the top layer's forward and backward
            # final states. This differs from lstm_out[-1], where the backward
            # half has only seen the last token.
            final_state = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the top layer's final state. hn[-2] would be
            # a lower layer here, or out of range for a single-layer LSTM.
            final_state = hn[-1, :, :]

        return self.fc(final_state)

In [174]:
# 模型单独测试
# Smoke test: push one training batch through an untrained TextRNN.
pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers=5
bidirectional = True
model = TextRNN(embedding_dim,output_dim,hidden_size,num_layers,bidirectional,pretrained_embeddings)
for batch in train_iter:
    # Call the module itself rather than .forward() so hooks are honoured,
    # and skip autograd bookkeeping because the result is discarded.
    with torch.no_grad():
        model(batch.text)
    break


torch.Size([128, 40])
torch.Size([128, 40, 100])
torch.Size([40, 128, 100])
hn shape: torch.Size([128, 100])
torch.Size([128, 2])


# 训练验证

In [None]:
%%time
# seed_everything()

train_batch_size, val_batch_size = 2**7, 2**7

pretrained_embeddings = text_field.vocab.vectors
# model = Enet(pretrained_embeddings)
# model = TextCNN(100,100,[3,4,5],2,pretrained_embeddings)

pretrained_embeddings = text_field.vocab.vectors
embedding_dim = 100
output_dim = 2
hidden_size = 50
num_layers=5
bidirectional = True
model = TextRNN(embedding_dim,output_dim,hidden_size,num_layers,bidirectional,pretrained_embeddings)

modelHandlerParams = {}
modelHandlerParams['epoch_num'] = 1000000
modelHandlerParams['train_batch_size'] = train_batch_size
modelHandlerParams['val_batch_size'] = val_batch_size
modelHandlerParams['device'] = device

modelHandlerParams['model'] = model
modelHandler = ModelHandler(modelHandlerParams)

# 二分类交叉熵
loss_fn = nn.BCEWithLogitsLoss().to(device)
# 调参地方，分别调整为0.1,0.01,0.001，最优为0.01
optimizer = optim.Adam(model.parameters(), lr=0.01,
                       weight_decay=0.00001) # lr sets the learning rate of the optimizer

modelHandler.fit(train_iter=train_iter, val_iter=dev_iter,loss_fn=loss_fn,optimizer=optimizer,
                 early_stopping_rounds=10, verbose=2)

************************* epoch: 0 *************************


In [73]:
# input_size, hidden_size, num_layers
# rnn = nn.RNN(10, 5, 2)

# inputR = torch.randn(2, 1, 10)

# h0 = torch.randn(2, 1, 5)

# output, hn = rnn(inputR, h0)
# print(output.shape)
# print(hn.shape)

# print(rnn.weight_ih_l0.data.shape,rnn.weight_hh_l0.data.shape)
# print(rnn.weight_ih_l1.data.shape,rnn.weight_hh_l1.data.shape)

torch.Size([5, 10]) torch.Size([5, 5])
torch.Size([5, 5]) torch.Size([5, 5])
