# 导库

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

from tqdm import tqdm

import pandas as pd
import numpy as np
import random

import os

from sklearn.metrics import roc_auc_score

# 导入自己的库
from Util.utils import seed_everything,get_device
from Util.SST2_data import load_sst2
from ModelHandler import *


device is cpu, not recommend
cpu
(67349, 2)
(872, 2)
(1821, 2)
the size of train: 67349, dev:872, test:1821
['contains', 'no', 'wit', ',', 'only', 'labored', 'gags'] 0
['unflinchingly', 'bleak', 'and', 'desperate'] 0
['this', 'film', "'s", 'relationship', 'to', 'actual', 'tension', 'is', 'the', 'same', 'as', 'what', 'christmas', '-', 'tree', 'flocking', 'in', 'a', 'spray', 'can', 'is', 'to', 'actual', 'snow', ':', 'a', 'poor', '--', 'if', 'durable', '--', 'imitation', '.']
16292
1821
the size of train_iter: 527, dev_iter:7, test_iter:1
0 torch.Size([128, 40]) torch.Size([128])
the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1
the shape of train_x: torch.Size([128, 40]), train_y:torch.Size([128])


In [3]:
# get_device() (imported from Util.utils) returns a pair, unpacked here as
# `device` and `n_gpu`. `device` is reused by later cells: it is passed to
# load_sst2, stored in the ModelHandler parameters, and used to place the loss.
device, n_gpu=get_device()
print(device, n_gpu)

device is cpu, not recommend
cpu 0


# 加载 SST2 数据

In [4]:
# SST-2 data preparation: field definitions for the text and label columns.

# Text: spaCy tokenisation, lower-casing, a fixed length of 40 tokens per
# example, and tensors laid out with the batch dimension first.
text_field = data.Field(
    tokenize='spacy',
    lower=True,
    fix_length=40,
    batch_first=True,
)

# Labels are stored as torch.long.
label_field = data.LabelField(dtype=torch.long)

In [5]:
# Directory holding the SST-2 TSV files (train.tsv, dev.tsv, test.tsv).
# Set the SST2_DIR environment variable to point at your own copy; the
# original author's download folder is kept as the fallback so existing
# setups keep working.
# os.path.join(<dir>, "") guarantees a trailing separator, so BASE_PATH keeps
# the same "ends with a slash" form as the original constant and the plain
# string concatenation below stays valid.
BASE_PATH = os.path.join(
    os.environ.get("SST2_DIR", "/home/dudaizhong/Downloads/SST-2/"), "")

# Load each split as a tab-separated table.
train_pd = pd.read_csv(BASE_PATH + 'train.tsv', sep='\t')
dev_pd = pd.read_csv(BASE_PATH + 'dev.tsv', sep='\t')
test_pd = pd.read_csv(BASE_PATH + 'test.tsv', sep='\t')

# Quick sanity check on the number of rows / columns per split.
print(train_pd.shape)
print(dev_pd.shape)
print(test_pd.shape)

(67349, 2)
(872, 2)
(1821, 2)


In [6]:
# Build the train / dev / test iterators with the project's load_sst2 helper,
# reusing the fields and device defined in earlier cells.
batch_size = 128
train_iter, dev_iter, test_iter = load_sst2(
    BASE_PATH,
    text_field,
    label_field,
    batch_size,
    device,
)

the size of train: 67349, dev:872, test:1821
the result of dataset:  ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
the size of train_iter: 527, dev_iter:7, test_iter:1
the shape of train_x: torch.Size([128, 40]), train_y:torch.Size([128])


# 网络结构

In [7]:
# Notes for this cell:
# 1. Shape-manipulation helpers worth knowing: view(), permute(), size(),
#    torch.squeeze() / torch.unsqueeze().
# 2. Two ways to load pretrained vectors into an Embedding layer:
#    1) copy, 2) from_pretrained.

class Enet(nn.Module):
    """LSTM sentence classifier built on pretrained word embeddings.

    Args:
        pretrained_embeddings: 2-D float tensor (vocab_size, embedding_dim)
            used to initialise the embedding layer; the layer is fine-tuned
            (freeze=False).
        hidden_size: number of hidden features in each LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_dim: number of output classes.

    The LSTM input size is taken from the embedding matrix, so embeddings of
    any width work; the defaults reproduce the original fixed
    LSTM(100, 64, 3) + Linear(64, 2) network for 100-d vectors.
    """

    def __init__(self, pretrained_embeddings, hidden_size=64, num_layers=3,
                 output_dim=2):
        super(Enet, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # nn.LSTM(input_size, hidden_size, num_layers):
        #   input_size  - number of input features per time step
        #   hidden_size - number of hidden features
        #   num_layers  - number of stacked layers
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size,
                            num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_dim) holding softmax probabilities.
        """
        vec = self.embedding(x)        # (batch, seq_len, embedding_dim)
        out, _ = self.lstm(vec)        # (batch, seq_len, hidden_size)
        # Classify from the LSTM output at the last time step.
        out = self.linear(out[:, -1, :])   # (batch, output_dim)
        # NOTE(review): this returns probabilities, while the training cell in
        # this notebook builds nn.BCEWithLogitsLoss, which expects raw logits.
        # Confirm how ModelHandler combines the two before training Enet.
        return F.softmax(out, -1)
    

## TextCNN 模型

In [10]:
# General-purpose convolution block
class Conv1d(nn.Module):
    """A bank of parallel nn.Conv1d layers, one per kernel size.

    Args:
        in_channels: number of channels of the input signal.
        out_channels: number of channels produced by every convolution.
        filter_sizes: iterable of kernel sizes; one layer is built for each.
    """

    def __init__(self, in_channels, out_channels, filter_sizes):
        super().__init__()
        branches = []
        for kernel_size in filter_sizes:
            branches.append(nn.Conv1d(in_channels, out_channels, kernel_size))
        self.convs = nn.ModuleList(branches)

        self.init_params()

    def init_params(self):
        """Xavier-uniform weights and a constant 0.1 bias for every layer."""
        for layer in self.convs:
            weight, bias = layer.weight.data, layer.bias.data
            nn.init.xavier_uniform_(weight)
            nn.init.constant_(bias, 0.1)

    def forward(self, x):
        """Return a list with relu(conv(x)) for each layer, in layer order."""
        activations = []
        for layer in self.convs:
            activations.append(F.relu(layer(x)))
        return activations

In [26]:

class TextCNN(nn.Module):
    """Convolutional sentence classifier over pretrained word embeddings.

    Args:
        embedding_dim: channel count handed to the convolution bank.
        n_filters: number of output channels per kernel size.
        filter_sizes: kernel sizes, one convolution per entry.
        output_dim: number of output units of the final linear layer.
        pretrained_embeddings: tensor used to initialise the embedding layer,
            which stays trainable (freeze=False).
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim,
                 pretrained_embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)

        self.convs = Conv1d(embedding_dim, n_filters, filter_sizes)

        # One max-pooled vector of n_filters values per kernel size.
        fc_in_features = len(filter_sizes) * n_filters
        self.fc = nn.Linear(fc_in_features, output_dim)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to scores (batch, output_dim).

        The output of the final linear layer is returned as-is; no softmax or
        sigmoid is applied here.
        """
        # The unpacking doubles as a rank check: ValueError unless x is 2-D.
        _batch, _seq_len = x.shape

        embedded = self.embedding(x)                  # (batch, seq_len, emb_dim)
        channels_first = embedded.permute(0, 2, 1)    # (batch, emb_dim, seq_len)

        # One feature map per kernel size, each (batch, n_filters, reduced_len).
        feature_maps = self.convs(channels_first)

        # Max over the whole length axis -> (batch, n_filters) per kernel size.
        pooled = []
        for fmap in feature_maps:
            pooled.append(F.max_pool1d(fmap, fmap.shape[2]).squeeze(2))

        features = torch.cat(pooled, dim=1)   # (batch, len(filter_sizes) * n_filters)
        return self.fc(features)              # (batch, output_dim)
    

In [27]:
# Stand-alone smoke test: push a single training batch through a fresh TextCNN.
pretrained_embeddings = text_field.vocab.vectors
textCNN = TextCNN(100,100,[3,4,5],2,pretrained_embeddings)
for batch in train_iter:
    print(batch.text, batch.text.shape)
    # Call the module itself rather than .forward() so that any registered
    # hooks run; no_grad() because this is only a shape check.
    with torch.no_grad():
        logits = textCNN(batch.text)
    # The model was built with output_dim=2, so expect one score pair per row.
    assert logits.shape == (batch.text.shape[0], 2), logits.shape
    break



tensor([[ 139,  562,   16,  ...,    1,    1,    1],
        [  31,    6,    2,  ...,    1,    1,    1],
        [  13,  358,    6,  ...,    1,    1,    1],
        ...,
        [  52, 2509,    3,  ...,    1,    1,    1],
        [  10,   35,  665,  ...,    1,    1,    1],
        [4908,    3, 5795,  ...,    1,    1,    1]]) torch.Size([128, 40])


# 训练验证

In [None]:
%%time
# seed_everything()

train_batch_size, val_batch_size = 2**7, 2**7

pretrained_embeddings = text_field.vocab.vectors
# model = Enet(pretrained_embeddings)
model = TextCNN(100,100,[3,4,5],2,pretrained_embeddings)

modelHandlerParams = {}
modelHandlerParams['epoch_num'] = 1000000
modelHandlerParams['train_batch_size'] = train_batch_size
modelHandlerParams['val_batch_size'] = val_batch_size
modelHandlerParams['device'] = device

modelHandlerParams['model'] = model
modelHandler = ModelHandler(modelHandlerParams)

# 二分类交叉熵
loss_fn = nn.BCEWithLogitsLoss().to(device)
# 调参地方，分别调整为0.1,0.01,0.001，最优为0.01
optimizer = optim.Adam(model.parameters(), lr=0.01,
                       weight_decay=0.00001) # lr sets the learning rate of the optimizer

modelHandler.fit(train_iter=train_iter, val_iter=dev_iter,loss_fn=loss_fn,optimizer=optimizer,
                 early_stopping_rounds=10, verbose=2)

************************* epoch: 0 *************************
train auc: 0.9364563184657851
train loss: 0.3402541098829917
*****Checking accuracy on validation set*****
batchNum: 7
val_auc: 0.8895408693254895
val_loss: 0.46029689056532724
************************* epoch: 1 *************************
