# 导库

In [21]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

from tqdm import tqdm

import pandas as pd
import numpy as np
import random

import os

from sklearn.metrics import roc_auc_score


In [2]:
def seed_everything(seed=2019):
    '''
    设置随机种子，最好在训练的时候调用
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
def get_device():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        print("device is cuda, # cuda is: ", n_gpu)
    else:
        print("device is cpu, not recommend")
    return device, n_gpu

In [4]:
DEVICE, n_gpu = get_device()
print(DEVICE)

device is cpu, not recommend
cpu


# 数据预处理

In [5]:
BASE_PATH = "/home/dudaizhong/Downloads/SST-2/"
train_pd = pd.read_csv(BASE_PATH+'train.tsv', sep='\t')
dev_pd = pd.read_csv(BASE_PATH + 'dev.tsv', sep='\t')
test_pd = pd.read_csv(BASE_PATH + 'test.tsv', sep='\t')

print(train_pd.shape)
print(dev_pd.shape)
print(test_pd.shape)

(67349, 2)
(872, 2)
(1821, 2)


## 定义 Field

In [6]:
# 1. 定义 Field

text_field = data.Field(tokenize='spacy', lower=True, fix_length=40, batch_first=True)
label_field = data.LabelField(dtype=torch.long)


## 定义 DataSet

In [7]:
# 2. 定义 DataSet

train, dev = data.TabularDataset.splits(
        path=BASE_PATH, train='train.tsv', validation='dev.tsv',format='tsv', skip_header=True,
        fields=[('text', text_field), ('label', label_field)])

# 这里需要注意单独处理的时候不能用 splits 方法。
test = data.TabularDataset(BASE_PATH+'test.tsv', format='tsv', skip_header=True,
        fields=[('index', label_field), ('text', text_field)])

print("the size of train: {}, dev:{}, test:{}".format(
    len(train), len(dev), len(test)))


the size of train: 67349, dev:872, test:1821


In [8]:
# 查看 Example
print(train[1].text, train[1].label)

print(dev[1].text, dev[1].label)

print(test[1].text)

['contains', 'no', 'wit', ',', 'only', 'labored', 'gags'] 0
['unflinchingly', 'bleak', 'and', 'desperate'] 0
['this', 'film', "'s", 'relationship', 'to', 'actual', 'tension', 'is', 'the', 'same', 'as', 'what', 'christmas', '-', 'tree', 'flocking', 'in', 'a', 'spray', 'can', 'is', 'to', 'actual', 'snow', ':', 'a', 'poor', '--', 'if', 'durable', '--', 'imitation', '.']


## 建立 Vocab

In [9]:
# 3. 建立 vocab，大小是text_field里面的词数量
# vectors = vocab.Vectors(embedding_file, cache_dir)

text_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)

label_field.build_vocab(train, dev, test)

In [10]:
len_vocab = len(text_field.vocab)
print(len_vocab)

print(len(label_field.vocab))
# for step, batch in enumerate(tqdm(train_iter, desc="Iteration")):
#     print(batch.text, batch.label)
    

16292
1821


## 构造 Iterater

In [11]:
# 4. 构造迭代器

train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev), batch_sizes=(128, 128), sort_key=lambda x: len(x.text), 
        sort_within_batch=True, repeat=False, shuffle=True, device=DEVICE)

# 同样单独处理的时候
test_iter = data.Iterator(test, batch_size=len(test), train=False,
                          sort=False, device=DEVICE)

print("the size of train_iter: {}, dev_iter:{}, test_iter:{}".format(
    len(train_iter), len(dev_iter), len(test_iter)))

the size of train_iter: 527, dev_iter:7, test_iter:1


In [12]:
# 查看 Iterater
# seed_everything()
for batch_idx, (X_train_var, y_train_var) in enumerate(train_iter):
    print(batch_idx, X_train_var, y_train_var)
    break


0 tensor([[ 3816,   145,     4,  ...,     1,     1,     1],
        [    2,  1117,     6,  ...,     1,     1,     1],
        [   70,    28, 12204,  ...,     1,     1,     1],
        ...,
        [ 3877,    12,     2,  ...,     1,     1,     1],
        [   10,     4,   269,  ...,     1,     1,     1],
        [   63,     2,   133,  ...,     1,     1,     1]]) tensor([0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0])


# 网络结构

In [13]:
# 1.与维度变换相关函数 view()，permute()，size(), torch.squeeze() / torch.unsqueeze()
# 2.Embedding层加载预训练模型的方式：1）copy，2）from_pretrained。

class Enet(nn.Module):
    def __init__(self,pretrained_embeddings):
        super(Enet, self).__init__()
#         self.embedding = nn.Embedding(len_vocab,100)
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=False)
        # LSTM 参数以及输入输出说明：
        # 结构参数：LSTM(input_size, hidden_size, num_layers)
        # input_size:输入的特征数量
        # hidden_size:隐藏的特征数量
        # num_layers:层数
        self.lstm = nn.LSTM(100,64,3,batch_first=True)#,bidirectional=True)
        self.linear = nn.Linear(64,2)
        
    def forward(self, x):
        batch_size,seq_num = x.shape
#         print(x.shape) #(128,40)
        vec = self.embedding(x)
#         print(vec.shape) #(128,40,100)
        out, (hn, cn) = self.lstm(vec)
#         print(out.shape) #(128,40,64)
        out = self.linear(out[:,-1,:])
#         print(out.shape) #(128,2)
        out = F.softmax(out,-1)
        return out
    

# ModelHandler

In [22]:
class ModelHandler(nn.Module):
    def __init__(self, params):
        super(ModelHandler, self).__init__()
        self.device = params['device']
        self.epoch_num = params['epoch_num']
        self.train_batch_size = params['train_batch_size']
        self.val_batch_size = params['val_batch_size']
        self.model = params['model']
        
    def forward(self, x):
        return self.model(x)
    
    def fit(self, train_iter, loss_fn, optimizer, modelPath=None,
            dev_iter=None, early_stopping_rounds=None, verbose=2):
        '''
        early_stopping_rounds：是否使用 early_stop 来防止过拟合
        verbose：2分类还是多分类
        '''
        
        if early_stopping_rounds != None:
            # 用来计数多少epoch在验证集上的结果没有改进了
            count = 0

        batch_size = self.train_batch_size
        batchNumInEveryEpoch = len(train_iter)
        
        epochNums = self.epoch_num
        best_val_acc = -1000000
        best_val_loss = 1000000
        
        num = 0
        for epoch in range(epochNums):
            print ('************************* epoch:', epoch, '*************************')
            
            self.train()
            torch.set_grad_enabled(True)

            # 随机化数据
#             randPermNums = np.random.permutation(trainDataNum)
#             X_train = X_train[randPermNums]
#             y_train = np.array(y_train)[randPermNums]
            
    
            trainAcc = 0.0
            trainLoss = 0.0
            for index,(X_train_var, y_train_var) in enumerate(train_iter):
                
                X_train_var = X_train_var.reshape(self.train_batch_size, -1).to(self.device)
                
                # 将 label 转为 one-hot编码，这里针对多分类和二分类的softmax形式。如果是二分类的sogmid，则注释
                y_train_var = y_train_var.unsqueeze(1)
                y_train_var = torch.zeros(len(X_train_var), verbose).scatter_(1, y_train_var, 1)
                y_train_var = y_train_var.to(self.device)
    
                self.zero_grad()
                print (X_train_var.shape)
                scores = self.forward(X_train_var)
                loss = loss_fn(scores.squeeze(), y_train_var)
                trainAcc = trainAcc + self.getAUC(y_train_var, torch.sigmoid(scores).squeeze())
                trainLoss = trainLoss + loss.data.item()
                self.train()
                torch.set_grad_enabled(True)
                loss.backward()
                optimizer.step()
            if verbose == 2:
                print ('train auc:', trainAcc / float(batchNumInEveryEpoch))
                print ('train loss:', trainLoss / float(batchNumInEveryEpoch))

            val_acc, val_loss = self.check_accuracy(self, val_loader, self.val_batch_size, loss_fn, 
                                                    True, verbose)   
            if verbose == 2:
                print ('val_auc:', val_acc)
                print ('val_loss:', val_loss)
        #     print ('val_acc:', val_acc, file=file, flush=True)
            if val_acc > best_val_acc:
#             if val_loss < best_val_loss:
                best_val_acc = val_acc
                best_val_loss = val_loss
                bestEpoch = epoch
                count = 0
                if modelPath != None:
                    torch.save(self.state_dict(), modelPath)
            elif early_stopping_rounds != None:
                count += 1
                if count >= early_stopping_rounds:
                    if verbose >= 1:
                        print ('Stopping.')
                        print ('Best Epoch:', bestEpoch)
                        print ('Best Val Auc:', best_val_acc)
                        print ('Best Val Loss:', best_val_loss)
                    break
    
    def check_accuracy(self, model, val_loader, valBatchSize, loss_fn, 
                       isTrain, verbose, temperature=1):
        if verbose == 2:
            if isTrain:
                print('*****Checking accuracy on validation set*****')
        #         print('Checking accuracy on validation set', file=file, flush=True)
            else:
                print('Checking accuracy on test set') 
        #         print('Checking accuracy on test set', file=file, flush=True) 
        self.eval()
        torch.set_grad_enabled(False)
        
        batchNum = len(val_loader)
#         batchNum = X_val.shape[0] // valBatchSize 
#         if isTrain != True and X_val.shape[0] % valBatchSize != 0:
#             batchNum += 1  
        if verbose == 2:
            print ('batchNum:', batchNum)
             
        valAcc = 0.0
        valLoss = 0.0
        for index, (tX_val_var, tY_val_var) in enumerate(val_loader): 
            tX_val_var = tX_val_var.reshape(self.val_batch_size, 1, -1).to(self.device)
            
            tY_val_var = tY_val_var.unsqueeze(1)
            tY_val_var = torch.zeros(len(tX_val_var), verbose).scatter_(1, tY_val_var, 1)
            tY_val_var = tY_val_var.to(self.device)
            
            scores = self.forward(tX_val_var)

            if isTrain == True:
                loss = loss_fn(scores.squeeze(), tY_val_var / temperature)
                valAcc += self.getAUC(tY_val_var / temperature, torch.sigmoid(scores).squeeze())
                valLoss = valLoss + loss.data.item()

        if isTrain == True:
            return valAcc / float(batchNum), valLoss / float(batchNum)
    
    def predict_proba(self, testDF, inputType='tensor', temperature=1):
        self.eval()
        torch.set_grad_enabled(False)
        if inputType == 'tensor':
            testDF = testDF.reshape(testDF.shape[0], 1, -1).to(self.device)
            scores = self.forward(testDF) / temperature
            return torch.sigmoid(scores).squeeze()
        elif inputType == 'DataFrame':
            testDF = testDF.reshape(testDF.shape[0], 1, -1).to(self.device)
            return torch.sigmoid(self.forward(torch.tensor(np.array(testDF), dtype=torch.float32, device=device)) / temperature).squeeze()
        torch.set_grad_enabled(True)
    
    def predict(self, testDF, inputType='tensor', threshold=0.5, temperature=1):
        predict_proba = self.predict_proba(testDF, inputType, temperature).cpu().numpy().tolist()
#         predict_lables = [1 if x >= threshold else 0 for x in predict_proba]
        return predict_proba
    
    def getAUC(self, y_true, y_score):
        return roc_auc_score(y_true.detach().cpu().numpy(), 
                             y_score.detach().cpu().numpy())    

# 老版 ModelHandler

In [None]:
class ModelHandler(nn.Module):
    def __init__(self, params):
        super(ModelHandler, self).__init__()
#         self.bestStateDict = None
        self.epochNums = params['epochNums']
        self.batch_size = params['batch_size']
        self.device = params['device']
        self.dnn = params['model'].to(self.device)
        
        

    def forword(self, features):
        return self.dnn(features)

#     def reset(self, m):
#         if hasattr(m, 'reset_parameters'):
#             torch.cuda.manual_seed(1)
#             m.reset_parameters()

    
    def fit(self, X_train, y_train, loss_fn, optimizer, task, device, modelPath=None,
            eval_set=None, early_stopping_rounds=None, valBatchSize=None, 
            verbose=0, temperature=1):
        
        if eval_set != None:
            X_val, y_val = eval_set
        if early_stopping_rounds != None:
            # 用来计数多少epoch在验证集上的结果没有改进了
            count = 0
#         self.apply(self.reset)

        batch_size = self.batch_size
        trainDataNum = X_train["feature_idx"].shape[0]
        batchNumInEveryEpoch = trainDataNum // batch_size
        epochNums = self.epochNums
        best_val_acc = -1000000
        best_val_loss = 1000000
        if valBatchSize != None:
            valBatchSize = valBatchSize
        else:
            valBatchSize = X_val["feature_idx"].shape[0]
        num = 0
        for epoch in range(epochNums):
            print ('epoch:', epoch)
        #     print ('epoch:', epoch, file=file, flush=True)
            # 设置成 training 模式
            self.train()
            # 设置自动微分
            torch.set_grad_enabled(True)
            
#             randPermNums = torch.randperm(trainDataNum)
#             X_train["feature_idx"] = X_train["feature_idx"][randPermNums]
#             X_train["feature_values"] = X_train["feature_values"][randPermNums]
#             y_train = y_train[randPermNums]  
            
            randPermNums = np.random.permutation(trainDataNum)
            X_train["feature_idx"] = X_train["feature_idx"].iloc[randPermNums]
            X_train["feature_values"] = X_train["feature_values"].iloc[randPermNums]
            y_train = y_train[randPermNums]  


            trainAcc = 0.0
            trainLoss = 0.0
            for t1 in range(batchNumInEveryEpoch):
                X_train_var = {}
                X_train_var["feature_idx"] = X_train["feature_idx"][t1 * batch_size:(t1 + 1) * batch_size]
                X_train_var["feature_values"] = X_train["feature_values"][t1 * batch_size:(t1 + 1) * batch_size]
                y_train_var = y_train[t1 * batch_size:(t1 + 1) * batch_size].to(self.device)
                self.zero_grad()
                scores = self.forword(X_train_var)
                loss = loss_fn(scores.squeeze(), y_train_var)
                trainAcc = trainAcc + self.getAUC(y_train_var, torch.sigmoid(scores).squeeze())
                trainLoss = trainLoss + loss.data.item()
                self.train()
                torch.set_grad_enabled(True)
                loss.backward()
                optimizer.step()
            if verbose == 2:
                print ('train acc:', trainAcc / float(batchNumInEveryEpoch))
                print ('train loss:', trainLoss / float(batchNumInEveryEpoch))

            val_acc, val_loss = self.check_accuracy(self, X_val, y_val, valBatchSize, loss_fn, 
                                                    task, device, True, verbose)   
            if verbose == 2:
                print ('val_acc:', val_acc)
                print ('val_loss:', val_loss)
        #     print ('val_acc:', val_acc, file=file, flush=True)
            if val_acc > best_val_acc:
#             if val_loss < best_val_loss:
                best_val_acc = val_acc
                best_val_loss = val_loss
                bestEpoch = epoch
                count = 0
                if modelPath != None:
                    torch.save(self.state_dict(), modelPath)
            elif early_stopping_rounds != None:
                count += 1
                if count >= early_stopping_rounds:
                    if verbose >= 1:
                        print ('Stopping.')
                        print ('Best Epoch:', bestEpoch)
                        print ('Best Val Acc:', best_val_acc)
                        print ('Best Val Loss:', best_val_loss)
                    break
    
    def check_accuracy(self, model, X_val, y_val, valBatchSize, loss_fn, 
                       task, device, isTrain, verbose, temperature=1):
        if verbose == 2:
            if isTrain:
                print('*****Checking accuracy on validation set*****')
        #         print('Checking accuracy on validation set', file=file, flush=True)
            else:
                print('Checking accuracy on test set') 
        #         print('Checking accuracy on test set', file=file, flush=True) 
        # 将模型设置成evaluation模式
        self.eval()
        torch.set_grad_enabled(False)
        batchNum = X_val["feature_idx"].shape[0] // valBatchSize 
        if isTrain != True and X_val["feature_idx"].shape[0] % valBatchSize != 0:
            batchNum += 1  
        if verbose == 2:
            print ('batchNum:', batchNum)
        valAcc = 0.0
        valLoss = 0.0
        for t1 in range(batchNum): 
            if isTrain != True and t1 == batchNum - 1:
                tX_val_var = X_val[t1 * valBatchSize:]
                tY_val_var = y_val[t1 * valBatchSize:].to(self.device)
            else:
                tX_val_var = {}
                tX_val_var["feature_idx"] = (X_val["feature_idx"][t1 * valBatchSize:(t1 + 1) * valBatchSize])
                tX_val_var["feature_values"] = (X_val["feature_values"][t1 * valBatchSize:(t1 + 1) * valBatchSize])
                tY_val_var = y_val[t1 * valBatchSize:(t1 + 1) * valBatchSize].to(self.device)
            
            scores = self.forword(tX_val_var)

            if isTrain == True:
                loss = loss_fn(scores.squeeze(), tY_val_var / temperature)
                valAcc += self.getAUC(tY_val_var / temperature, torch.sigmoid(scores).squeeze())
                valLoss = valLoss + loss.data.item()

        if isTrain == True:
            return valAcc / float(batchNum), valLoss / float(batchNum)
    
    def predict_proba(self, testDF, inputType='tensor', temperature=1):
        self.eval()
        torch.set_grad_enabled(False)
        if inputType == 'tensor':
            scores = self.forword(testDF) / temperature
            return torch.sigmoid(scores).squeeze()
        elif inputType == 'DataFrame':
            return torch.sigmoid(self.forword(torch.tensor(np.array(testDF), dtype=torch.float32, device=device)) / temperature)
        torch.set_grad_enabled(True)
    
    # 改进地方
    def predict(self, testDF, inputType='tensor', temperature=1, threshold=0.5):
        predict_proba = self.predict_proba(testDF, inputType, temperature).cpu().numpy().tolist()
        return predict_proba
    
    def getAUC(self, y_true, y_score):
        return roc_auc_score(y_true.detach().cpu().numpy(), 
                             y_score.detach().cpu().numpy())  

# 训练验证

In [None]:
pretrained_embeddings = text_field.vocab.vectors

model = Enet(pretrained_embeddings)
"""
将前面生成的词向量矩阵拷贝到模型的embedding层
这样就自动的可以将输入的word index转为词向量
"""
# model.embedding.weight.data.copy_(text_field.vocab.vectors)
# model.to(DEVICE)

# 损失函数：
# 二分类：二进制交叉熵损失 BCEWithLogitsLoss
# 多分类：交叉熵函数 CrossEntropyLoss
loss_fn = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters())#,lr=0.000001)

n_epoch = 20

best_val_acc = 0

for epoch in range(n_epoch):

    for batch_idx, batch in enumerate(train_iter):
        data = batch.text
        target = batch.label
        if data.shape[0]!=128:
            break
#         print(data.shape,target.shape)
#         target = torch.sparse.torch.eye(5).index_select(dim=0, index=target.cpu().data)
#         target = target.to(DEVICE)
#         data = data.permute(1,0)
        optimizer.zero_grad()

        out = model(data)
        # out:(128,2)
        print("out shape:",out.shape)
        print("out squeeze shape:", out.squeeze().shape)
        
        target = target.unsqueeze(1)
        target = torch.zeros(128, 2).scatter_(1, target, 1)
        loss = loss_fn(out, target)
        
        loss.backward()
        optimizer.step()

        if (batch_idx+1) %200 == 0:
            _,y_pre = torch.max(out,-1)
            acc = torch.mean((torch.tensor(y_pre == batch.label, dtype=torch.float)))
            print('epoch: %d \t batch_idx : %d \t loss: %.4f \t train acc: %.4f'
                  %(epoch,batch_idx,loss,acc))
    
    val_accs = []
    for batch_idx, batch in enumerate(dev_iter):
        data = batch.text
        target = batch.label
#         target = torch.sparse.torch.eye(5).index_select(dim=0, index=target.cpu().data)
#         target = target.to(DEVICE)
#         data = data.permute(1,0)
        out = model(data)
        
        _,y_pre = torch.max(out,-1)
        acc = torch.mean((torch.tensor(y_pre == target,dtype=torch.float)))
        val_accs.append(acc)
    
    acc = np.array(val_accs).mean()
    if acc > best_val_acc:
        print('val acc : %.4f > %.4f saving model'%(acc,best_val_acc))
#         torch.save(model.state_dict(), 'params.pkl')
        best_val_acc = acc
    print('val acc: %.4f'%(acc))

In [20]:
%%time
# seed_everything()

train_batch_size, val_batch_size = 2**7, 2**7

pretrained_embeddings = text_field.vocab.vectors
model = Enet(pretrained_embeddings)


modelHandlerParams = {}
modelHandlerParams['epoch_num'] = 1000000
modelHandlerParams['train_batch_size'] = train_batch_size
modelHandlerParams['val_batch_size'] = val_batch_size
modelHandlerParams['device'] = DEVICE

modelHandlerParams['model'] = model
modelHandler = ModelHandler(modelHandlerParams)

# 二分类交叉熵
loss_fn = nn.BCEWithLogitsLoss().to(DEVICE)
# 调参地方，分别调整为0.1,0.01,0.001，最优为0.01
optimizer = optim.Adam(model.parameters(), lr=0.01,
                       weight_decay=0.00001) # lr sets the learning rate of the optimizer

modelHandler.fit(train_iter=train_iter, dev_iter=dev_iter,loss_fn=loss_fn,optimizer=optimizer,
                 early_stopping_rounds=10, verbose=2)

************************* epoch: 0 *************************
torch.Size([128, 40])


NameError: name 'roc_auc_score' is not defined