In [1]:
import numpy as np
import time
import torch
import pickle as pkl
import torch.nn as nn
from sklearn import metrics
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from datetime import timedelta

In [2]:
#numpy随机数置为1
np.random.seed(1)
#为CPU设置种子用于生成随机数，以使得结果是确定的
torch.manual_seed(1)
#GPU随机数种子置为1
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样

### 加载数据并转为数字

In [3]:
word_to_index=pkl.load(open("./data/vocab.pkl","rb"))

In [4]:
pad_size=32
#读取数据并转成索引
def read_text(path):
    contents,labels=[],[]
    with open(path,"r",encoding="UTF-8") as file:
        for line in file:
            #print(line.split("\t"))
            #获取数据以及数据的标签
            sentence,label=line.split("\t")[0],line.split("\t")[1]
            #将文本数据按字分词
            sentence_list=[]
            for i in sentence:
                sentence_list.append(i)
            #pad_size=32，不够的补齐，超出的去掉
            if len(sentence_list)<pad_size:
                sentence_list.extend([word_to_index["<PAD>"]] * (pad_size - len(sentence_list)))
            else:
                sentence_list=sentence_list[:pad_size]
                #print(len(sentence_list),sentence_list)
            #都转成固定的pad_size大小后，然后将其转为语料库的index
            res_list=[]
            for word in sentence_list:
                if word == 4761:
                    res_list.append(4761)
                    continue
                try: # 能在语料库中找到这个字的添加其索引
                    res_list.append(word_to_index[word])
                except: # 语料库里没有的字换成<UNK>的索引
                    res_list.append(word_to_index["<UNK>"])
            #print(res_list,label)
            contents.append(res_list)
            labels.append(int(label))
    return contents,labels

因为是主动学习，这里我们采用验证集的数据作为模型训练数据，训练集的数据作为模型的未标注数据，测试集数据还作为测试集数据进行模型效果评估

In [5]:
unlabeled_data,unlabeled_label=read_text("./data/train.txt")
train_data,train_label=read_text("./data/dev.txt")
test_data,test_label=read_text("./data/test.txt")

In [6]:
print(len(train_data),len(unlabeled_data),len(test_data))

10000 180000 10000


In [7]:
print(type(train_data),train_data[0])

<class 'list'> [173, 714, 3, 186, 1844, 889, 0, 2641, 80, 2061, 416, 478, 382, 5, 308, 15, 1264, 1344, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761, 4761]


### 定义模型

In [8]:
embedding_pretrained = torch.tensor(np.load( './data/embedding_SougouNews.npz')["embeddings"].astype('float32'))

In [9]:
embedding_pretrained.size()

torch.Size([4762, 300])

In [10]:
num_class=len([x.strip() for x in open('./data/class.txt').readlines()])
num_class

10

In [11]:
class LSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding=nn.Embedding.from_pretrained(embedding_pretrained, freeze=False)
        self.lstm=nn.LSTM(input_size=embedding_pretrained.size(1),hidden_size=128 ,num_layers=2,bidirectional=True, batch_first=True, dropout=0.2)
        self.fc=nn.Linear(128*2,num_class)
        #损失预测层
        self.loss_fc=nn.Linear(128*2*2,1)
        
    def forward(self,x):
        out = self.embedding(x)  # out:[batch_size, seq_len, embedding]=[128, 32, 300],这也是LSTM层(batch_first=True)的输入格式
        out, _ = self.lstm(out)#out接收output，_接收元组(h_n,c_n)
        #定义损失预测层的输入
        loss_input=torch.cat((out[:, 0, :],out[:, -1, :]),1)
        #print(out[:, 0, :].shape,out[:, -1, :].shape,loss_input.shape)
        out = self.fc(out[:, -1, :])  # 句子最后时刻的 hidden state
        #损失预测层最终的loss预测值
        loss_out=self.loss_fc(loss_input)
        return out,loss_out

In [12]:
lstm=LSTM()

### loss、optimizer

In [13]:
optimizer=torch.optim.Adam(lstm.parameters(),lr=0.001)

In [14]:
#lstm层的loss
lstm_loss_fn=nn.functional.cross_entropy

In [15]:
#损失预测层的loss,公式里衡量参数 ξ=1
def loss_loss_fn(outputs,labels,loss_outputs):
    res=torch.tensor(0,dtype=torch.float32)
    res=res.to(device)
    #res=0
    for i in range(0,len(outputs),2):
        true_loss1=lstm_loss_fn(outputs[i].unsqueeze(0), labels[i].unsqueeze(0))
        true_loss2=lstm_loss_fn(outputs[i+1].unsqueeze(0), labels[i+1].unsqueeze(0))
        temp=0
        if true_loss1>true_loss2:
            temp=1
        else:
            temp=-1
        pred_loss=max(torch.tensor(0,dtype=torch.float32),-temp*(loss_outputs[i][0]-loss_outputs[i+1][0])+1)
        res+=pred_loss
    return res

### run

In [16]:
def batch_data(data,label,batch_size=64,shuffle=True):
    tensor_data, tensor_label = map(torch.tensor, (data, label))
    dataset = TensorDataset(tensor_data, tensor_label)
    dataloader=DataLoader(dataset,num_workers=4, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [17]:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [19]:
def train(model,data,label,num_epochs):
    model.train()
    batch_size=64
    train_iter=batch_data(data,label,batch_size)
    
    for index,(trains,labels) in enumerate(train_iter):
        trains=trains.to(device)
        labels=labels.to(device)
        outputs,loss_outputs = model(trains)
        #print(outputs.shape[0])
        if outputs.shape[0]%2 != 0:
            print("batch_size为奇数，不执行")
        else:
            #print(outputs.shape,loss_outputs.shape)#torch.Size([batch_size, 10]) torch.Size([batch_size, 1])
            #获得一个batch数据真实的loss
            true_loss=lstm_loss_fn(outputs, labels)
            #获得一个batch数据预测的loss
            pre_loss_sum=loss_loss_fn(outputs,labels,loss_outputs)
            #print(true_loss,pre_loss_sum)
            loss=true_loss+0.1*2*pre_loss_sum/len(trains)
            #loss=loss.to(device) 若代码执行出错报gpu的错误，将这句加上
            loss.backward()
            optimizer.step()
            model.zero_grad()
        


In [20]:
def test(model,data,label):
    model.eval()
    test_loss=0
    test_iter=batch_data(data,label)
    accuracy=0
    with torch.no_grad():
        for trains,labels in test_iter:
            trains=trains.to(device)
            labels=labels.to(device)
            outputs,loss_outputs = model(trains)
            loss = lstm_loss_fn(outputs, labels)
            test_loss+=loss.item()
            
            #计算测试集的准确率,计算准确率时不能在gpu上计算，得转到cpu上
            true = labels.data.cpu()
            predic = torch.max(outputs.data, 1)[1].cpu()
            train_acc = metrics.accuracy_score(true, predic)
            #print(train_acc)
            accuracy+=train_acc
        print("平均accuracy:",accuracy/len(test_iter),"平均loss:",test_loss/len(test_iter))

In [21]:
def active_learning(model,train_data,train_label,unlabeled_data,unlabeled_label):
    model.eval()
    batch_size=64
    with torch.no_grad():
        unlabeled_iter=batch_data(unlabeled_data,unlabeled_label,batch_size,False)
        res_dict={}
        res_list=[]
        index=0
        for trains,labels in unlabeled_iter:
            trains=trains.to(device)
            labels=labels.to(device)
            outputs,loss_outputs = model(trains)
            #下标当作k，loss值当作v
            for i in range(len(loss_outputs)):
                res_dict[index+i]=loss_outputs[i].item()
            index=index+batch_size
        # 按v值降序降序,排序后变成元组(k,v)组成的list：[('key1', 3), ('key2', 2), ('key3', 1)]
        res_dict=sorted(res_dict.items(),key=lambda x:x[1],reverse = True)
        #每次主动学习取出1000个值加入到标记数据集中，并从未标记数据集中删除
        num=0
        for con in res_dict:
            if num >= 1000:
                break
            res_list.append(con[0])
            num+=1
        #往标注数据集中添加元素
        for i in res_list:
            train_data.append(unlabeled_data[i])
            train_label.append(unlabeled_label[i])
        #从无标注数据集中删除元素(先删角标大的)
        for i in sorted(res_list, reverse=True):
            del(unlabeled_data[i])
            del(unlabeled_label[i])

In [22]:
def run(model,train_data,train_label,test_data,test_label,unlabeled_data,unlabeled_label,num_epochs):
    model.to(device)
    start_time=time.time()
    
    #run
    for epoch in range(num_epochs):
        #训练
        print("第{}次train_data和train_label数量:".format(epoch),len(train_data),len(train_label))
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs),"用时：",get_time_dif(start_time))
        train(model,train_data,train_label,num_epochs)
        #使用测试数据测试模型效果
        test(model,test_data,test_label)
        #active learning
        active_learning(model,train_data,train_label,unlabeled_data,unlabeled_label)

In [23]:
run(lstm,train_data,train_label,test_data,test_label,unlabeled_data,unlabeled_label,10)

第0次train_data和train_label数量: 10000 10000
Epoch [1/10] 用时： 0:00:00
平均accuracy: 0.31468949044585987 平均loss: 1.6836314892313282
第1次train_data和train_label数量: 11000 11000
Epoch [2/10] 用时： 0:01:00
平均accuracy: 0.5592157643312102 平均loss: 1.2530128948248116
第2次train_data和train_label数量: 12000 12000
Epoch [3/10] 用时： 0:02:01
平均accuracy: 0.7203423566878981 平均loss: 0.8462304488109176
第3次train_data和train_label数量: 13000 13000
Epoch [4/10] 用时： 0:03:03
平均accuracy: 0.7852308917197452 平均loss: 0.6804901570271534
第4次train_data和train_label数量: 14000 14000
Epoch [5/10] 用时： 0:04:07
平均accuracy: 0.8212579617834395 平均loss: 0.578421653740725
第5次train_data和train_label数量: 15000 15000
Epoch [6/10] 用时： 0:05:14
平均accuracy: 0.8264331210191083 平均loss: 0.565793888014593
第6次train_data和train_label数量: 16000 16000
Epoch [7/10] 用时： 0:06:22
平均accuracy: 0.8359872611464968 平均loss: 0.5414501155257985
第7次train_data和train_label数量: 17000 17000
Epoch [8/10] 用时： 0:07:29
平均accuracy: 0.8462380573248408 平均loss: 0.5107007655937961
第8次train_