In [1]:
from google.colab import drive
drive.mount('/content/drive/')
import os
# 此处为google drive中的文件路径,drive为之前指定的工作根目录，要加上
os.chdir("/content/drive/My Drive/罪名预测") 

Mounted at /content/drive/


In [2]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pickle
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
num_words = 80000
maxlen = 400
device = "cuda" if torch.cuda.is_available else "cpu"
# fact数据集
fact_test = torch.Tensor(np.load('./data_deal/test_big_fact_pad_seq_%d_%d.npy' % (num_words, maxlen))).to(device)
fact_train = torch.Tensor(np.load('./data_deal/big_fact_pad_seq_%d_%d.npy' % (num_words, maxlen))).to(device)

# 标签数据集
labels_test = torch.Tensor(np.load('./data_deal/test_big_labels_accusation.npy')).to(device)
labels_train = torch.Tensor(np.load('./data_deal/big_labels_accusation.npy')).to(device)

In [4]:
print(fact_train.shape)
print(labels_train.shape)
print(fact_test.shape)
print(labels_test.shape)

torch.Size([154592, 400])
torch.Size([154592, 195])
torch.Size([32508, 400])
torch.Size([32508, 195])


In [10]:
train_ds = TensorDataset(fact_train, labels_train)
train_dl = DataLoader(train_ds, batch_size = 128, shuffle = True)

test_ds = TensorDataset(fact_test, labels_test)
test_dl = DataLoader(test_ds, batch_size = 128, shuffle = True)

In [11]:
class GlobalMaxPool1d(nn.Module):
  # 通过普通的池化来实现全局池化
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
         # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2]) # shape: (batch_size, channel, 1)

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)  
        # self.word_embeddings = self.word_embeddings.from_pretrained(vectors, freeze=False)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 195)
        # 时序最大池化层没有权重，所以可以共用一个实例
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = embedding_dim, 
                                        out_channels = c, 
                                        kernel_size = k))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence.long())
        embeds = embeds.permute(0, 2, 1)
        # 对于每个一维卷积层，在时序最大池化后会得到一个形状为(批量大小, 通道大小, 1)的
        # Tensor。使用flatten函数去掉最后一维，然后在通道维上连结
        encoding = torch.cat([self.pool(F.relu(conv(embeds))).squeeze(-1) for conv in self.convs], dim=1)
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs


In [None]:
device = "cuda" if torch.cuda.is_available else "cpu"
num_words = 80000
maxlen = 400
embedding_dim, kernel_sizes, num_channels = 400, [3, 4, 5], [300, 300, 300]
model = TextCNN(num_words, embedding_dim, kernel_sizes, num_channels).to(device)
print(model)

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

train_losses = []
train_acces = []
# 用数组保存每一轮迭代中，在测试数据上测试的损失值和精确度，也是为了通过画图展示出来。
eval_losses = []
eval_acces = []
batch_count = 0
for epoch in range(100):
    train_loss = 0
    train_acc = 0
    n1=0
    model.train()   # 训练模型
    num_correct = 0
    for fact, label in train_dl:
        # fact = fact.permute(1,0)    # fact转置
        out = model(fact)
        # print(label.shape)
        # print(out.shape)
        loss = loss_func(out, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 记录误差
        train_loss += loss.item()
        # 计算分类的准确率
        # print("out.argmax(dim=1): ", out.argmax(dim=1),'\n',out.argmax(dim=1).shape)
        # print("label: ",label,"\n",label.shape)
        num_correct += (out.argmax(dim=1) == label.argmax(dim=1)).sum().item()
        # out.max()[1]取出概率最大值的索引位置
        n1 += label.shape[0]
        # print("num_correct:", num_correct,"\nn1: ",n1)
        batch_count += 1
    train_losses.append(train_loss / batch_count)
    train_acces.append(num_correct/n1)

    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in test_dl:
            # X = X.permute(1,0)    # fact转置
            if isinstance(model, torch.nn.Module):
                model.eval() # 评估模式, 这会关闭dropout
                acc_sum += (model(X).argmax(dim=1) == y.argmax(dim=1)).float().sum().item()
                model.train() # 改回训练模式
            else: # 自定义的模型
                if('is_training' in model.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (model(X, is_training=False).argmax(dim=1) == y.argmax(dim=1)).float().sum().item() 
                else:
                    acc_sum += (model(X).argmax(dim=1) == y.argmax(dim=1)).float().sum().item() 
            n += y.shape[0]
        eval_acces.append(acc_sum / n)
    
    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}, Eval Acc: {:.6f}'
          .format(epoch, train_loss/batch_count, num_correct/n1, acc_sum / n))
    if epoch%5 == 1:
      torch.save(model.state_dict(),"./model/train_dict/model_dict_{0}_acc:{1}.pth".format(epoch, acc_sum / n))
      torch.save(model,"./model/model_test/model_{0}_acc:{1}.pth".format(epoch, acc_sum / n))


print('训练完成')
plt.plot(np.arange(len(train_losses)), train_losses,label="train loss")

plt.plot(np.arange(len(train_acces)), train_acces, label="train acc")

plt.plot(np.arange(len(eval_losses)), eval_losses, label="eval loss")

plt.plot(np.arange(len(eval_acces)), eval_acces, label="eval acc")
plt.legend() #显示图例
plt.xlabel('epoches')
#plt.ylabel("epoch")
plt.title('Model accuracy&loss')
plt.show()
torch.save(model,'model.pth')

TextCNN(
  (word_embeddings): Embedding(80000, 400)
  (dropout): Dropout(p=0.5, inplace=False)
  (decoder): Linear(in_features=900, out_features=195, bias=True)
  (pool): GlobalMaxPool1d()
  (convs): ModuleList(
    (0): Conv1d(400, 300, kernel_size=(3,), stride=(1,))
    (1): Conv1d(400, 300, kernel_size=(4,), stride=(1,))
    (2): Conv1d(400, 300, kernel_size=(5,), stride=(1,))
  )
)
epoch: 0, Train Loss: 4.681077, Train Acc: 0.055947, Eval Acc: 0.127476
epoch: 1, Train Loss: 2.110799, Train Acc: 0.133319, Eval Acc: 0.260982
epoch: 2, Train Loss: 1.284687, Train Acc: 0.230452, Eval Acc: 0.370801
epoch: 3, Train Loss: 0.870576, Train Acc: 0.318386, Eval Acc: 0.425834
epoch: 4, Train Loss: 0.629788, Train Acc: 0.381275, Eval Acc: 0.465732
epoch: 5, Train Loss: 0.479988, Train Acc: 0.423521, Eval Acc: 0.495540
epoch: 6, Train Loss: 0.381327, Train Acc: 0.455573, Eval Acc: 0.514550
epoch: 7, Train Loss: 0.313551, Train Acc: 0.478246, Eval Acc: 0.530608
epoch: 8, Train Loss: 0.264270, Tra

In [None]:
torch.save(model,'model.pth')

NameError: ignored