In [1]:
import torch
import torch.utils.data as Data
import joblib
import torch.nn as nn
import torchtext
import numpy as np
import pandas as pd
import random
import time
from sklearn.metrics import f1_score

In [2]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed``, plus ``torch.cuda.manual_seed`` when CUDA is
    available).

    Parameters
    ----------
    seed : int
        Value passed unchanged to each library's seeding function.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # Seeds the current GPU only; torch.cuda.manual_seed_all(seed) would be
    # the call that covers every visible GPU.
    torch.cuda.manual_seed(seed)


seed = 2022  # value handed to every generator that set_seed() seeds
set_seed(seed)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Tab-separated data files. Later cells read the 'text' column of both frames
# and the 'label' column of the training frame.
train_df = pd.read_csv('../datasets/train_set.csv', sep='\t')
test_df = pd.read_csv('../datasets/test_a.csv', sep='\t')

In [5]:
# Load the word2vec vocabulary object.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load vocabulary files that come from a trusted source.
load_vocal = joblib.load('../data/vocab/vocab_word2vec.pkl')

# Spot-check a few tokens; 0 is printed for a token missing from the vocabulary.
for token in ('349', '3113', '4806'):
    print(load_vocal.get_stoi().get(token, 0))

1149
2416
6336


In [6]:
def split_truncate_pad(string,
                       num_steps,  # maximum sequence length
                       stoi,  # dictionary mapping tokens to indices
                       padding_index,  # index of the padding token in the vocabulary
                       unk_index=0):  # index assigned to out-of-vocabulary tokens
    """Convert a whitespace-separated string into exactly ``num_steps`` indices.

    The string is split on whitespace and every token is looked up in
    ``stoi``. Sequences longer than ``num_steps`` keep their first
    ``num_steps`` entries; shorter ones are right-padded with
    ``padding_index``.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.
    num_steps : int
        Length of the returned list.
    stoi : dict
        Mapping from token to integer index.
    padding_index : int
        Index appended to sequences shorter than ``num_steps``.
    unk_index : int, optional
        Index used for tokens that are absent from ``stoi``. Defaults to 0,
        the value that used to be hard-coded.

    Returns
    -------
    list of int
        A new list of length ``num_steps``.
    """
    line = [stoi.get(word, unk_index) for word in string.split()]
    if len(line) >= num_steps:
        # Truncate: keep the leading num_steps tokens.
        return line[:num_steps]
    # Pad on the right up to the requested length.
    return line + [padding_index] * (num_steps - len(line))


max_len = 6000  # every document is truncated or padded to this many tokens

# Index appended to documents shorter than max_len.
# NOTE(review): presumably the '<pad>' entry of the vocabulary - confirm.
pad_index = 1
# Token -> index dictionary shared by the train and test conversions.
stoi = load_vocal.get_stoi()

X_train_data = train_df['text'].apply(split_truncate_pad, num_steps=max_len,
                                      stoi=stoi, padding_index=pad_index)
X_test_data = test_df['text'].apply(split_truncate_pad, num_steps=max_len,
                                    stoi=stoi, padding_index=pad_index)

y_train = train_df['label'].values

In [7]:
# Wrap the index sequences and their labels as tensors, then serve them in
# shuffled mini-batches of 256 documents.
features_tr = torch.tensor(X_train_data.values.tolist())
labels_tr = torch.tensor(y_train)
dataset_tr = Data.TensorDataset(features_tr, labels_tr)
dataloader_tr = Data.DataLoader(dataset_tr, batch_size=256, shuffle=True)

# Look at the first batch only, as a sanity check on the tensor shapes.
for batch in dataloader_tr:
    print(batch)
    print(batch[0].shape)  # [batch_size, num_steps]
    print(batch[1].shape)  # [batch_size, ]
    break

[tensor([[  28,  820,  358,  ...,    1,    1,    1],
        [ 144, 1643,   71,  ...,    1,    1,    1],
        [ 301,  691,   61,  ...,    1,    1,    1],
        ...,
        [ 281,  795,  359,  ...,    1,    1,    1],
        [1254,  348,  630,  ...,    1,    1,    1],
        [  15,  133,  371,  ...,    1,    1,    1]]), tensor([ 0,  0,  8,  7,  1,  3,  7,  0,  1,  7,  5,  2,  2,  0,  2,  3,  2,  4,
         1,  2,  2,  5,  0,  4,  0,  0,  0,  1,  0,  0,  2,  7,  8,  2,  1,  3,
         4,  1,  1,  1, 11,  6,  0,  1,  7,  2,  5,  6,  3,  3,  4,  1,  6,  2,
         0,  2,  5,  0,  0,  2, 11,  4,  8,  0,  1,  4,  5,  6,  1,  1,  2,  0,
         4,  1,  4, 10,  4,  5,  4,  4,  2,  0,  5,  1,  2,  0,  5,  5,  8,  1,
         3,  2,  0,  3,  5,  4,  1,  0,  3,  1,  7,  2,  3,  1,  3,  7,  1, 10,
         1, 10,  3,  2,  7, 10,  1,  4,  2,  1,  0,  0,  5,  1,  0,  2,  2,  8,
         5,  1,  1,  3,  0,  1, 11,  2,  3,  2,  8,  3,  6,  0,  1,  0,  3,  0,
         8,  0, 10,  0,  2,  3, 

In [8]:
# Load the pre-trained word-vector file
vector = torchtext.vocab.Vectors(name="cnew_300.txt",
                                 cache='word2vec')

# Look up one vector for each token returned by the vocabulary's get_itos()
pretrained_vector = vector.get_vecs_by_tokens(load_vocal.get_itos())
pretrained_vector

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8654,  1.0556, -1.1478,  ...,  2.9752, -1.3487, -1.1243],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [9]:
class TextCNN(nn.Module):
    """
    PyTorch implementation of a TextCNN classifier (adapt to the task at hand).

    Every token index is looked up in two embedding tables of identical shape.
    The two vectors are concatenated, passed through several parallel 1-D
    convolutions, max-pooled over the sequence axis, and the pooled features
    go through dropout and a linear layer that emits one score per class.

    Parameters
    ---------
    vocab_size : int
        Number of entries in the vocabulary.
    embed_size : int
        Dimensionality of each word vector.
    kernel_sizes : tuple
        Width of each convolution, e.g. ``kernel_sizes=(3, 4, 5)``. A mix of
        different widths usually works better than repeating a single width.
    num_channels : tuple
        Output channels of each convolution, e.g.
        ``num_channels=(100, 100, 100)``. Must be as long as ``kernel_sizes``.
    dropout_ratio : float
        ``p`` of the dropout layer placed before the output layer.
    num_classes : int
        Number of scores produced per sample. Defaults to 14, the value that
        used to be hard-coded.
    padding_idx : int or None
        ``padding_idx`` of the first embedding table. Defaults to 0, the value
        that used to be hard-coded.
        NOTE(review): the notebook pads sequences with index 1, so the caller
        probably wants ``padding_idx=1`` - confirm against the vocabulary.

    Raises
    ------
    ValueError
        If ``kernel_sizes`` and ``num_channels`` differ in length. Previously
        the extra entries were silently dropped by ``zip``.
    """

    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels, dropout_ratio=0.5,
                 num_classes=14, padding_idx=0):
        super().__init__()
        if len(kernel_sizes) != len(num_channels):
            raise ValueError(
                'kernel_sizes and num_channels must have the same length, got {} and {}'.format(
                    len(kernel_sizes), len(num_channels)))
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        # Second table, intended to hold the pre-trained word vectors
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(p=dropout_ratio)
        # One input feature per convolution output channel
        self.decoder = nn.Linear(sum(num_channels), num_classes)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        # One 1-D convolution per (channels, width) pair
        self.convs = nn.ModuleList()
        for out_channels, kernel_size in zip(num_channels, kernel_sizes):
            self.convs.append(
                # The two embeddings are concatenated, hence in_channels=2 * embed_size
                nn.Conv1d(in_channels=2 * embed_size, out_channels=out_channels, kernel_size=kernel_size))

    def forward(self, inputs):
        """Return class scores of shape (N, num_classes) for indices of shape (N, L).

        The convolutions use no padding, so L must be at least the largest
        kernel width.
        """
        # Concatenate both embeddings along the feature axis: (N, L, 2 * C),
        # where C is the word-vector size
        embeddings = torch.cat((self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Conv1d expects channels on the second axis: (N, 2 * C, L)
        embeddings = embeddings.permute(0, 2, 1)
        # For each convolution: conv -> (N, out_channels, L_out),
        # pool -> (N, out_channels, 1), relu + squeeze -> (N, out_channels).
        # Concatenating over the convolutions gives (N, sum(num_channels)).
        encoding = torch.cat([torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1) for conv in self.convs],
                             dim=1)
        # outputs.shape=(N, num_classes)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [10]:
# Convolution widths and the number of output channels paired with each width.
kernel_sizes = [3, 4, 5]
nums_channels = [128, 128, 128]

# The embedding tables take their shape from the pre-trained matrix.
vocab_size, embed_size = pretrained_vector.shape
net = TextCNN(vocab_size, embed_size, kernel_sizes, nums_channels)
with torch.no_grad():
    # Initialise both embedding tables from the pre-trained word vectors.
    net.embedding.weight.copy_(pretrained_vector)
    net.constant_embedding.weight.copy_(pretrained_vector)
# Freeze the second table so that it is excluded from training.
net.constant_embedding.weight.requires_grad = False
net = net.to(device)

lr = 0.001
# NOTE(review): num_epochs is not read by the visible cells; the training loop
# uses EPOCHS instead.
num_epochs = 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
criterion_cross_entropy = nn.CrossEntropyLoss()

In [11]:
# Model training
def train(model, dataloader, criterion, optimizer, device, log_interval=50):
    """Run one optimisation pass over ``dataloader``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is switched to training mode.
    dataloader : iterable
        Yields ``(text, labels)`` batches.
    criterion : callable
        Loss called as ``criterion(model(text), labels)``.
    optimizer : torch.optim.Optimizer
        Optimiser stepped once per batch.
    device : torch.device
        Device each batch is moved to before the forward pass.
    log_interval : int or None, optional
        Print the loss and micro-averaged f1 of the current batch every
        ``log_interval`` steps (step 0 is never printed). Defaults to 50, the
        value that used to be hard-coded; ``0`` or ``None`` disables printing.

    Returns
    -------
    None
    """
    model.train()

    for idx, (text, labels) in enumerate(dataloader):
        # Move the batch to the target device
        text = text.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        out = model(text)
        loss = criterion(out, labels)  # loss of this step
        loss.backward()
        optimizer.step()

        if log_interval and idx > 0 and idx % log_interval == 0:
            # Named `predicted` so the local does not shadow the notebook's predict() function
            predicted = out.argmax(dim=1).cpu().numpy()
            f1 = f1_score(labels.cpu().numpy(), predicted, average='micro')  # metric on this batch only
            print('| step {:5d} | loss {:8.3f} | f1 {:8.3f} |'.format(idx, loss.item(), f1))

In [12]:
EPOCHS = 5  # number of full passes over the training loader

separator = '-' * 37
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(net, dataloader_tr, criterion_cross_entropy, optimizer, device)
    print(separator)
    # Wall-clock time of the epoch that just finished
    elapsed = time.time() - epoch_start_time
    print('| end of epoch {:5d} | time: {:5.2f}s |'.format(epoch, elapsed))
    print(separator)

| step    50 | loss    1.657 | f1    0.680 |
| step   100 | loss    1.209 | f1    0.793 |
| step   150 | loss    0.926 | f1    0.797 |
| step   200 | loss    0.775 | f1    0.840 |
| step   250 | loss    0.847 | f1    0.801 |
| step   300 | loss    0.793 | f1    0.797 |
| step   350 | loss    0.558 | f1    0.859 |
| step   400 | loss    0.665 | f1    0.828 |
| step   450 | loss    0.584 | f1    0.832 |
| step   500 | loss    0.481 | f1    0.863 |
| step   550 | loss    0.885 | f1    0.781 |
| step   600 | loss    0.496 | f1    0.852 |
| step   650 | loss    0.549 | f1    0.852 |
| step   700 | loss    0.618 | f1    0.855 |
| step   750 | loss    0.457 | f1    0.855 |
-------------------------------------
| end of epoch     1 | time: 452.32s |
-------------------------------------
| step    50 | loss    0.501 | f1    0.863 |
| step   100 | loss    0.422 | f1    0.867 |
| step   150 | loss    0.443 | f1    0.867 |
| step   200 | loss    0.606 | f1    0.840 |
| step   250 | loss    0.357 |

In [19]:
# Model inference
def predict(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    The model is switched to evaluation mode and gradients are disabled for
    the forward passes.

    Parameters
    ----------
    model : torch.nn.Module
        Network used for the forward passes.
    dataloader : iterable
        Yields one-element batches ``(text,)``.
    device : torch.device
        Device each batch is moved to before the forward pass.

    Returns
    -------
    torch.Tensor
        The per-batch outputs, moved to the CPU and concatenated along dim 0.
    """
    model.eval()

    with torch.no_grad():
        # Each batch is a one-element tuple, hence the `(text,)` unpacking.
        batch_outputs = [model(text.to(device)).cpu() for (text,) in dataloader]

    # Merge the per-batch results into a single tensor.
    return torch.cat(batch_outputs, dim=0)

In [20]:
# The test split has no labels here, so the dataset wraps a single tensor.
features_te = torch.tensor(X_test_data.values.tolist())
dataset_te = Data.TensorDataset(features_te)
dataloader_te = Data.DataLoader(dataset_te, batch_size=64)  # test set loader

# Raw class scores for every test document.
result_pro = predict(net, dataloader_te, device)
result_pro

tensor([[ 9.6695, 23.9957,  1.4339,  ..., -6.5855,  1.1423, -6.3834],
        [ 1.0942,  1.8273, 25.4002,  ..., -0.6347, 12.9418, -3.3128],
        [ 4.6445,  2.7103,  0.3172,  ...,  3.1960, -3.6932, -6.8945],
        ...,
        [ 4.8764, 13.9193,  2.7801,  ..., -8.4020, -0.8942, -6.7221],
        [ 4.3124, -0.0654,  8.0356,  ...,  3.4989, -4.7454, -3.8056],
        [ 6.2089, 15.2082, -0.2523,  ..., -0.6511, -1.1270, -3.5069]])

In [21]:
# Take the highest-scoring class of every row and expose it as a 'label' column.
predicted_classes = np.argmax(result_pro.cpu().numpy(), axis=1)
pro_result_label = pd.DataFrame({'label': predicted_classes})
pro_result_label

Unnamed: 0,label
0,1
1,2
2,8
3,5
4,0
...,...
49995,0
49996,13
49997,1
49998,3


In [22]:
# Recorded f1 scores for different truncation settings:
# first 3000 tokens, f1 score: 0.9352
# first 1500 + last 1500 tokens, f1 score: 0.9343
# first 6000 tokens, f1 score: 0.9352
pro_result_label.to_csv('../predict_result/textcnn.csv', index=False)