In [1]:
import torch
import torch.utils.data as Data
import joblib
import torch.nn as nn
import torchtext
from sklearn.metrics import accuracy_score
import os
import numpy as np
import sys
import pandas as pd

# On Linux, add this line to temporarily extend the module search path with the
# directory three levels up (under PyCharm the current project is already on
# the search path).
sys.path.append(os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir)))

from tianchi_NewsTextClassification.core.models.textcnn_model import TextCNN
from tianchi_NewsTextClassification.core.utils.train_evaluate import Trainer

# NOTE(review): this executes textcnn_model.py inside the notebook namespace.
# TextCNN is already imported from the package a few lines earlier, so this
# looks redundant (presumably the same module) — confirm before removing.
%run ../models/textcnn_model.py

In [2]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the pickled train inputs, train labels and test inputs, in that order.
# joblib.load unpickles its input, so only point it at trusted files.
X_train, y_train, X_test = (
    joblib.load(pkl_path)
    for pkl_path in (
        '../../intermediate_save_data/X_train.pkl',
        '../../intermediate_save_data/y_train.pkl',
        '../../intermediate_save_data/X_test.pkl',
    )
)

In [4]:
# Load the vocabulary object.
load_vocal = joblib.load('../../intermediate_save_data/vocal.pkl')

# Load the pretrained word-vector file.
vector = torchtext.vocab.Vectors(
    name="cnew_200.txt",
    cache='../../intermediate_save_data',
)

# Look up one vector per vocabulary token, following the vocabulary's
# index-to-token list, so the rows follow the vocabulary's index order.
vocab_tokens = load_vocal.get_itos()
pretrained_vector = vector.get_vecs_by_tokens(vocab_tokens)
pretrained_vector

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.8134e+00, -4.1394e+00,  1.1417e+00,  ...,  3.5465e+00,
          2.9921e-02, -8.0849e-01],
        ...,
        [ 2.7235e-03,  7.6506e-03, -7.8161e-02,  ..., -7.4759e-03,
         -1.0344e-01, -1.2040e-01],
        [ 8.8274e-02,  9.2499e-02, -3.2991e-02,  ..., -1.7648e-02,
         -1.1850e-01, -2.1958e-02],
        [-1.1811e-01,  3.4976e-02,  1.8313e-02,  ..., -7.8549e-02,
         -1.6537e-01, -1.1834e-01]])

In [5]:
# Convolution kernel sizes and output channels.
# NOTE(review): there are four kernel sizes but only three channel counts —
# confirm how TextCNN pairs them (a zip-style pairing would silently ignore
# the kernel size 10).
kernel_sizes = [3, 4, 5, 10]
nums_channels = [256, 256, 256]

# The embedding matrix is (number of tokens, vector width); both are passed
# to the model constructor.
vocab_size, embed_dim = pretrained_vector.shape
net = TextCNN(vocab_size, embed_dim, kernel_sizes, nums_channels)

# Initialise both embedding tables from the pretrained word-vector matrix.
net.embedding.weight.data.copy_(pretrained_vector)
net.constant_embedding.weight.data.copy_(pretrained_vector)
# Freeze this layer so that it does not take part in training.
net.constant_embedding.weight.requires_grad = False
net = net.to(device)

lr = 0.001
num_epochs = 5
optimer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()


def compute_metrics_acc(predict_all, y_true):
    """Compute accuracy from model outputs and reference labels.

    The predicted label for each sample is the argmax of ``predict_all`` along
    its last axis; those labels are compared with ``y_true`` using
    ``accuracy_score``.

    Args:
        predict_all: Model outputs exposing ``argmax(-1)`` (presumably one
            score per class for each sample — confirm against the Trainer).
        y_true: Reference labels, one per sample.

    Returns:
        A dict with the single key ``"acc"`` holding the accuracy.
    """
    predicted_labels = predict_all.argmax(-1)
    return {"acc": accuracy_score(y_true, predicted_labels)}

In [6]:
# Wrap the training inputs and labels as tensors and serve them in shuffled
# mini-batches of 64.
features_tr = torch.tensor(X_train)
labels_tr = torch.tensor(y_train)
dataset_tr = Data.TensorDataset(features_tr, labels_tr)
dataloader_tr = Data.DataLoader(dataset_tr, batch_size=64, shuffle=True)

# Bundle the model, optimizer, loss and epoch count into the project Trainer.
t_and_v = Trainer(
    model=net,
    optimizer=optimer,
    criterion=loss,
    epochs=num_epochs,
)

In [7]:
# Run training on the training loader, scoring with compute_metrics_acc; the
# returned history is shown as the cell output.
t_and_v.train(
    dataloader_tr,
    compute_metrics=compute_metrics_acc,
    verbose=500,
)

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{'Training loss': [0.7525599002838135,
  0.3523303270339966,
  0.23900297284126282,
  0.2608601450920105,
  0.21644048392772675],
 'Training acc': [0.92187, 0.938705, 0.94162, 0.937195, 0.942225]}

In [8]:
# Test dataset: inputs only, served in batches of 64 without shuffling so the
# predictions stay in the same order as the rows of X_test.
features_te = torch.tensor(X_test)
dataset_te = Data.TensorDataset(features_te)
dataloader_te = Data.DataLoader(dataset_te, batch_size=64)

# Raw model outputs for every test sample.
result_pro = t_and_v.predict(dataloader_te, status='Test')
result_pro

tensor([[  6.5208,  33.4337,  -9.1491,  ..., -31.3840, -19.9089, -34.7295],
        [ -0.0969,  -1.5964,  34.2942,  ...,  -4.7506,  18.5967, -20.9246],
        [  4.3830,   0.6824,  -6.7073,  ...,  -3.4819, -10.2862, -21.2170],
        ...,
        [ -1.2743,   9.8537,  -4.1295,  ...,  -5.7697,  -7.8294, -10.9021],
        [  6.6159,  -0.2909,   4.4105,  ...,  -4.5588, -10.7106, -15.7881],
        [  5.1693,  19.0582,  -2.3024,  ..., -13.4357, -10.5318, -16.1603]],
       device='cuda:0')

In [9]:
# Move the model outputs to host memory, take the highest-scoring column per
# row as the predicted label, and store the labels in a one-column DataFrame.
scores_np = result_pro.cpu().numpy()
pre_result_label = pd.DataFrame(np.argmax(scores_np, axis=1), columns=['label'])
pre_result_label

Unnamed: 0,label
0,1
1,2
2,8
3,0
4,0
...,...
49995,0
49996,13
49997,1
49998,3


In [10]:
# Pretrained word-vector weights frozen            => online F1 score: 0.9199
# Pretrained word-vector weights updated normally  => online F1 score: 0.9191
pre_result_label.to_csv(
    '../../output/test_predictions_textcnn_w2v.csv',
    index=False,
)