In [18]:
from sklearn.model_selection import StratifiedKFold
import torch
import torch.utils.data as Data
import joblib
import torch.nn as nn
import torchtext
from sklearn.metrics import accuracy_score
import os
import numpy as np
import sys
import pandas as pd

# On Linux, temporarily add the directory three levels above the working
# directory to the module search path so the project imports below resolve
# (PyCharm already puts the current project on the search path).
sys.path.append(os.path.abspath(os.path.join("..", "..", "..")))

from tianchi_NewsTextClassification.core.models.textcnn_model import TextCNN
from tianchi_NewsTextClassification.core.utils.train_evaluate import Trainer

In [19]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [20]:
# Load the preprocessed train inputs, train labels and test inputs saved by an
# earlier step, in that order.
# NOTE(review): joblib.load unpickles the files, which can execute arbitrary
# code — only load artifacts produced by this project.
X_train, y_train, X_test = map(
    joblib.load,
    (
        '../../intermediate_save_data/X_train.pkl',
        '../../intermediate_save_data/y_train.pkl',
        '../../intermediate_save_data/X_test.pkl',
    ),
)

In [21]:
# Load the saved vocabulary object.
load_vocal = joblib.load('../../intermediate_save_data/vocal.pkl')

# Load the pretrained word-vector file from the intermediate data directory.
vector = torchtext.vocab.Vectors("cnew_200.txt", cache='../../intermediate_save_data')

# Build the embedding matrix: one row per vocabulary token, in vocabulary index order.
# NOTE(review): tokens without a pretrained vector presumably get an all-zero
# row (the first rows of the displayed tensor are zeros) — confirm.
pretrained_vector = vector.get_vecs_by_tokens(tokens=load_vocal.get_itos())
pretrained_vector

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.8134e+00, -4.1394e+00,  1.1417e+00,  ...,  3.5465e+00,
          2.9921e-02, -8.0849e-01],
        ...,
        [ 2.7235e-03,  7.6506e-03, -7.8161e-02,  ..., -7.4759e-03,
         -1.0344e-01, -1.2040e-01],
        [ 8.8274e-02,  9.2499e-02, -3.2991e-02,  ..., -1.7648e-02,
         -1.1850e-01, -2.1958e-02],
        [-1.1811e-01,  3.4976e-02,  1.8313e-02,  ..., -7.8549e-02,
         -1.6537e-01, -1.1834e-01]])

In [22]:
# Seed PyTorch's global RNG so layer initialisation, dropout and DataLoader
# shuffling are repeatable under Restart & Run All. The value reuses the seed
# already chosen for the cross-validation splitter.
SEED = 666
torch.manual_seed(SEED)

# Stratified 5-fold splitter (not referenced by the cells visible here).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Convolution kernel sizes and output channels.
# NOTE(review): kernel_sizes has 4 entries but nums_channels has only 3. If
# TextCNN pairs them with zip(), the size-10 kernel is silently dropped —
# confirm against TextCNN and align the two lists.
kernel_sizes, nums_channels = [3, 4, 5, 10], [256, 256, 256]
net = TextCNN(pretrained_vector.shape[0], pretrained_vector.shape[1], kernel_sizes, nums_channels)

# Initialise both embedding layers from the pretrained word-vector matrix.
# The in-place copy runs under no_grad instead of going through `.data`.
with torch.no_grad():
    net.embedding.weight.copy_(pretrained_vector)
    net.constant_embedding.weight.copy_(pretrained_vector)
# Freeze the constant embedding so it does not take part in training.
net.constant_embedding.weight.requires_grad = False
net = net.to(device)

lr, num_epochs = 0.001, 5
# Hand the optimizer only the trainable parameters; the frozen embedding is
# excluded explicitly rather than relying on it never receiving a gradient.
optimer = torch.optim.Adam((p for p in net.parameters() if p.requires_grad), lr=lr)
loss = nn.CrossEntropyLoss()


def compute_metrics_acc(predict_all, y_true):
    """Compute classification accuracy from per-class scores.

    Args:
        predict_all: Per-sample class scores; the predicted class is the
            argmax over the last axis.
        y_true: Ground-truth labels, one per sample.

    Returns:
        dict: ``{"acc": accuracy}`` as computed by ``accuracy_score``.
    """
    predicted_labels = predict_all.argmax(-1)
    return {"acc": accuracy_score(y_true, predicted_labels)}

In [23]:
# Wrap the training inputs and labels as tensors and serve them in shuffled
# mini-batches of 64.
dataset_tr = Data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
dataloader_tr = Data.DataLoader(dataset_tr, batch_size=64, shuffle=True)

# Bundle the model, optimizer, loss and epoch count into the project Trainer.
t_and_v = Trainer(
    model=net,
    optimizer=optimer,
    criterion=loss,
    epochs=num_epochs,
)

In [24]:
# Train on the full training loader; the returned history is shown as the
# cell output.
# NOTE(review): verbose=500 is presumably a logging interval in batches — confirm in Trainer.
t_and_v.train(
    dataloader_tr,
    compute_metrics=compute_metrics_acc,
    verbose=500,
)

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{'Training loss': [0.2938196063041687,
  0.2223615199327469,
  0.21034269034862518,
  0.21819621324539185,
  0.2022916078567505],
 'Training acc': [0.91013, 0.93507, 0.940945, 0.93893, 0.94363]}

In [25]:
# Test dataset: inputs only, batched by 64 with no shuffling requested, so the
# rows come out in the same order as X_test.
dataset_te = Data.TensorDataset(torch.tensor(X_test))
dataloader_te = Data.DataLoader(dataset_te, batch_size=64)

# Per-class scores for every test sample (one row per sample).
result_pro = t_and_v.predict(dataloader_te, status='Test')
result_pro

tensor([[  2.9219,  21.3851,  -5.3220,  ..., -14.6076,  -7.1964, -14.0717],
        [ -1.5998,  -3.4869,  25.5887,  ...,  -9.3767,  13.2178, -22.2694],
        [ -0.3742,  -3.3720, -12.6544,  ...,  -6.2024,  -9.8064, -16.5784],
        ...,
        [  1.3224,  11.1919,  -2.3225,  ..., -10.7451,  -8.2783, -10.2868],
        [  3.9415,  -1.9201,   3.8571,  ...,   0.6587,  -6.3960, -12.3854],
        [  2.6442,  11.3428,  -2.7230,  ...,  -3.2815,  -1.0528,  -6.9574]],
       device='cuda:0')

In [26]:
# Move the scores to host memory, take the highest-scoring class per row and
# store the class indices in a single-column 'label' frame.
pre_result_label = pd.DataFrame({'label': result_pro.cpu().numpy().argmax(axis=1)})
pre_result_label

Unnamed: 0,label
0,1
1,2
2,8
3,5
4,0
...,...
49995,0
49996,13
49997,1
49998,3


In [27]:
# Write the predicted labels to CSV without the index column. The output
# directory is created first so a missing folder cannot fail the export after
# training has already finished.
output_path = '../../output/test_predictions_textcnn_w2v.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pre_result_label.to_csv(output_path, index=False)