In [1]:
import torch
import numpy as np
import pandas as pd
import joblib
import torchtext
import torch.nn as nn
from sklearn.metrics import accuracy_score
import torch.utils.data as Data
from torch.utils.data.dataloader import default_collate
import sys
import os

# On Linux, append a temporary module search path so the project package below
# can be imported (under PyCharm the current project is already on the search path).
sys.path.append(os.path.abspath(".." + os.sep + ".." + os.sep + ".."))

from tianchi_NewsTextClassification.core.models.textrnn_model import TextRNN
from tianchi_NewsTextClassification.core.utils.train_evaluate import Trainer

# NOTE(review): these %run magics execute source files whose names match the two
# modules imported just above; presumably they re-define TextRNN / Trainer in the
# notebook namespace so that local edits are picked up — confirm whether both
# the imports and the %run lines are really needed.
%run ../models/textrnn_model.py
%run ../utils/train_evaluate.py

In [2]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the cached training inputs, training labels and test inputs.
# NOTE(review): joblib.load unpickles the file contents; only load files from a trusted source.
X_train, y_train, X_test = [
    joblib.load(pkl_path)
    for pkl_path in (
        '../../intermediate_save_data/X_train.pkl',
        '../../intermediate_save_data/y_train.pkl',
        '../../intermediate_save_data/X_test.pkl',
    )
]

In [4]:
# Load the saved vocabulary object
load_vocal = joblib.load('../../intermediate_save_data/vocal.pkl')

# Load the pretrained word-vector file
vector = torchtext.vocab.Vectors(
    name="cnew_200.txt",
    cache='../../intermediate_save_data',
)

# Fetch one pretrained vector per vocabulary token, following the token list
# returned by the vocabulary's get_itos()
vocal_tokens = load_vocal.get_itos()
pretrained_vector = vector.get_vecs_by_tokens(vocal_tokens)
pretrained_vector

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.8134e+00, -4.1394e+00,  1.1417e+00,  ...,  3.5465e+00,
          2.9921e-02, -8.0849e-01],
        ...,
        [ 2.7235e-03,  7.6506e-03, -7.8161e-02,  ..., -7.4759e-03,
         -1.0344e-01, -1.2040e-01],
        [ 8.8274e-02,  9.2499e-02, -3.2991e-02,  ..., -1.7648e-02,
         -1.1850e-01, -2.1958e-02],
        [-1.1811e-01,  3.4976e-02,  1.8313e-02,  ..., -7.8549e-02,
         -1.6537e-01, -1.1834e-01]])

In [5]:
# Model hyper-parameters. The vocabulary and embedding sizes are taken from the
# pretrained matrix so that it can be copied into the embedding layer below.
vocal_size, embedding_size = pretrained_vector.shape
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 14  # passed to TextRNN as out_size; presumably the number of target classes
num_layers = 2

net = TextRNN(vocab_size=vocal_size,
              embedding_size=embedding_size,
              hidden_size=hidden_size,
              num_layers=num_layers,
              dropout_ratio=dropout,
              bidirectional=bidirectional,  # use the setting above instead of a hard-coded True
              out_size=out_size)
net.embed.weight.data.copy_(pretrained_vector)  # initialise the embedding with the pretrained word-vector matrix
net = net.to(device)

lr, num_epochs = 0.001, 5
embed_lr = 0.00025  # the pretrained word vectors are fine-tuned with a lower learning rate

# Every parameter except the embedding weight trains at the base learning rate.
params_1x = [param for name, param in net.named_parameters() if name != "embed.weight"]
optimer = torch.optim.Adam([{'params': params_1x, 'lr': lr},
                            {'params': net.embed.parameters(), 'lr': embed_lr}])
loss = nn.CrossEntropyLoss()


def compute_metrics_acc(predict_all, y_true):
    """Compute classification accuracy from per-class scores.

    Args:
        predict_all: per-class scores with the class dimension last; a
            ``torch.Tensor`` (on any device) or anything ``np.asarray`` accepts.
        y_true: true class indices; a ``torch.Tensor`` or array-like.

    Returns:
        dict: ``{"acc": accuracy}`` where accuracy is the value returned by
        ``sklearn.metrics.accuracy_score``.
    """
    def _as_numpy(values):
        # Tensors that live on the GPU or track gradients cannot be converted
        # to NumPy directly, so detach them and move them to host memory first.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    predict = _as_numpy(predict_all).argmax(-1)
    label = _as_numpy(y_true)
    acc = accuracy_score(label, predict)
    return {"acc": acc}

In [6]:
def collate_fun(data):
    """Collate (text, label) samples and swap the first two axes of the text batch.

    Args:
        data: list of ``(text, label)`` samples as produced by the dataset.

    Returns:
        tuple: ``(text, label)`` where ``text`` is the default-collated text
        batch with dimensions 0 and 1 transposed, and ``label`` is the
        default-collated label batch.
    """
    batch_text, batch_label = default_collate(data)
    return torch.transpose(batch_text, 0, 1), batch_label


# Wrap the training arrays as tensors and batch them (64 samples per batch, shuffled).
dataset_tr = Data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
dataloader_tr = Data.DataLoader(
    dataset_tr,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fun,
)

# Sanity check: print the shapes of the first batch only.
first_text, first_label = next(iter(dataloader_tr))
print(first_text.shape)
print(first_label.shape)

torch.Size([3000, 64])
torch.Size([64])


In [7]:
# Hand the model, optimizer, loss function and epoch count to the project's Trainer.
t_and_v = Trainer(
    model=net,
    optimizer=optimer,
    criterion=loss,
    epochs=num_epochs,
)

In [8]:
# Train on the training loader.
# NOTE(review): the meaning of verbose=500 and estimate_train=False is defined by
# the project's Trainer.train — presumably a logging interval and a switch for
# evaluating on the training set; confirm against train_evaluate.py.
t_and_v.train(
    dataloader_tr,
    compute_metrics=compute_metrics_acc,
    verbose=500,
    estimate_train=False,
)

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{}

In [9]:
def collate_fun_test(data):
    """Collate label-free samples and swap the first two axes of the text batch.

    Args:
        data: list of one-element samples ``(text,)`` as produced by the dataset.

    Returns:
        tuple: a one-element tuple holding the default-collated text batch with
        dimensions 0 and 1 transposed.
    """
    collated = default_collate(data)
    seq_first = torch.transpose(collated[0], 0, 1)
    return (seq_first,)


# Wrap the test array as a tensor dataset and batch it in its stored order (64 per batch).
dataset_te = Data.TensorDataset(torch.tensor(X_test))
dataloader_te = Data.DataLoader(
    dataset_te,
    batch_size=64,
    collate_fn=collate_fun_test,
)

In [10]:
# Run the trainer's predict step over the test loader and display the returned scores.
result_pro = t_and_v.predict(
    dataloader_te,
    status='Test',
)
result_pro

tensor([[-0.0696,  9.6873, -4.6693,  ..., -8.7026, -4.8898, -9.0701],
        [-0.0979, -1.9580, 12.9226,  ..., -4.6836,  2.2152, -8.3977],
        [ 0.9576, -0.2485, -3.2080,  ...,  0.3543, -6.8835, -4.0182],
        ...,
        [-1.1301,  6.6643, -3.7873,  ..., -4.8817, -4.4219, -6.7376],
        [-0.2524, -1.9369,  0.0388,  ..., -0.4760, -4.9485, -4.7881],
        [-0.0329,  7.9241, -4.4056,  ..., -5.5392, -3.9979, -6.1183]],
       device='cuda:0')

In [11]:
# Take the highest-scoring class index for each test row and store it in a
# single-column DataFrame named 'label'.
pre_result_label = pd.DataFrame(
    {'label': np.argmax(result_pro.cpu().numpy(), axis=1)}
)
pre_result_label

Unnamed: 0,label
0,1
1,2
2,8
3,5
4,0
...,...
49995,0
49996,13
49997,1
49998,3


In [12]:
# Without pretrained word-vector weights => online F1 score: 0.9360
# Pretrained word-vector weights + gradient clipping (against exploding gradients) => online F1 score: 0.9484
# Pretrained word-vector weights (learning rate 0.0001) + gradient clipping (against exploding gradients) => online F1 score: 0.9405
# Pretrained word-vector weights (learning rate 0.00025) + gradient clipping (against exploding gradients) => online F1 score: 0.9414
output_path = '../../output/test_predictions_textrnn_w2v.csv'
# to_csv does not create missing parent directories; make sure the output
# folder exists so the predictions are not lost after a long training run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pre_result_label.to_csv(output_path, index=False)