In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
import pickle as pkl
import tqdm
import models.TextCNN as model

np.random.seed(3407)
torch.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样
torch.backends.cudnn.benchmark = False
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

In [None]:
config = model.Config('ship_data', 'embedding.npz')

In [24]:
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f


class Config(object):
    """配置参数"""

    def __init__(self, dataset, embedding, notes):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/train_dataset.csv'
        self.val_path = dataset + '/val_dataset.csv'
        self.test_path = dataset + '/test_dataset.csv'
        self.class_list = [x.strip() for x in
                           open(f'{dataset}/pre_data/class.txt', encoding='utf-8').readlines()]  # 类别名单
        self.vocab_path = dataset + '/pre_data/vocab.pkl'  # 词表
        self.save_path = f'./result/{self.model_name}_{notes}.ckpt'  # 模型训练结果
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + '/pre_data/' + embedding)["embeddings"].astype('float32')) \
            if embedding != 'random' else None  # 预训练词向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # 设备
        self.is_random = "random" if embedding == "random" else "not_random"

        self.dropout = 0.5  # 随机失活
        self.require_improvement = 1  # 若超过1000batch效果还没提升，则提前结束训练
        self.num_classes = len(self.class_list)  # 类别数
        self.n_vocab = 0  # 词表大小，在运行时赋值
        self.num_epochs = 1000  # epoch数
        self.batch_size = 512  # mini-batch大小
        self.pad_size = 30  # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3  # 学习率
        self.embed = self.embedding_pretrained.size(1) \
            if self.embedding_pretrained is not None else 20  # 字向量维度
        self.filter_sizes = (2, 3, 4, 5)  # 卷积核尺寸
        self.num_filters = 256  # 卷积核数量(channels数)


'''Convolutional Neural Networks for Sentence Classification'''


def conv_and_pool(x, conv):
    x = f.relu(conv(x))
    x = x.squeeze(3)
    x = f.max_pool1d(x, x.size(2)).squeeze(2)
    return x


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding_1 = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding_1 = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc_layers = nn.Sequential(
            nn.Linear(config.num_filters * len(config.filter_sizes),
                      config.num_filters * len(config.filter_sizes) // 2),
            nn.Linear(config.num_filters * len(config.filter_sizes) // 2,
                      config.num_filters * len(config.filter_sizes) // 4),
            nn.Linear(config.num_filters * len(config.filter_sizes) // 4, config.num_classes))

    def forward(self, x):
        out = self.embedding_1(x)
        out = out.unsqueeze(1)  # 插入维度 进行卷积运算
        out = torch.cat([conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc_layers(out)
        return out


In [5]:
# coding: UTF-8
import argparse
import pickle as pkl
from importlib import import_module

import numpy as np
import torch
from torch.utils.data import DataLoader
from torchinfo import summary

from train_eval import train, init_network
from utils import CustomDataset

# 随机种子设置
np.random.seed(3407)
torch.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

dataset = 'ship_data'
embedding = 'embedding.npz'

notes = 'test'
model_name = 'TextCNN'

config = Config(dataset, embedding, notes)

# 创建自定义数据集
train_dataset = CustomDataset(config, data_type='test')
val_dataset = CustomDataset(config, data_type='val')
vocab = pkl.load(open(config.vocab_path, 'rb'))

# 数据加载器
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size)



In [25]:

config = Config(dataset, embedding, notes)
# 训练
config.n_vocab = len(vocab)
model = Model(config).to(config.device)
# 初始化模型参数
if model_name != 'Transformer':
    init_network(model)

print(summary(model, input_size=(1, 30), dtypes=[torch.long]))
print(model.embedding_2.weight)
train(config, model, train_loader, val_loader, notes)
print(model.embedding_2.weight)

Layer (type:depth-idx)                   Output Shape              Param #
Model                                    [1, 5]                    --
├─Embedding: 1-1                         [1, 30, 100]              72,700
├─ModuleList: 1-2                        --                        --
│    └─Conv2d: 2-1                       [1, 256, 29, 1]           51,456
│    └─Conv2d: 2-2                       [1, 256, 28, 1]           77,056
│    └─Conv2d: 2-3                       [1, 256, 27, 1]           102,656
│    └─Conv2d: 2-4                       [1, 256, 26, 1]           128,256
├─Dropout: 1-3                           [1, 1024]                 --
├─Sequential: 1-4                        [1, 5]                    --
│    └─Linear: 2-5                       [1, 512]                  524,800
│    └─Linear: 2-6                       [1, 256]                  131,328
│    └─Linear: 2-7                       [1, 5]                    1,285
Total params: 1,089,537
Trainable params: 1,089,53

AttributeError: 'Model' object has no attribute 'embedding_2'

In [29]:
model=Model(config)
model(torch.randint(1, 10, [64, 30])).size()

torch.Size([64, 5])