In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
import pickle as pkl
import tqdm
import models.TextCNN as model

np.random.seed(3407)
torch.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样
torch.backends.cudnn.benchmark = False
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

In [4]:
config = model.Config('ship_data', 'embedding.npz')

In [34]:
# 自定义数据集类，需要实现__len__和__getitem__方法
class CustomDataset(Dataset):
    def __init__(self, data_path, pad_size):
        tokenizer = lambda x: x.split('|')  # word-level
        vocab = pkl.load(open(config.vocab_path, 'rb'))  # 打开词表
        print(f"词表大小: {len(vocab)}")
        class_int_dict = {item: i + 1 for i, item in enumerate(config.class_list)}
        df = pd.read_csv(data_path, usecols=['path', 'cluster'])  # 读取csv
        contents = []
        for index, row in df.iterrows():
            content, label = row['path'], row['cluster']
            token = tokenizer(content)
            seq_len = len(token)
            if pad_size:  # 统一长度
                if seq_len < pad_size:
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            words_line = []
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((words_line, class_int_dict[label], seq_len))
        self.data = contents

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return torch.LongTensor(self.data[i][0]), self.data[i][1], self.data[i][2]


In [35]:
# 创建自定义数据集
dataset = CustomDataset(data_path='./ship_data/test.csv', pad_size=30)

# 定义拆分比例
train_size = int(0.9 * len(dataset))
val_size = int(0.05 * len(dataset))
test_size = len(dataset) - train_size - val_size

# 使用random_split函数拆分数据集
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)
test_loader = DataLoader(test_dataset, batch_size=2)

# 现在，你可以使用train_loader、val_loader和test_loader来迭代你的数据集进行训练、验证和测试。

词表大小: 727


In [37]:
for i, (x, y, _) in enumerate(train_loader):
    print(x, y, _)
    break

tensor([[ 96,  96, 281, 271, 248, 235, 203, 168, 112, 726, 726, 726, 726, 726,
         726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726,
         726, 726],
        [ 74,   4,   2,   2,  14,  35, 229, 340, 347, 327, 315, 279, 304,  81,
         726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726, 726,
         726, 726]]) tensor([3, 1]) tensor([ 9, 14])


In [3]:
# 定义拆分比例
train_size = int(0.9 * len(data))
val_size = int(0.05 * len(data))
test_size = len(data) - train_size - val_size

# 使用random_split函数拆分数据集
train_dataset, val_dataset, test_dataset = random_split(data, [train_size, val_size, test_size])

In [186]:
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=3)
test_loader = DataLoader(test_dataset, batch_size=3)

In [195]:
type(train_dataset[0][2])

int

In [184]:
len(train_dataset[0][0])

30

In [81]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import numpy as np


def conv_and_pool(x, conv):
    x = f.relu(conv(x))
    x = x.squeeze(3)
    x = f.max_pool1d(x, x.size(2)).squeeze(2)
    return x


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc1 = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_filters * len(config.filter_sizes)//2)
        self.fc2=nn.Linear(config.num_filters * len(config.filter_sizes)//2,config.num_filters * len(config.filter_sizes)//4)
        self.fc3=nn.Linear(config.num_filters * len(config.filter_sizes)//4,config.num_classes)

    def forward(self, x):
        out = self.embedding(x)
        out = out.unsqueeze(1)  #插入维度 进行卷积运算
        out = torch.cat([conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc3(self.fc2(self.fc1(out)))
        print(out.shape)
        return out

In [82]:
model = Model(config).to(config.device)
print(model.parameters)

<bound method Module.parameters of Model(
  (embedding): Embedding(727, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(4, 100), stride=(1, 1))
    (3): Conv2d(1, 256, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=8, bias=True)
)>


In [85]:
model(torch.randint(1, 10, [64, 30]).to(config.device)).shape()

torch.Size([64, 8])


TypeError: 'torch.Size' object is not callable