In [18]:
import numpy as np
import torch
import torch.nn as nn

import models.ShipRNN as model

np.random.seed(3407)
torch.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样
torch.backends.cudnn.benchmark = False
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

In [19]:
from utils import BertDataConfig, BertDataset

data_config = BertDataConfig()
val_dataset = BertDataset(data_config, data_class='val')

In [20]:
from utils import DataConfig

data_config = DataConfig('word2vec')
model_config = model.ModelConfig()

In [69]:
class Model(nn.Module):
    def __init__(self, model_config, data_config):
        super(Model, self).__init__()
        print(data_config.n_vocab, data_config.embed)
        self.embedding = nn.Embedding.from_pretrained(
            data_config.embedding_pretrained,
            freeze=False) if data_config.embedding_pretrained is not None else nn.Embedding(data_config.n_vocab,
                                                                                            data_config.embed,
                                                                                            padding_idx=data_config.n_vocab - 1)
        self.lstm = nn.LSTM(data_config.embed, model_config.hidden_size, model_config.num_layers,
                            bidirectional=True, batch_first=True, dropout=model_config.dropout)
        self.bn = nn.BatchNorm1d(model_config.hidden_size * 2)
        self.avg_pool = nn.AvgPool1d(data_config.pad_size)
        self.mutilatte = nn.MultiheadAttention(embed_dim=model_config.hidden_size * 2 + data_config.embed, num_heads=6,
                                               batch_first=True)
        self.fc = nn.Linear(model_config.hidden_size * 2 + data_config.embed, data_config.num_classes)

    def forward(self, x):
        embed = self.embedding(x)  # [batch_size, seq_len, embeding]
        out, _ = self.lstm(embed)  # 左右双向
        out1 = torch.cat((embed, out), 2)
        out1 = F.gelu(out1)
        out2, _ = self.mutilatte(out1, out1, out1)
        out2 = out2.permute(0, 2, 1)
        out2 = self.avg_pool(out2).squeeze()
        out2 = F.gelu(out2)
        out2 = self.fc(out2)  # 句子最后时刻的 hidden state
        return out2


In [70]:
model = Model(model_config, data_config).to(data_config.device)

0 100


In [71]:
model(torch.randint(1, 10, [2, 30]).to(data_config.device)).size()

torch.Size([2, 5])

In [48]:
def init_network(model, method='xavier', exclude='embedding'):
    for name, w in model.named_parameters():
        if exclude not in name:  # 如果不是嵌入层
            if 'weight' in name:  # weight 三种初始化方式
                if method == 'xavier' and len(w.size() < 2):
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:  # bias 置0
                nn.init.constant_(w, 0)
            else:
                pass

In [None]:
init_network(model)

In [None]:
from torchinfo import summary

summary(model, input_size=(1, 30), dtypes=[torch.long])

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

path = './test_data/'
# 读取CSV文件
df = pd.read_csv(f'{path}chn_text.csv')  # 替换为你的CSV文件路径

# 划分数据集
train_df, temp_df = train_test_split(df, test_size=0.1, random_state=3407)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=3407)

# 保存划分后的数据集为新的CSV文件
train_df.to_csv(f'{path}train_dataset.csv', index=False)
val_df.to_csv(f'{path}val_dataset.csv', index=False)
test_df.to_csv(f'{path}test_dataset.csv', index=False)


In [None]:
import torch
import torch.nn.functional as F

# 假设你的输入是一个大小为[2, 30, 100]的tensor
input_tensor = torch.randn(2, 30, 100)

# 执行最大池化操作，保留最大的两个值
output_tensor = F.max_pool1d(input_tensor, kernel_size=2, stride=1)

print(output_tensor.shape)  # 输出应为[2, 2, 100]


In [None]:
from datasets import load_dataset

dataset = load_dataset("csv",
                       data_files={"train": "./ship_data/train_dataset.csv", "test": "./ship_data/test_dataset.csv",
                                   "val": "./ship_data/val_dataset.csv"})

In [None]:
dataset['train'][0]

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=5)

In [None]:
all_layers = list(model.children())

In [None]:
all_layers

In [None]:
from torchinfo import summary
import torch

batch_size = 1
summary(model, input_size=(batch_size, 30), dtypes=[torch.long])

In [None]:
type(model)

In [None]:
from transformers import BertModel

#加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

In [11]:
from transformers import BertTokenizer

#加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')
out = token.encode('今天是个好日子')
token.decode(out)

'[CLS] 今 天 是 个 好 日 子 [SEP]'

In [33]:
zidian = token.get_vocab()

In [16]:
import os
import pandas as pd
import pickle as pkl

tokenizer = lambda x: x.split('|')  # word-level
vocab = pkl.load(open('./ship_data/pre_data/vocab.pkl', 'rb'))  # 打开词表
class_list = [x.strip() for x in open(os.path.join('./ship_data/', 'pre_data', 'class.txt'), encoding='utf-8').readlines()]
class_int_dict = {item: i for i, item in enumerate(class_list)}
df = pd.read_csv('./ship_data/old_data/val_dataset.csv', usecols=['path', 'cluster'])  # 读取csv
print(class_int_dict)
contents = []
pad_size=30
for index, row in df.iterrows():
    content, label = row['path'], row['cluster']
    token = tokenizer(content)
    seq_len = len(token)
    if seq_len < pad_size:
        token.extend(['PAD'] * (pad_size - len(token)))
    else:
        token = token[:pad_size]
        seq_len =pad_size
    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get('UNK')))
    contents.append((words_line, class_int_dict[label], seq_len))



{'散杂货船': 0, '渔船': 1, '集装箱船': 2, '油船': 3, '液体散货船': 4}


In [17]:
import pandas as pd

# 将列表转换为 Pandas 数据框
df = pd.DataFrame(contents, columns=['path','cluster','length'])

# 保存为 CSV 文件
df.to_csv('val_dataset.csv', index=False)


In [20]:
import csv
# 指定CSV文件路径
csv_file_path = 'output.csv'

# 打开或创建CSV文件，并写入数据
with open(csv_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # 逐行写入数据
    for row in contents:
        csv_writer.writerow(row)

In [21]:
test=pd.read_csv('./output.csv')

In [25]:
test

Unnamed: 0,"[88, 117, 161, 253, 69, 71, 103, 115, 161, 119, 88, 224, 117, 115, 103, 71, 71, 161, 119, 288, 176, 88, 152, 224, 306, 147, 115, 193, 161, 147]",0,30
0,"[384, 384, None, None, None, None, None, None,...",1,2
1,"[74, 5, 8, 4, 8, 1, 97, 19, 7, 105, 39, 105, 1...",3,30
2,"[23, 2, 19, 39, 35, 39, 39, 19, 14, 39, 100, 3...",1,30
3,"[130, 83, 6, 37, 21, 38, 112, 116, 153, 76, 13...",2,30
4,"[244, 171, 127, 58, 210, 217, 181, 175, 9, 9, ...",1,30
...,...,...,...
115635,"[120, 222, 122, 160, 218, 186, 238, 155, 104, ...",2,30
115636,"[18, 50, 19, 2, 4, 0, 30, 48, 11, 38, 35, 63, ...",0,30
115637,"[289, 281, 260, 223, 217, 214, 151, 43, 167, 2...",2,30
115638,"[4, 20, 33, 85, 85, 33, 85, 85, 33, 85, 85, 33...",0,30
