In [None]:
import numpy as np
import torch
import torch.nn as nn

import models.ShipRNN as model

SEED = 3407  # one seed shared by every random number generator below

# Seed NumPy, the PyTorch default generator and every CUDA generator.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# cuDNN settings chosen so that every run gives the same result.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Special vocabulary symbols.
UNK = '<UNK>'  # unknown token
PAD = '<PAD>'  # padding token

In [None]:
from utils import BertDataConfig, BertDataset

# Build the BERT-side data configuration and construct the 'val' dataset from it.
# NOTE(review): `data_config` is rebound to DataConfig('word2vec') in the next cell,
# so this BertDataConfig instance is no longer reachable under that name afterwards.
data_config = BertDataConfig()
val_dataset = BertDataset(data_config, data_class='val')

In [None]:
from utils import DataConfig

# Data configuration built with the 'word2vec' argument, plus the ModelConfig
# object provided by models.ShipRNN (imported above under the alias `model`).
data_config = DataConfig('word2vec')
model_config = model.ModelConfig()

In [None]:
import torch.nn.functional as F
class Model(nn.Module):
    """Embedding -> BiLSTM -> multi-head self-attention -> pooled linear classifier.

    Args:
        model_config: object exposing ``hidden_size``, ``num_layers`` and ``dropout``.
        data_config: object exposing ``embedding_pretrained`` (tensor or ``None``),
            ``n_vocab``, ``embed``, ``pad_size`` and ``num_classes``.

    Notes:
        * The attention layer uses 6 heads, so ``hidden_size * 2 + embed`` must be
          divisible by 6 (``nn.MultiheadAttention`` requirement).
        * ``fc`` is sized for a pooled sequence length of 4 -- presumably the input
          length equals ``data_config.pad_size`` and ``pad_size // (pad_size // 4) == 4``;
          TODO confirm for unusual pad sizes.
    """

    def __init__(self, model_config, data_config):
        super(Model, self).__init__()
        # Width of the concatenated [embedding ; BiLSTM output] features.
        feat_dim = model_config.hidden_size * 2 + data_config.embed

        if data_config.embedding_pretrained is not None:
            # Start from the pretrained vectors but keep them trainable.
            self.embedding = nn.Embedding.from_pretrained(
                data_config.embedding_pretrained, freeze=False)
        else:
            # Train embeddings from scratch; the last vocabulary index is the padding index.
            self.embedding = nn.Embedding(data_config.n_vocab,
                                          data_config.embed,
                                          padding_idx=data_config.n_vocab - 1)
        # BatchNorm applied to the embedding channels.
        self.bn_after_embedding = nn.BatchNorm1d(data_config.embed)

        self.lstm = nn.LSTM(data_config.embed, model_config.hidden_size, model_config.num_layers,
                            bidirectional=True, batch_first=True, dropout=model_config.dropout)
        # Not used in forward(); kept so parameter names / state_dict keys stay unchanged.
        self.bn = nn.BatchNorm1d(model_config.hidden_size * 2)
        self.avg_pool = nn.AvgPool1d(data_config.pad_size // 4)
        self.mutilatte = nn.MultiheadAttention(embed_dim=feat_dim, num_heads=6,
                                               batch_first=True)

        # BatchNorm applied to the attention output channels.
        self.bn_after_mutilatte = nn.BatchNorm1d(feat_dim)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(4 * feat_dim, data_config.num_classes)

    def forward(self, x):
        """Return class logits of shape [batch_size, num_classes] for token ids ``x`` ([batch_size, seq_len])."""
        embed = self.embedding(x)  # [batch_size, seq_len, embed]
        # BatchNorm1d normalises dim 1, so move the channels there and back again.
        embed = self.bn_after_embedding(embed.permute(0, 2, 1)).permute(0, 2, 1)
        out, _ = self.lstm(embed)  # both directions concatenated on the last axis
        out1 = F.gelu(torch.cat((embed, out), 2))
        out2, _ = self.mutilatte(out1, out1, out1)  # self-attention: query = key = value
        out2 = self.bn_after_mutilatte(out2.permute(0, 2, 1))  # [batch_size, feat_dim, seq_len]
        # No .squeeze() here: it would also drop the batch axis when batch_size == 1
        # and break the linear layer; Flatten already collapses the trailing axes.
        out2 = self.avg_pool(out2)
        out2 = F.gelu(self.flatten(out2))
        return self.fc(out2)


In [None]:
# Instantiate the network and move it to the configured device.
# NOTE(review): this rebinds `model`, which until now was the alias of the imported
# models.ShipRNN module; `model.ModelConfig()` no longer resolves to that module after
# this cell unless the import cell is executed again.
model = Model(model_config, data_config).to(data_config.device)

In [None]:
# Smoke test: feed a random batch of 2 sequences with 30 token ids each (values in [1, 10))
# through the network and show the size of the output.
model(torch.randint(1, 10, [2, 30]).to(data_config.device)).size()

In [None]:
def init_network(model, method='xavier', exclude='embedding'):
    """Re-initialise the parameters of ``model`` in place.

    Args:
        model: module whose ``named_parameters()`` are visited.
        method: ``'xavier'`` or ``'kaiming'`` for the matching normal initialiser;
            any other value falls back to ``nn.init.normal_``.
        exclude: parameters whose name contains this substring are left untouched.

    Parameters whose name contains ``'weight'`` are initialised with ``method``;
    parameters whose name contains ``'bias'`` are set to 0; everything else is skipped.
    Xavier/Kaiming need at least two dimensions to compute fan-in/fan-out, so 1-D
    weights (for example BatchNorm scales) keep their current values under those
    two methods instead of raising.
    """
    for name, w in model.named_parameters():
        if exclude in name:  # e.g. the embedding layer
            continue
        if 'weight' in name:
            if method in ('xavier', 'kaiming'):
                if w.dim() < 2:
                    # Fan-in/fan-out is undefined for 1-D tensors; leave them as they are.
                    continue
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                else:
                    nn.init.kaiming_normal_(w)
            else:
                nn.init.normal_(w)
        elif 'bias' in name:  # biases start at 0
            nn.init.constant_(w, 0)

In [None]:
# Apply init_network to the model with its default arguments
# (method='xavier', exclude='embedding').
init_network(model)

In [None]:
from torchinfo import summary

# Layer-by-layer summary for an integer (torch.long) input of shape (2, 30).
summary(model, input_size=(2, 30), dtypes=[torch.long])

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

path = './test_data/'
# Read the CSV file
df = pd.read_csv(f'{path}chn_text.csv')  # replace with the path to your CSV file

# Split the data: 90% for training, then halve the remaining 10% into
# validation and test sets. The fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.1, random_state=3407)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=3407)

# Save the three splits as new CSV files next to the source file
train_df.to_csv(f'{path}train_dataset.csv', index=False)
val_df.to_csv(f'{path}val_dataset.csv', index=False)
test_df.to_csv(f'{path}test_dataset.csv', index=False)


In [None]:
import torch
import torch.nn.functional as F

# Assume the input is a tensor of shape [2, 30, 100]
input_tensor = torch.randn(2, 30, 100)

# Keep the two largest values along dim 1 for every position of the last axis.
# F.max_pool1d(input_tensor, kernel_size=2, stride=1) does not do this: it treats
# dim 1 as channels and slides over the last axis, which yields [2, 30, 99].
output_tensor = torch.topk(input_tensor, k=2, dim=1).values

print(output_tensor.shape)  # torch.Size([2, 2, 100])


In [None]:
from datasets import load_dataset

# Load the three CSV splits under ./ship_data/ with the `datasets` CSV loader,
# keyed as "train", "test" and "val".
dataset = load_dataset("csv",
                       data_files={"train": "./ship_data/train_dataset.csv", "test": "./ship_data/test_dataset.csv",
                                   "val": "./ship_data/val_dataset.csv"})

In [None]:
# Inspect the first record of the training split.
dataset['train'][0]

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the "bert-base-chinese" checkpoint with a sequence-classification head for 5 labels.
# NOTE(review): `model` is rebound once more; the Model instance created earlier is
# no longer reachable under this name.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=5)

In [None]:
# Collect the model's immediate child modules in a list.
all_layers = list(model.children())

In [None]:
# Display the collected child modules.
all_layers

In [None]:
from torchinfo import summary
import torch

# Summarise the current `model` for a single sequence of 30 integer token ids.
batch_size = 1
summary(model, input_size=(batch_size, 30), dtypes=[torch.long])

In [None]:
# Show the concrete class of the object currently bound to `model`.
type(model)

In [None]:
from transformers import BertModel

# Load the pretrained model
pretrained = BertModel.from_pretrained('bert-base-chinese')

In [None]:
from transformers import BertTokenizer

# Load the vocabulary and tokenizer
token = BertTokenizer.from_pretrained('bert-base-chinese')
# Round trip: encode a sample sentence, then decode the result back to text.
out = token.encode('今天是个好日子')
token.decode(out)

In [None]:
# Fetch the tokenizer's vocabulary ("zidian" is pinyin for "dictionary").
zidian = token.get_vocab()

In [None]:
import os
import pandas as pd
import pickle as pkl

def tokenizer(x):
    """Split a '|'-separated string into word-level tokens."""
    return x.split('|')


# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load a vocab.pkl that you produced yourself.
with open('./ship_data/pre_data/vocab.pkl', 'rb') as vocab_file:
    vocab = pkl.load(vocab_file)  # vocabulary lookup table (used via .get below)
with open(os.path.join('./ship_data/', 'pre_data', 'class.txt'), encoding='utf-8') as class_file:
    class_list = [x.strip() for x in class_file.readlines()]
class_int_dict = {item: i for i, item in enumerate(class_list)}  # class name -> index
df = pd.read_csv('./ship_data/old_data/val_dataset.csv', usecols=['path', 'cluster'])
print(class_int_dict)

pad_size = 30  # every sequence is padded or truncated to this many tokens

# Prefer the special symbols declared in the first cell (PAD / UNK); fall back to
# the bare spellings this cell used before so a vocab keyed that way still resolves.
pad_token = PAD if PAD in vocab else 'PAD'
unk_id = vocab.get(UNK, vocab.get('UNK'))

contents = []
for content, label in zip(df['path'], df['cluster']):
    # `tokens` (not `token`) so the BertTokenizer bound to `token` earlier survives.
    tokens = tokenizer(content)
    seq_len = min(len(tokens), pad_size)  # length before padding, capped at pad_size
    tokens = tokens[:pad_size] + [pad_token] * (pad_size - seq_len)
    words_line = [vocab.get(word, unk_id) for word in tokens]
    contents.append((words_line, class_int_dict[label], seq_len))



In [None]:

# Convert the list of (encoded tokens, class index, sequence length) tuples
# into a pandas DataFrame
df = pd.DataFrame(contents, columns=['path', 'cluster', 'length'])

# Save it as a CSV file in the current working directory
df.to_csv('val_dataset.csv', index=False)


In [None]:
import csv

# Destination of the CSV dump
csv_file_path = 'output.csv'

# Open (or create) the file and emit every entry of `contents` as one CSV row
with open(csv_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(contents)

In [10]:
import pandas as pd

# Load the test split from ./ship_data/ for inspection.
test = pd.read_csv('./ship_data/test_dataset.csv')

In [11]:
# Number of rows per class label in the 'cluster' column.
test['cluster'].value_counts()

cluster
散杂货船     44333
渔船       40276
集装箱船     18656
油船        6458
液体散货船     5919
Name: count, dtype: int64