In [None]:
import os
import time
import warnings

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import mean_absolute_error, median_absolute_error
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AdamW
from transformers import BertTokenizer, BertModel
from transformers import get_scheduler
warnings.filterwarnings('ignore')

# Dataset/project name; it is interpolated into the input CSV path, the
# TensorBoard log directory and the best-checkpoint file name.
PROJECT = 'mes_all'
# Number of full passes over the training split; also used to size the LR schedule.
NUM_EPOCHS = 10

In [None]:
class BertForRegression(nn.Module):
    """BERT encoder topped with a dropout + linear head that emits one scalar per sequence."""

    def __init__(self, pretrained_model_name='bert-base-uncased', dropout=0.3):
        """Load the pretrained encoder and build the regression head.

        Args:
            pretrained_model_name: checkpoint identifier handed to
                ``BertModel.from_pretrained``.
            dropout: dropout probability applied before the linear layer.
        """
        super().__init__()
        encoder = BertModel.from_pretrained(pretrained_model_name)
        self.bert = encoder
        # The head's input width is read from the loaded encoder's config.
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(encoder.config.hidden_size, 1),  # one regression value
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return the regression output for a batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.
            token_type_ids: optional segment ids forwarded to the encoder.
            labels: accepted for call-compatibility; not used here.

        Returns:
            Tensor with one value per sequence, shaped ``(batch, 1)``.
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Hidden state at position 0 (the [CLS] slot) summarises the sequence.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.regressor(first_token_state)

In [None]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised regression head is repeatable on a fresh kernel.
torch.manual_seed(42)

# Build the BERT-based regression model
model = BertForRegression()

# Read the raw issue table for the selected project.
raw_data = pd.read_csv(f'/kaggle/input/storypoint/{PROJECT}.csv')

# A missing description becomes an empty string. The result is assigned back
# explicitly: chained `df[col].fillna(..., inplace=True)` may act on a
# temporary and is not guaranteed to modify the frame.
data = raw_data.assign(description=raw_data['description'].fillna(''))
# Drop rows that still have a missing value in any column.
data = data.dropna()
# -1 marks an unlabeled issue; .copy() makes the filtered frame independent so
# the column assignments below do not write into a slice of another frame.
data = data[data['storypoint'] != -1].copy()
data['text'] = data['title'] + ' ' + data['description']
data['label'] = data['storypoint'].astype(float)
data = data[['text', 'label']]

# Ordered 60% / 20% / 20% split: rows are taken in file order, no shuffling.
train_val_split_point = int(len(data) * 0.6)
val_test_split_point = int(len(data) * 0.8)
train_texts = data['text'][:train_val_split_point]
train_labels = data['label'][:train_val_split_point]
val_texts = data['text'][train_val_split_point:val_test_split_point]
val_labels = data['label'][train_val_split_point:val_test_split_point]
test_texts = data['text'][val_test_split_point:]
test_labels = data['label'][val_test_split_point:]

# Tokenization helper
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Args:
        texts: the text input handed straight to the tokenizer.

    Returns:
        The tokenizer's output, requested as PyTorch tensors with
        ``padding="max_length"`` and truncation enabled.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

# Wrap the tokenizer output and the labels as a PyTorch Dataset
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with float regression labels."""

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them.

        Args:
            encodings: mapping from feature name to a per-sample indexable
                (a tensor or a nested list).
            labels: a pandas Series (read positionally through ``.iloc``) or
                any plain sequence of numbers.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of ``int64`` feature tensors plus a ``float32`` ``'labels'`` entry."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning; detach().clone() is the supported way to copy.
                item[key] = value.detach().clone().to(torch.long)
            else:
                item[key] = torch.tensor(value, dtype=torch.long)

        # Positional lookup for a Series (its index may not start at 0 after
        # filtering or slicing); plain indexing for lists and arrays.
        if hasattr(self.labels, 'iloc'):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]
        item['labels'] = torch.tensor(label, dtype=torch.float32)
        return item

    def __len__(self):
        """Number of samples, taken from the labels container."""
        return len(self.labels)

# Tokenize every split (train, validation, test — in that order)
tokenized_train, tokenized_val, tokenized_test = (
    tokenize_function(split.to_list())
    for split in (train_texts, val_texts, test_texts)
)

# Datasets
train_dataset = TextDataset(tokenized_train, train_labels)
val_dataset = TextDataset(tokenized_val, val_labels)
test_dataset = TextDataset(tokenized_test, test_labels)

# Data loaders: only the training split is shuffled
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
)

# Move the model to the GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Learning-rate scheduler: linear schedule without warm-up, sized to cover
# every optimizer step of the run
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# TensorBoard writer
writer = SummaryWriter(f'tb/{PROJECT}')

# Bookkeeping for best-model selection and per-epoch records
min_eval_loss_epoch = [float('inf'), 0]  # [lowest validation loss so far, epoch it occurred in]
time_records = []
MAE_RECORDS = []
MDAE_RECORDS = []

start_time = time.time()

# Make sure the checkpoint directory exists before the first torch.save call.
os.makedirs('./models', exist_ok=True)


def _predict_batch(batch):
    """Move ``batch`` to ``device``, run ``model`` and return ``(predictions, labels)``.

    Only the trailing size-1 axis of the model output is squeezed, so the
    prediction tensor stays 1-D even when the batch holds a single sample.
    A bare ``squeeze()`` would collapse that case to a 0-d tensor, which
    cannot be iterated when collecting predictions and makes ``mse_loss``
    broadcast against the 1-D labels.
    """
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(batch['input_ids'], batch['attention_mask'], token_type_ids=batch.get('token_type_ids'))
    return outputs.squeeze(-1), batch['labels']


for epoch in range(NUM_EPOCHS):
    # ---TRAINING---
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Training epoch {epoch+1}/{NUM_EPOCHS}")
    for batch in progress_bar:
        # Forward pass
        preds, targets = _predict_batch(batch)
        loss = F.mse_loss(preds, targets)

        # Backward pass
        loss.backward()

        # Parameter and learning-rate update, then clear gradients for the next step
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})

    avg_train_loss = total_train_loss / len(train_dataloader)
    writer.add_scalar('loss/train', avg_train_loss, epoch)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} training completed. Avg Train Loss: {avg_train_loss:.4f}")

    # ---EVAL---
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            preds, targets = _predict_batch(batch)
            total_eval_loss += F.mse_loss(preds, targets).item()

    # Mean of the per-batch MSE values over the validation split.
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    writer.add_scalar('loss/eval', avg_eval_loss, epoch)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} evaluation completed. Avg Eval Loss: {avg_eval_loss:.4f}")

    # Keep the checkpoint with the lowest validation loss seen so far
    if avg_eval_loss < min_eval_loss_epoch[0]:
        min_eval_loss_epoch = [avg_eval_loss, epoch]
        torch.save(model.state_dict(), f'./models/best_model_{PROJECT}.pth')

    # ---TESTING---
    # Metrics are computed on the held-out test split. The validation split
    # already drives model selection above, so it must not be reported as
    # "Testing" performance.
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            preds, targets = _predict_batch(batch)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(targets.cpu().tolist())

    # Mean and median absolute error for this epoch
    mae = mean_absolute_error(true_labels, predictions)
    mdae = median_absolute_error(true_labels, predictions)
    MAE_RECORDS.append(mae)
    MDAE_RECORDS.append(mdae)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} Testing MAE: {mae:.4f}")
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} Testing MdAE: {mdae:.4f}")

    # Seconds elapsed since training started, recorded at the end of each epoch
    time_records.append(time.time() - start_time)

    # Release cached GPU memory between epochs
    torch.cuda.empty_cache()

# Delete any per-epoch checkpoints and keep only the best model.
# Use the same relative directory the best checkpoint is saved to, rather than
# a hard-coded absolute path, and skip the scan when the directory is absent
# so the cell does not fail with FileNotFoundError.
model_dir = './models'
if os.path.isdir(model_dir):
    for filename in os.listdir(model_dir):
        if filename.startswith('epoch') and filename != f'best_model_{PROJECT}.pth':
            os.remove(os.path.join(model_dir, filename))

# Push any buffered TensorBoard events to disk.
writer.flush()

print("All epochs completed. Best model saved.")