In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F  # Added: for one-hot conversion
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold  # K-fold CV
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import random
from sklearn.metrics import root_mean_squared_error
from torch.utils.data import Subset

In [15]:
# -------------------------------
# （可选）设置中文字体
font_path = './SimHei.ttf'
font_prop = fm.FontProperties(fname=font_path)

# -------------------------------
## 读取数据（请根据实际路径修改）
cbf_df = pd.read_csv("Data/cbf_df_A.csv")  # 424×5 五大人格
ratings_raw = pd.read_csv("ratings_raw_A.csv").values.astype(np.float32)  # 424×25 音乐评分

cbf_df_B = pd.read_csv("Data/cbf_df_B.csv")  # 424×5 五大人格
ratings_raw_B = pd.read_csv("ratings_raw_AB.csv").values.astype(np.float32)  # 424×25 音乐评分


# -------------------------------
# 超参数设置
dims = ["神经质", "严谨性", "宜人性", "开放性", "外向性"]


n_splits = 10       # 10 折交叉验证
batch_size = 256
epochs = 1000
lr = 1e-3



def set_seed(seed=42):
    # python 内置随机性
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # numpy 随机性
    np.random.seed(seed)

    # pytorch 随机性
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # 多卡

    # 确保 cudnn 行为确定
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False



    # dataloader 相关：worker 初始化
    def seed_worker(worker_id):
        worker_seed = seed + worker_id
        np.random.seed(worker_seed)
        random.seed(worker_seed)
    return seed_worker

# 使用时：
seed_worker = set_seed(42)



In [None]:
# -------------------------------
# 模型：多任务 + 音乐 one-hot 特征（不使用音频特征）
class MultiTaskModel(nn.Module):
    def __init__(self,
                 num_subjects,
                 num_music=5,
                 subject_emb_dim=8,
                 hidden_dim1_p=8,
                 hidden_dim2_p=8,
                 hidden_dim1=64,
                 hidden_dim2=64,
                 hidden_dim3=64,
                 hidden_dim4=64,
                 hidden_dim5=32,
                 hidden_dim6=32,
                 hidden_dim7=32,
                 hidden_dim8=16,
                 hidden_dim9=16,
                 hidden_dim10=16,
                 hidden_dim11=8,
                 hidden_dim12=8,
                 hidden_dim13=8,
                 output_personality_dim=5,
                 output_dim=1,
                 music_emb_dim=12,
                 dropout_rate=0.4,
                 dropout_rate_p=0.8):
        super().__init__()
        self.num_music = num_music  # 用于 one-hot 大小
        self.subj_proj = nn.Linear(num_subjects, subject_emb_dim) #方法2
        
        #人格预测分支-------------------------------
        self.personality_mlp1 = nn.Sequential(
            nn.Linear(subject_emb_dim, hidden_dim1_p),
            nn.BatchNorm1d(hidden_dim1_p),
            nn.ReLU(),
            nn.Dropout(dropout_rate_p)
        )

        self.personality_mlp2 = nn.Sequential(
            nn.Linear(hidden_dim1_p, hidden_dim2_p),
            nn.BatchNorm1d(hidden_dim2_p),
            nn.ReLU(),
            nn.Dropout(dropout_rate_p),
            nn.Linear(hidden_dim2_p,  output_personality_dim)  # [B,5]
        )
      

        #音乐预测分支------------------------------------

        self.music_feature_mapper = nn.Linear(num_music, music_emb_dim)  # Modified: from one-hot
        # 最终评分预测 MLP
        input_dim = subject_emb_dim + music_emb_dim + subject_emb_dim*music_emb_dim
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.BatchNorm1d(hidden_dim1),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.BatchNorm1d(hidden_dim2),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc3 = nn.Sequential(
            nn.Linear(hidden_dim2, hidden_dim3),
            nn.BatchNorm1d(hidden_dim3),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc4 = nn.Sequential(
            nn.Linear(hidden_dim3, hidden_dim4),
            nn.BatchNorm1d(hidden_dim4),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc5 = nn.Sequential(
            nn.Linear(hidden_dim4, hidden_dim5),
            nn.BatchNorm1d(hidden_dim5),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc6 = nn.Sequential(
            nn.Linear(hidden_dim5, hidden_dim6),
            nn.BatchNorm1d(hidden_dim6),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc7 = nn.Sequential(
            nn.Linear(hidden_dim6, hidden_dim7),
            nn.BatchNorm1d(hidden_dim7),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc8 = nn.Sequential(
            nn.Linear(hidden_dim7, hidden_dim8),
            nn.BatchNorm1d(hidden_dim8),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc9 = nn.Sequential(
            nn.Linear(hidden_dim8, hidden_dim9),
            nn.BatchNorm1d(hidden_dim9),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc10 = nn.Sequential(
            nn.Linear(hidden_dim9, hidden_dim10),
            nn.BatchNorm1d(hidden_dim10),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc11 = nn.Sequential(
            nn.Linear(hidden_dim10, hidden_dim11),
            nn.BatchNorm1d(hidden_dim11),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc12 = nn.Sequential(
            nn.Linear(hidden_dim11, hidden_dim12),
            nn.BatchNorm1d(hidden_dim12),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.fc13 = nn.Sequential(
            nn.Linear(hidden_dim12, hidden_dim13),
            nn.BatchNorm1d(hidden_dim13),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim13, output_dim)
        )



        


    def forward(self, subj_ids, music_ids):

        subj_onehot = F.one_hot(subj_ids, num_classes=self.subj_proj.in_features).float()  #方法2
        subj_emb = self.subj_proj(subj_onehot)    

        # personality prediction
        x_p1 = self.personality_mlp1(subj_emb)
        x_p2 = self.personality_mlp2(x_p1)

        personality_pred = torch.sigmoid(x_p2)    # [B]
        music_onehot = F.one_hot(music_ids, num_classes=self.num_music).float()  # Modified
        music_emb = self.music_feature_mapper(music_onehot)  

        # interaction
        interaction = (subj_emb.unsqueeze(2) * music_emb.unsqueeze(1))  
        interaction = interaction.reshape(interaction.size(0), -1) 
        x = torch.cat([subj_emb, music_emb, interaction], dim=1)
       
        # 逐层通过 MLP
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        x = self.fc5(x)
        x = self.fc6(x)
        x = self.fc7(x)
        x = self.fc8(x)
        x = self.fc9(x)
        x = self.fc10(x)
        x = self.fc11(x)
        x = self.fc12(x)
        out = self.fc13(x)

        rating_pred = torch.sigmoid(out).squeeze(-1)  # [B]

        return rating_pred, personality_pred


In [None]:
seed_worker = set_seed(42)
set_seed(42)
def multi_dim_model2_1(dims,ratings_raw, cbf_df, epochs=1, batch_size=256, lr=1e-5):

    # ---------- 工具函数 ----------
    def compute_metrics(y_true, y_pred):
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, y_pred)
        corr = np.corrcoef(y_true, y_pred)[0, 1]
        return mse, rmse, r2, corr


    class RatingDataset(Dataset):
        def __init__(self, df, personality_vals):
            self.df = df.reset_index(drop=True)
            self.personality_vals = personality_vals

        def __len__(self): return len(self.df)
        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            subj_id = int(row["subject_id"])
            music_id = int(row["music_id"])
            rating = float(row["label"])
            personality = self.personality_vals[subj_id]
            mask = int(row.get("mask"))
            return subj_id, music_id, rating, personality, mask
    # ---------- 准备数据 ----------
    ratings = (ratings_raw - 1) / 6.0
    num_subjects, num_music = ratings.shape

    # 将原 music_id 分成 5 首一组
    group_size = 5
    rows = []
    for i in range(num_subjects):
        for j in range(num_music):
            grouped_music_id = j // group_size  # 每 5 首分成一个编号
            rows.append([i, grouped_music_id, float(ratings[i, j])])

    data_df = pd.DataFrame(rows, columns=["subject_id", "music_id", "label"])
    personality_values = cbf_df[dims].values.astype(np.float32)

    # ---------- K折训练 ----------
    fold_metrics = []
    fold_metrics_p = []
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
 
    all_preds_abs_errors = []  # 存储绝对误差
    all_preds_values = []      # 存储预测值
    subj_ids_all = []          # 存储被试id
    
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(num_subjects)), 1):
        print(f"5dims Fold {fold}/10")

        data_df["mask"] = data_df["subject_id"].apply(lambda x: 1 if x in train_idx else 0)
        
        full_loader = DataLoader(RatingDataset(data_df, personality_values), 
                                 batch_size=batch_size, shuffle=True,worker_init_fn=seed_worker)
        train_df = data_df[data_df["mask"] == 1].reset_index(drop=True)
        test_df = data_df[data_df["mask"] == 0].reset_index(drop=True)
        train_loader = DataLoader(RatingDataset(train_df, personality_values), batch_size=batch_size, shuffle=True,worker_init_fn=seed_worker)
        test_loader = DataLoader(RatingDataset(test_df, personality_values), batch_size=batch_size, shuffle=True,worker_init_fn=seed_worker)

        model = MultiTaskModel(num_subjects, num_music)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        

        bce_rating = nn.BCELoss()  # 评分：所有样本都算
        bce_personality = nn.BCELoss(reduction="none") 

        train_mean_personality = personality_values[train_idx].mean(axis=0)
        train_mean_personality = torch.tensor(train_mean_personality, dtype=torch.float32)

        


        for epoch in range(200):
            model.train()

            for subj_ids, music_ids, ratings_lbl, pers_lbl, masks in train_loader:
                ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                masks = masks.float()
                optimizer.zero_grad()
                rating_logit, pers_logit= model(subj_ids, music_ids)  # 模型返回 logit


                # 拆分评分 loss
                loss_rating = bce_rating(rating_logit, ratings_lbl)

                # test 权重大一些 (2:1)
                per_sample_loss = bce_personality(pers_logit, pers_lbl)
                
                loss_personality = (per_sample_loss * masks.unsqueeze(1)).mean()#(per_sample_loss * masks.unsqueeze(1)).sum() / (masks.sum() + 1e-8)

                                
                loss = loss_rating +  loss_personality
                loss.backward()
                optimizer.step()


            # ====== 新增：每个 epoch 结束后，重新评估训练集 & 测试集的逐样本 loss ======
            model.eval()
            train_loss_rating, train_loss_personality = [], []

            with torch.no_grad():
                # 训练集
                for subj_ids, music_ids, ratings_lbl, pers_lbl, _ in train_loader:
                    ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                    rating_logit, pers_logit = model(subj_ids, music_ids)

                    # 评分 loss（逐样本）
                    loss_r = bce_rating(rating_logit, ratings_lbl).item()
                    # 人格 loss（逐样本）
                    per_sample_loss = bce_personality(pers_logit, pers_lbl)
                    loss_p = per_sample_loss.mean().item()

                    train_loss_rating.append(loss_r)
                    train_loss_personality.append(loss_p)

                    # ================== 阶段 2：测试集 ==================
        for epoch in range(100):
            model.train()
            for subj_ids, music_ids, ratings_lbl, pers_lbl, masks in test_loader:
                ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                masks = masks.float()

                optimizer.zero_grad()
                rating_logit, pers_logit = model(subj_ids, music_ids)

                loss_rating = bce_rating(rating_logit, ratings_lbl)
                per_sample_loss = bce_personality(pers_logit, pers_lbl)
                loss_personality = (per_sample_loss * masks.unsqueeze(1)).mean()

                loss = loss_rating + loss_personality
                loss.backward()
                optimizer.step()

            model.eval()
            test_loss_rating, test_loss_personality = [], []
            # 测试集
            for subj_ids, music_ids, ratings_lbl, pers_lbl, _ in test_loader:
                ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                rating_logit, pers_logit = model(subj_ids, music_ids)

                loss_r = bce_rating(rating_logit, ratings_lbl).item()
                per_sample_loss = bce_personality(pers_logit, pers_lbl)
                loss_p = per_sample_loss.mean().item()

                test_loss_rating.append(loss_r)
                test_loss_personality.append(loss_p)



        #评分的评估
        def eval_loader(loader, name=""):
            model.eval()
            all_preds, all_labels = [], []
            with torch.no_grad():
                for subj_ids, music_ids, lbls, _, masks in loader:
                    lbls = lbls.float()
                    preds, _ = model(subj_ids, music_ids)
                    all_preds.append(preds.cpu().numpy())
                    all_labels.append(lbls.cpu().numpy())
                all_preds = np.concatenate(all_preds)
                all_labels = np.concatenate(all_labels)
                metrics = compute_metrics(all_labels, all_preds)
            return metrics


        _, rmse_tr_rating, _, corr_tr_rating = eval_loader(train_loader, name="train")
        _, rmse_te_rating, _, corr_te_rating = eval_loader(test_loader, name="test")



        #人格的评估       
        def collect_preds_p(loader):
            model.eval()
            all_preds, all_labels, subj_ids_batch = [], [],[]
            with torch.no_grad():
                for subj_ids,music_ids, _, pers_lbl, _ in loader:
                    pers_lbl = pers_lbl.float()
                    _, pers_pred = model(subj_ids, music_ids)
                    all_preds.append(pers_pred.cpu().numpy())
                    all_labels.append(pers_lbl.cpu().numpy())
                    subj_ids_batch.append(subj_ids.cpu().numpy())
                all_preds = np.vstack(all_preds)
                all_labels = np.vstack(all_labels)
                subj_ids_batch = np.concatenate(subj_ids_batch)
            return all_preds, all_labels, subj_ids_batch

        
        # 逐维度算指标
        preds_tr_p, labels_tr_p,_ = collect_preds_p(train_loader)

        preds_te_p, labels_te_p, subj_ids_batch = collect_preds_p(test_loader)

        abs_errors = np.abs(preds_te_p - labels_te_p)
        all_preds_abs_errors.append(abs_errors)
        all_preds_values.append(preds_te_p)
        subj_ids_all.append(subj_ids_batch)
        
        
        
        results_p = {}

        for i, dim in enumerate(dims):
            rmse_tr = root_mean_squared_error(labels_tr_p[:, i], preds_tr_p[:, i])
            rmse_te = root_mean_squared_error(labels_te_p[:, i], preds_te_p[:, i])
            corr_tr = np.corrcoef(labels_tr_p[:, i], preds_tr_p[:, i])[0, 1]
            corr_te = np.corrcoef(labels_te_p[:, i], preds_te_p[:, i])[0, 1]

            results_p[f"rmse_tr_{dim}"] = rmse_tr
            results_p[f"rmse_te_{dim}"] = rmse_te
            results_p[f"corr_tr_{dim}"] = corr_tr
            results_p[f"corr_te_{dim}"] = corr_te
            results_p["fold"] = fold



        results_r = {
            'fold': fold,
            'rmse_tr': rmse_tr_rating,
            'corr_tr': corr_tr_rating,
            'rmse_te': rmse_te_rating,
            'corr_te': corr_te_rating,
        }
        fold_metrics.append(results_r)


        # ====== 打印人格的均值指标 ======
        rmse_te_vals = [results_p[f"rmse_te_{dim}"] for dim in dims]
        corr_te_vals = [results_p[f"corr_te_{dim}"] for dim in dims]
        mean_rmse_te = np.mean(rmse_te_vals)
        mean_corr_te = np.mean(corr_te_vals)

        print(f"5dims Fold {fold}/10 | Test Personality Mean RMSE={mean_rmse_te:.4f}, Mean Corr={mean_corr_te:.4f}")
    
    all_preds_abs_errors = np.vstack(all_preds_abs_errors)
    all_preds_values = np.vstack(all_preds_values)
    subj_ids_all = np.concatenate(subj_ids_all)
    
    subj_labels = [f"subj_{i+1}" for i in range(num_subjects)]

    # 还原每个样本对应的 trait
    records = []
    for sid, errs in zip(subj_ids_all, all_preds_abs_errors):
        for t, trait in enumerate(dims):
            records.append([subj_labels[sid], trait, errs[t]])
    df_model2 = pd.DataFrame(records, columns=["subject_id", "trait", "Model2"])
    


    # 合并到已有宽表
    df_wide = pd.read_csv(r"Detailed_Output/A_wide_abs_errors_model1.csv", encoding="utf-8-sig")
    df_wide = df_wide.merge(df_model2, on=["subject_id", "trait"], how="left")
    df_wide.to_csv(r"Detailed_Output/A_wide_abs_errors_model1_2.csv", index=False, encoding="utf-8-sig")


    pd.DataFrame(fold_metrics).to_csv("Main_Result/rating_result_model2.csv", index=False, encoding="utf-8-sig")
    
    pd.DataFrame(fold_metrics_p).to_csv("Main_Result/personality_result_model2.csv", index=False, encoding="utf-8-sig")


multi_dim_model2_1(dims, ratings_raw, cbf_df, epochs=1,batch_size=256, lr=0.01)

5dims Fold 1/10
5dims Fold 1/10 | Test Personality Mean RMSE=0.1861, Mean Corr=0.0761
5dims Fold 2/10
5dims Fold 2/10 | Test Personality Mean RMSE=0.1779, Mean Corr=0.0891
5dims Fold 3/10
5dims Fold 3/10 | Test Personality Mean RMSE=0.1983, Mean Corr=0.1226
5dims Fold 4/10
5dims Fold 4/10 | Test Personality Mean RMSE=0.1763, Mean Corr=0.1313
5dims Fold 5/10
5dims Fold 5/10 | Test Personality Mean RMSE=0.1679, Mean Corr=0.1772
5dims Fold 6/10
5dims Fold 6/10 | Test Personality Mean RMSE=0.1969, Mean Corr=0.0750
5dims Fold 7/10
5dims Fold 7/10 | Test Personality Mean RMSE=0.1639, Mean Corr=0.0391
5dims Fold 8/10
5dims Fold 8/10 | Test Personality Mean RMSE=0.2019, Mean Corr=-0.0791
5dims Fold 9/10
5dims Fold 9/10 | Test Personality Mean RMSE=0.1546, Mean Corr=0.0292
5dims Fold 10/10
5dims Fold 10/10 | Test Personality Mean RMSE=0.1635, Mean Corr=-0.1922


In [None]:
pd.set_option("display.max_rows", None) 
seed_worker = set_seed(19)
set_seed(19)
def single_dim_model2_2(active_dim, ratings_raw_B, cbf_df, batch_size=256, lr=1e-3):

    # ---------- 工具函数 ----------
    def compute_metrics(y_true, y_pred):
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, y_pred)
        corr = np.corrcoef(y_true, y_pred)[0, 1]
        return mse, rmse, r2, corr

    class RatingDataset(Dataset):
        def __init__(self, df, personality_vals):
            self.df = df.reset_index(drop=True)
            self.personality_vals = personality_vals


        def __len__(self): 
            return len(self.df)

        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            subj_id = int(row["subject_id"])
            music_id = int(row["music_id"])
            rating = float(row["label"])
            mask = int(row.get("mask"))

            try:
                personality = self.personality_vals[subj_id]
            except Exception as e:
                print(f" 索引错误: subj_id={subj_id}, len(personality_vals)={len(self.personality_vals)}")
                raise e

            return subj_id, music_id, rating, personality, mask


    # ---------- 准备数据 ----------
    ratings = (ratings_raw_B - 1) / 6.0
    num_subjects, num_music = ratings.shape
    print(ratings.shape)
    # 将原 music_id 分成 5 首一组
    group_size = 5
    rows = []
    for i in range(num_subjects):
        for j in range(num_music):
            grouped_music_id = j // group_size  # 每 5 首分成一个编号
            rows.append([i, grouped_music_id, float(ratings[i, j])])

    data_df = pd.DataFrame(rows, columns=["subject_id", "music_id", "label"])
    personality_values = cbf_df[active_dim].values.astype(np.float32)

    # ---------- 训练 ----------
    S2_metrics = []
    S2_metrics_p = []
    all_preds_abs_errors = []  # 存储绝对误差
    all_preds_values = []      # 存储预测值
    subj_ids_all = []          # 存储被试id
    train_idx = np.arange(0, 423)


    print(f"[{active_dim}] 研究二")

    data_df["mask"] = data_df["subject_id"].apply(lambda x: 1 if x in train_idx else 0)
    train_df = data_df[data_df["mask"] == 1].reset_index(drop=True)
    test_df = data_df[data_df["mask"] == 0].reset_index(drop=True)
    print(data_df["subject_id"].unique()[:20])
    
    train_loader = DataLoader(RatingDataset(train_df, personality_values), batch_size=batch_size, shuffle=True)
    
    
    test_loader = DataLoader(RatingDataset(test_df, personality_values), batch_size=batch_size, shuffle=True)

    model = MultiTaskModel(num_subjects, num_music)
    optimizer = optim.Adam(model.parameters(), lr=lr)



    bce_rating = nn.BCELoss()  # 评分：所有样本都算
    bce_personality = nn.BCELoss(reduction="none") 

    train_mean_personality = personality_values[train_idx].mean(axis=0)
    train_mean_personality = torch.tensor(train_mean_personality, dtype=torch.float32)


    # ================== 阶段 1：训练集 ==================
    for epoch in range(200):
        model.train()
        for subj_ids, music_ids, ratings_lbl, pers_lbl, masks in train_loader:
            ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
            masks = masks.float()

            optimizer.zero_grad()
            rating_logit, pers_logit = model(subj_ids, music_ids)

            # 评分 loss
            loss_rating = bce_rating(rating_logit, ratings_lbl)

            # 人格 loss
            per_sample_loss = bce_personality(pers_logit, pers_lbl)
            loss_personality = (per_sample_loss * masks.unsqueeze(1)).mean()

            # 总 loss
            loss = loss_rating + loss_personality
            loss.backward()
            optimizer.step()

        # 评估训练集损失
        model.eval()
        with torch.no_grad():
            for subj_ids, music_ids, ratings_lbl, pers_lbl, _ in train_loader:
                ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                rating_logit, pers_logit = model(subj_ids, music_ids)

                loss_r = bce_rating(rating_logit, ratings_lbl).item()
                per_sample_loss = bce_personality(pers_logit, pers_lbl)
                loss_p = per_sample_loss.mean().item()





    # ================== 阶段 2：测试集 ==================
    for param in model.parameters(): 
            param.requires_grad = False # 只解冻 subj_proj (人格 embedding) 
    for param in model.subj_proj.parameters(): 
            param.requires_grad = True
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    for epoch in range(100):
        model.train()
        for subj_ids, music_ids, ratings_lbl, pers_lbl, masks in test_loader:
            ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
            masks = masks.float()

            optimizer.zero_grad()
            rating_logit, pers_logit = model(subj_ids, music_ids)

            loss_rating = bce_rating(rating_logit, ratings_lbl)
            per_sample_loss = bce_personality(pers_logit, pers_lbl)
            loss_personality = (per_sample_loss * masks.unsqueeze(1)).mean()

            loss = loss_rating + loss_personality
            loss.backward()
            optimizer.step()

        # 评估测试集损失
        model.eval()
        test_loss_rating, test_loss_personality = [], []
        with torch.no_grad():
            for subj_ids, music_ids, ratings_lbl, pers_lbl, _ in test_loader:
                ratings_lbl, pers_lbl = ratings_lbl.float(), pers_lbl.float()
                rating_logit, pers_logit = model(subj_ids, music_ids)

                loss_r = bce_rating(rating_logit, ratings_lbl).item()
                per_sample_loss = bce_personality(pers_logit, pers_lbl)
                loss_p = per_sample_loss.mean().item()

                test_loss_rating.append(loss_r)
                test_loss_personality.append(loss_p)



    def eval_loader(loader, name=""):
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for subj_ids, music_ids, lbls, _, _ in loader:
                lbls = lbls.float()
                preds, _ = model(subj_ids, music_ids)
                all_preds.append(preds.cpu().numpy())
                all_labels.append(lbls.cpu().numpy())
            all_preds = np.concatenate(all_preds)
            all_labels = np.concatenate(all_labels)
            metrics = compute_metrics(all_labels, all_preds)
        return metrics


    _, rmse_tr_rating, _, corr_tr_rating = eval_loader(train_loader, name="train")
    _, rmse_te_rating, _, corr_te_rating = eval_loader(test_loader, name="test")

    #人格的评估       
    def collect_preds_p(loader):
        model.eval()
        all_preds, all_labels, subj_ids_batch = [], [], []
        with torch.no_grad():
            for subj_ids,music_ids, _, pers_lbl, _ in loader:
                pers_lbl = pers_lbl.float()
                _, pers_pred = model(subj_ids, music_ids)
                all_preds.append(pers_pred.cpu().numpy())
                all_labels.append(pers_lbl.cpu().numpy())
                subj_ids_batch.append(subj_ids.cpu().numpy())
            all_preds = np.vstack(all_preds)
            all_labels = np.vstack(all_labels)
            subj_ids_batch = np.concatenate(subj_ids_batch)
        return all_preds, all_labels, subj_ids_batch
    
    # 逐维度算指标
    preds_tr_p, labels_tr_p,_ = collect_preds_p(train_loader)
    preds_te_p, labels_te_p, subj_ids_batch = collect_preds_p(test_loader)
    abs_errors = np.abs(preds_te_p - labels_te_p)
    all_preds_abs_errors.append(abs_errors)
    all_preds_values.append(preds_te_p)
    subj_ids_all.append(subj_ids_batch)
    results_p = {}

    for i, dim in enumerate(dims):
        rmse_tr = root_mean_squared_error(labels_tr_p[:, i], preds_tr_p[:, i])
        rmse_te = root_mean_squared_error(labels_te_p[:, i], preds_te_p[:, i])
        corr_tr = np.corrcoef(labels_tr_p[:, i], preds_tr_p[:, i])[0, 1]
        corr_te = np.corrcoef(labels_te_p[:, i], preds_te_p[:, i])[0, 1]

        results_p[f"rmse_tr_{dim}"] = rmse_tr
        results_p[f"rmse_te_{dim}"] = rmse_te
        results_p[f"corr_tr_{dim}"] = corr_tr
        results_p[f"corr_te_{dim}"] = corr_te



    results_r = {
        'rmse_tr': rmse_tr_rating,
        'corr_tr': corr_tr_rating,
        'rmse_te': rmse_te_rating,
        'corr_te': corr_te_rating,
    }
    S2_metrics.append(results_r)


    # ====== 打印人格的均值指标 ======
    rmse_te_vals = [results_p[f"rmse_te_{dim}"] for dim in dims]
    corr_te_vals = [results_p[f"corr_te_{dim}"] for dim in dims]
    mean_rmse_te = np.mean(rmse_te_vals)
    mean_corr_te = np.mean(corr_te_vals)

    print(f"研究二| Test Personality Mean RMSE={mean_rmse_te:.4f}, Mean Corr={mean_corr_te:.4f}")

    all_preds_abs_errors = np.vstack(all_preds_abs_errors)
    all_preds_values = np.vstack(all_preds_values)
    subj_ids_all = np.concatenate(subj_ids_all)

    subj_labels = [f"subj_{i+1-424}" for i in range(num_subjects)]


        # 还原每个样本对应的 trait
    records = []
    for i in range(len(subj_ids_all)):  # 遍历样本
        sid = subj_ids_all[i]
        errs = all_preds_abs_errors[i]  # 这一行对应该样本的各维度误差
        for t, trait in enumerate(dims):
            records.append([subj_labels[sid], trait, errs[t]])

    df_model2 = pd.DataFrame(records, columns=["subject_id", "trait", "Model2"])
    print(sorted(df_model2["subject_id"].unique()))



    # 合并到已有宽表
    df_wide = pd.read_csv(r"Detailed_Output/B_wide_abs_errors_model1.csv", encoding="utf-8-sig")
    df_wide = df_wide.merge(df_model2, on=["subject_id", "trait"], how="left")
    df_wide.to_csv(r"Detailed_Output/B_wide_abs_errors_model1_2.csv", index=False, encoding="utf-8-sig")

    print(df_wide)
    pd.DataFrame(S2_metrics).to_csv(f"Main_Result/rating_result_model2_study2.csv", index=False, encoding="utf-8-sig")

    pd.DataFrame(S2_metrics_p).to_csv("Main_Result/personality_result_model2_study2.csv", index=False, encoding="utf-8-sig")
    print(f"已保存")



single_dim_model2_2(dims, ratings_raw_B, cbf_df_B, batch_size=256, lr=0.01)

(551, 25)
[['神经质', '严谨性', '宜人性', '开放性', '外向性']] 研究二
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
研究二| Test Personality Mean RMSE=0.1847, Mean Corr=-0.0357
['subj_0', 'subj_1', 'subj_10', 'subj_100', 'subj_101', 'subj_102', 'subj_103', 'subj_104', 'subj_105', 'subj_106', 'subj_107', 'subj_108', 'subj_109', 'subj_11', 'subj_110', 'subj_111', 'subj_112', 'subj_113', 'subj_114', 'subj_115', 'subj_116', 'subj_117', 'subj_118', 'subj_119', 'subj_12', 'subj_120', 'subj_121', 'subj_122', 'subj_123', 'subj_124', 'subj_125', 'subj_126', 'subj_127', 'subj_13', 'subj_14', 'subj_15', 'subj_16', 'subj_17', 'subj_18', 'subj_19', 'subj_2', 'subj_20', 'subj_21', 'subj_22', 'subj_23', 'subj_24', 'subj_25', 'subj_26', 'subj_27', 'subj_28', 'subj_29', 'subj_3', 'subj_30', 'subj_31', 'subj_32', 'subj_33', 'subj_34', 'subj_35', 'subj_36', 'subj_37', 'subj_38', 'subj_39', 'subj_4', 'subj_40', 'subj_41', 'subj_42', 'subj_43', 'subj_44', 'subj_45', 'subj_46', 'subj_47', 'subj_48', 'subj_49', '