In [8]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from sklearn.metrics import mean_squared_error
from pathlib import Path
from tqdm import tqdm

from src.methods.DMF import DMFImputer
from src.methods.DCAE import DCAEImputer

# Fix the global random seed through Lightning so repeated runs are comparable.
pl.seed_everything(114514)

# Accelerator name later handed to the Lightning Trainer: GPU if one is visible, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Input CSV, resolved relative to the notebook's working directory.
data_path = Path("./data") / "Alzheimer.csv"
from src.datasets import CSVDataset


Seed set to 114514


In [9]:
def _safe_copy(array):
    """Return an independent, C-ordered float64 ndarray built from ``array``.

    The result never shares memory with the input, so callers may modify it
    freely without touching the caller's data.
    """
    return np.array(array, dtype=np.float64, copy=True, order="C")

def z_score_normalize(data, eps=1e-8):
    """Standardize along axis 0 using NaN-aware mean and standard deviation.

    Args:
        data: Array-like input; it is copied, never modified.
        eps: Standard deviations below this value are replaced by 1.0 so that
            (near-)constant columns do not cause a division by ~0.

    Returns:
        ``(normalized, params)`` where ``params`` holds the ``"mean"`` and the
        (clamped) ``"std"`` actually used for the scaling.
    """
    values = _safe_copy(data)
    column_mean = np.nanmean(values, axis=0)
    column_std = np.nanstd(values, axis=0)
    column_std = np.where(column_std < eps, 1.0, column_std)
    standardized = (values - column_mean) / column_std
    return standardized, {"mean": column_mean, "std": column_std}

def minmax_normalize(data, eps=1e-8):
    """Rescale along axis 0 by the NaN-aware minimum and range.

    Args:
        data: Array-like input; it is copied, never modified.
        eps: Ranges below this value are replaced by 1.0 before dividing.

    Returns:
        ``(normalized, params)`` where ``params`` holds the raw ``"min"`` and
        ``"max"`` (the unclamped extrema, not the range used for division).
    """
    values = _safe_copy(data)
    lowest = np.nanmin(values, axis=0)
    highest = np.nanmax(values, axis=0)
    value_range = highest - lowest
    value_range = np.where(value_range < eps, 1.0, value_range)
    rescaled = (values - lowest) / value_range
    return rescaled, {"min": lowest, "max": highest}

def max_scale_normalize(data, eps=1e-8):
    """Divide by the NaN-aware maximum absolute value taken along axis 0.

    Args:
        data: Array-like input; it is copied, never modified.
        eps: Maxima below this value are replaced by 1.0 before dividing.

    Returns:
        ``(normalized, params)`` where ``params["max_abs"]`` is the (clamped)
        divisor that was applied.
    """
    values = _safe_copy(data)
    largest_abs = np.nanmax(np.abs(values), axis=0)
    divisor = np.where(largest_abs < eps, 1.0, largest_abs)
    scaled = values / divisor
    return scaled, {"max_abs": divisor}

def log2_transform(data, offset=1e-6):
    """Apply a base-2 logarithm after shifting values to be strictly positive.

    Where the NaN-aware minimum along axis 0 is non-positive, the data are
    shifted by ``abs(min) + offset``; otherwise the shift is 0. ``offset`` is
    then added once more to every value before the logarithm is taken.

    Returns:
        ``(log_values, params)`` where ``params`` holds the ``"shift"`` and
        ``"offset"`` needed to undo the transform.
    """
    values = _safe_copy(data)
    column_min = np.nanmin(values, axis=0)
    needs_shift = column_min <= 0
    shift = np.where(needs_shift, np.abs(column_min) + offset, 0.0)
    shifted = values + shift + offset
    return np.log2(shifted), {"shift": shift, "offset": offset}

def log2_z_score_normalize(data, eps=1e-8, offset=1e-6):
    """Chain ``log2_transform`` and ``z_score_normalize``.

    Returns:
        ``(normalized, params)`` where ``params`` contains the log parameters
        under ``"log"`` followed by the z-score statistics.
    """
    logged, log_info = log2_transform(data, offset=offset)
    scaled, z_stats = z_score_normalize(logged, eps=eps)
    merged = {"log": log_info}
    merged.update(z_stats)
    return scaled, merged

def log2_minmax_normalize(data, eps=1e-8, offset=1e-6):
    """Chain ``log2_transform`` and ``minmax_normalize``.

    Returns:
        ``(normalized, params)`` where ``params`` contains the log parameters
        under ``"log"`` followed by the min/max statistics.
    """
    logged, log_info = log2_transform(data, offset=offset)
    scaled, range_stats = minmax_normalize(logged, eps=eps)
    merged = {"log": log_info}
    merged.update(range_stats)
    return scaled, merged

def log2_max_scale_normalize(data, eps=1e-8, offset=1e-6):
    """Chain ``log2_transform`` and ``max_scale_normalize``.

    Returns:
        ``(normalized, params)`` where ``params`` contains the log parameters
        under ``"log"`` followed by the max-abs statistic.
    """
    logged, log_info = log2_transform(data, offset=offset)
    scaled, scale_stats = max_scale_normalize(logged, eps=eps)
    merged = {"log": log_info}
    merged.update(scale_stats)
    return scaled, merged

In [17]:
# Normalization strategies to compare, keyed by the name reported in the results.
normalizers = dict(
    z_score=z_score_normalize,
    minmax=minmax_normalize,
    max_scale=max_scale_normalize,
    log2_z_score=log2_z_score_normalize,
    log2_minmax=log2_minmax_normalize,
    log2_max_scale=log2_max_scale_normalize,
)

# Imputer identifiers accepted by train_imputer.
imputers = ["DMF", "DCAE"]

def load_data(data_path, missing_threshold=0.9):
    """Load a CSV, drop mostly-missing feature columns and wrap it in a CSVDataset.

    The first CSV column is carried through unchanged; every other column is
    read as a float32 feature. A feature value counts as missing when it is
    NaN or non-positive. Feature columns whose missing rate is not strictly
    below ``missing_threshold`` are removed.

    Side effect: the filtered table is written next to the input file as
    ``<stem>_filtered.csv`` and that file is what ``CSVDataset`` is built from.

    Args:
        data_path: Path (``str`` or ``pathlib.Path``) of the source CSV.
        missing_threshold: Maximum tolerated per-feature missing rate (exclusive).

    Returns:
        ``(dataset, df_filtered, data_filtered, missing_mask_filtered)``: the
        ``CSVDataset``, the filtered DataFrame (first column + kept features),
        the kept feature matrix, and its boolean missing mask.
    """
    print(f"Loading data from: {data_path}")
    df_original = pd.read_csv(data_path)
    data_original = df_original.iloc[:, 1:].values.astype('float32')
    original_missing_mask = (data_original <= 0) | np.isnan(data_original)

    feature_missing_rate = original_missing_mask.mean(axis=0)
    valid_features = feature_missing_rate < missing_threshold
    data_filtered = data_original[:, valid_features]
    missing_mask_filtered = original_missing_mask[:, valid_features]

    feature_columns = df_original.columns[1:][valid_features]
    df_filtered = pd.concat(
        [df_original.iloc[:, [0]], pd.DataFrame(data_filtered, columns=feature_columns)],
        axis=1,
    )

    # Derive the output name from the file stem only. A plain string replace of
    # '.csv' would also rewrite directory names containing '.csv', and would
    # leave the path unchanged (overwriting the source file) when the suffix
    # is absent or differently cased.
    source_path = Path(data_path)
    temp_path = str(source_path.with_name(f"{source_path.stem}_filtered.csv"))
    df_filtered.to_csv(temp_path, index=False)
    dataset = CSVDataset(temp_path)

    print(f"Filtered data shape: {data_filtered.shape}")
    print(f"Filtered missing rate: {missing_mask_filtered.sum() / missing_mask_filtered.size:.2%}")

    return dataset, df_filtered, data_filtered, missing_mask_filtered


def inverse_transform(method, imputed_norm, params):
    """Map normalized values back to the original scale.

    Args:
        method: One of ``"z_score"``, ``"minmax"``, ``"max_scale"`` or the same
            names prefixed with ``"log2_"``.
        imputed_norm: Values in normalized space.
        params: Parameter dict returned by the matching normalizer
            (``"mean"``/``"std"``, ``"min"``/``"max"`` or ``"max_abs"``; the
            ``log2_*`` methods additionally need ``params["log"]`` with
            ``"shift"`` and ``"offset"``).

    Returns:
        The values after undoing the scaling and, for ``log2_*`` methods, the
        log2 transform.

    Raises:
        ValueError: If ``method`` is not one of the supported names. The check
            happens before ``params`` is touched, so an unknown method no
            longer surfaces as a ``KeyError`` on ``params["log"]``.
    """
    linear_methods = ("z_score", "minmax", "max_scale")
    uses_log = isinstance(method, str) and method.startswith("log2_")
    base_method = method[len("log2_"):] if uses_log else method
    if base_method not in linear_methods:
        raise ValueError(method)

    # Undo the linear scaling step shared by the plain and log2 variants.
    if base_method == "z_score":
        restored = imputed_norm * params["std"] + params["mean"]
    elif base_method == "minmax":
        span = params["max"] - params["min"]
        restored = imputed_norm * span + params["min"]
    else:
        restored = imputed_norm * params["max_abs"]

    if not uses_log:
        return restored

    # Undo log2(x + shift + offset).
    log_params = params["log"]
    return np.power(2.0, restored) - log_params["shift"] - log_params["offset"]

def train_imputer(name, data_tensor, mask_tensor, max_epochs=100):
    """Fit the requested imputer with a Lightning Trainer and return its output.

    Args:
        name: ``"DMF"`` or ``"DCAE"``.
        data_tensor: Tensor passed to the model as ``full_data_tensor``.
        mask_tensor: Tensor passed to the model as ``full_mask_tensor``.
        max_epochs: Number of epochs given to the Trainer.

    Returns:
        The result of ``model.get_imputed_data()`` moved to CPU and converted
        to a NumPy array.

    Raises:
        ValueError: If ``name`` is not a supported imputer.
    """
    if name not in ("DMF", "DCAE"):
        raise ValueError(name)

    shared_inputs = dict(full_data_tensor=data_tensor, full_mask_tensor=mask_tensor)
    if name == "DMF":
        model = DMFImputer(
            **shared_inputs,
            embedding_dim=64,
            hidden_dims=[256, 128],
            reconstruction_weight=1.0,
            mask_weight=0.5,
            lr=1e-3,
            batch_size=512,
        )
    else:
        model = DCAEImputer(
            **shared_inputs,
            ae_dim=256,
            mask_predictor_hidden_dim=128,
            lambda_mask=0.5,
            num_encoder_blocks=3,
            num_decoder_blocks=3,
            dilation=2,
            learning_rate=1e-3,
            batch_size=512,
        )

    # Quiet single-device trainer: no checkpoints, logger, summary or progress bar.
    trainer_options = {
        "max_epochs": max_epochs,
        "accelerator": device,
        "devices": 1,
        "enable_checkpointing": False,
        "enable_model_summary": False,
        "logger": False,
        "enable_progress_bar": False,
    }
    pl.Trainer(**trainer_options).fit(model)

    # Switch to inference mode and skip gradient tracking while reading the result.
    model.eval()
    with torch.no_grad():
        return model.get_imputed_data().cpu().numpy()


In [22]:
dataset, df_filtered, data_filtered, original_missing = load_data(data_path, missing_threshold=0.9)

raw_data = data_filtered.astype(np.float64)
# NOTE(review): get_mask() presumably marks the entries the dataset keeps visible
# for training (True = kept) -- confirm against CSVDataset.
mask_from_dataset = dataset.get_mask().numpy().astype(bool)
# Held-out evaluation cells: hidden by the dataset mask but actually observed in the CSV.
artificial_mask = ~mask_from_dataset
artificial_mask = artificial_mask & ~original_missing
# Cells the imputers are allowed to learn from: observed and not held out.
final_train_mask = ~original_missing & ~artificial_mask

print(f"Data shape: {raw_data.shape}")
print(f"Original missing: {original_missing.sum()}")
print(f"Artificial mask: {artificial_mask.sum()}")
print(f"Final train mask: {final_train_mask.sum()}")

results = []

# Loop-invariant inputs, built once. Every normalizer copies its input, so the
# NaN-marked matrix can be shared across all normalizations.
# NOTE(review): the statistics are computed from every observed value, including
# the held-out cells in artificial_mask, so evaluation targets influence the
# scaling. Confirm this is intended.
raw_data_for_stats = raw_data.copy()
raw_data_for_stats[original_missing] = np.nan
artificial_count = artificial_mask.sum()
raw_ground_truth = raw_data[artificial_mask]

for norm_name, norm_fn in tqdm(normalizers.items(), desc="Normalizations"):
    print(f"\nProcessing {norm_name}...")

    # Normalize the full observed matrix and keep the parameters for the inverse.
    norm_data, params = norm_fn(raw_data_for_stats)

    # Training view: only cells in final_train_mask keep their normalized value.
    norm_data_train = norm_data.copy()
    norm_data_train[~final_train_mask] = np.nan

    # Sanity check on the normalized matrix.
    nan_count = np.isnan(norm_data).sum()
    print(f"  NaN count after normalization: {nan_count}")

    # Normalized ground truth at the held-out cells.
    norm_ground_truth = norm_data[artificial_mask]
    ground_truth_is_nan = np.isnan(norm_ground_truth)
    nan_in_artificial = ground_truth_is_nan.sum()
    print(f"  NaN in artificial positions: {nan_in_artificial}/{len(norm_ground_truth)}")

    if nan_in_artificial == len(norm_ground_truth):
        print(f"  Skipping {norm_name}: all artificial positions are NaN")
        for imputer in imputers:
            results.append({
                "normalization": norm_name,
                "imputer": imputer,
                "MSE": float('inf'),
                "valid_count": 0,
                "artificial_masked_count": artificial_count,
                "ground_truth_nan": nan_in_artificial,
                "imputed_nan": 0
            })
        continue

    # Model inputs: hidden cells are zero-filled and flagged through the mask tensor.
    norm_data_filled = np.nan_to_num(norm_data_train, nan=0.0).astype(np.float32)
    mask_tensor = torch.tensor(final_train_mask.astype(np.float32))
    data_tensor = torch.tensor(norm_data_filled)

    for imputer in imputers:
        print(f"  Training {imputer}...")

        imputed_norm = train_imputer(imputer, data_tensor, mask_tensor, max_epochs=500)

        # Predictions at the held-out cells, still in normalized space.
        imputed_ground_truth = imputed_norm[artificial_mask]
        imputed_is_nan = np.isnan(imputed_ground_truth)

        print(f"    Ground truth NaN count: {nan_in_artificial}")
        print(f"    Imputed NaN count: {imputed_is_nan.sum()}")

        valid_mask = ~(ground_truth_is_nan | imputed_is_nan)

        if valid_mask.sum() > 0:
            # Score in the original scale so normalizations are comparable.
            original_ground_truth = raw_ground_truth[valid_mask]

            # inverse_transform returns a new array, so no defensive copy is needed.
            imputed_original = inverse_transform(norm_name, imputed_norm, params)
            imputed_original_values = imputed_original[artificial_mask][valid_mask]

            if np.all(np.isfinite(imputed_original_values)):
                mse = mean_squared_error(original_ground_truth, imputed_original_values)
            else:
                # The log2 inverse (2**x) can overflow to inf. sklearn rejects
                # non-finite input, which would abort the whole sweep, so record
                # the run as infinitely bad instead.
                mse = float('inf')
            print(f"    Valid samples: {valid_mask.sum()}, MSE (original scale): {mse:.2f}")
        else:
            mse = float('inf')
            print(f"    No valid samples for comparison!")

        results.append({
            "normalization": norm_name,
            "imputer": imputer,
            "MSE": mse,
            "valid_count": valid_mask.sum(),
            "artificial_masked_count": artificial_count,
            "ground_truth_nan": nan_in_artificial,
            "imputed_nan": imputed_is_nan.sum()
        })

results_df = pd.DataFrame(results).sort_values(["normalization", "imputer"])
display(results_df)

Loading data from: data/Alzheimer.csv
Filtered data shape: (210, 1494)
Filtered missing rate: 18.88%
Data shape: (210, 1494)
Original missing: 59223
Artificial mask: 130249
Final train mask: 124268


Normalizations:   0%|          | 0/6 [00:00<?, ?it/s]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]



Processing z_score...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 74972820935230.72
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations:  17%|█▋        | 1/6 [02:49<14:05, 169.18s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 91543408796285.34

Processing minmax...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 75771760781747.06
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations:  33%|███▎      | 2/6 [05:39<11:19, 169.78s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 310036689074335.25

Processing max_scale...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 77480950857146.70
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations:  50%|█████     | 3/6 [08:29<08:29, 169.88s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 389767514446383.38

Processing log2_z_score...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 74979229918399.41
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations:  67%|██████▋   | 4/6 [11:16<05:37, 168.71s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 85145808381018.16

Processing log2_minmax...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 83024379745690.70
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations:  83%|████████▎ | 5/6 [14:03<02:48, 168.12s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 199091421129881.50

Processing log2_max_scale...
  NaN count after normalization: 59223
  NaN in artificial positions: 0/130249
  Training DMF...


`Trainer.fit` stopped: `max_epochs=500` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 347201747697170.50
  Training DCAE...


`Trainer.fit` stopped: `max_epochs=500` reached.
Normalizations: 100%|██████████| 6/6 [16:51<00:00, 168.59s/it]

    Ground truth NaN count: 0
    Imputed NaN count: 0
    Valid samples: 130249, MSE (original scale): 1072083346583429760.00





Unnamed: 0,normalization,imputer,MSE,valid_count,artificial_masked_count,ground_truth_nan,imputed_nan
11,log2_max_scale,DCAE,1.072083e+18,130249,130249,0,0
10,log2_max_scale,DMF,347201700000000.0,130249,130249,0,0
9,log2_minmax,DCAE,199091400000000.0,130249,130249,0,0
8,log2_minmax,DMF,83024380000000.0,130249,130249,0,0
7,log2_z_score,DCAE,85145810000000.0,130249,130249,0,0
6,log2_z_score,DMF,74979230000000.0,130249,130249,0,0
5,max_scale,DCAE,389767500000000.0,130249,130249,0,0
4,max_scale,DMF,77480950000000.0,130249,130249,0,0
3,minmax,DCAE,310036700000000.0,130249,130249,0,0
2,minmax,DMF,75771760000000.0,130249,130249,0,0
