UNIFIED DATA PREPROCESSING PIPELINE
Loads NPZ and CSV data, performs train/test split and normalization.
Saves all preprocessed data for use in training and analysis notebooks.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import os

# Pipeline banner: blank line, rule, title, rule (one line each).
print("", "=" * 70, "UNIFIED DATA PREPROCESSING PIPELINE", "=" * 70, sep="\n")


UNIFIED DATA PREPROCESSING PIPELINE


In [2]:
# ==========================================
# 1. LOAD NPZ DATA (PointCloud Data)
# ==========================================
print("\n[1/5] Loading NPZ data...")

npz_path = 'Data/Rpt0_N5000.npz'

# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only load archives from a trusted source.
# (Presumably required because the name array 'c' is stored as dtype=object -- TODO confirm.)
#
# np.load returns a lazy NpzFile that keeps the archive open; the `with` block closes
# the file handle once the three arrays have been read into memory.
with np.load(npz_path, allow_pickle=True) as data:
    # Extract all data
    all_input = data['a']      # (N_total, 5000, 9)
    all_output = data['b']     # (N_total, 5000, 4)
    all_names = data['c']      # Sample names

# The three arrays are indexed in parallel by every later step, so a length
# mismatch would silently misalign samples (or fail later with an opaque IndexError).
if not (len(all_input) == len(all_output) == len(all_names)):
    raise ValueError(
        f"Inconsistent sample counts in {npz_path}: "
        f"a={len(all_input)}, b={len(all_output)}, c={len(all_names)}"
    )

print(f"  ✓ Loaded NPZ: {npz_path}")
print(f"    Total samples: {len(all_input)}")
print(f"    Input shape: {all_input.shape}")
print(f"    Output shape: {all_output.shape}")


[1/5] Loading NPZ data...
  ✓ Loaded NPZ: Data/Rpt0_N5000.npz
    Total samples: 6315
    Input shape: (6315, 5000, 9)
    Output shape: (6315, 5000, 4)


In [3]:
# ==========================================
# 2. FILTER DIAGONAL SAMPLES
# ==========================================
print("\n[2/5] Filtering diagonal load samples...")

# Boolean mask: True for every sample whose name contains the substring 'dia'.
dia_mask = np.array([('dia' in str(sample_name)) for sample_name in all_names])

# Apply the same mask to the three parallel arrays so they stay aligned:
#   dia_input  -> (N_dia, 5000, 9)
#   dia_output -> (N_dia, 5000, 4)
#   dia_names  -> (N_dia,)
dia_input, dia_output, dia_names = (
    full_array[dia_mask] for full_array in (all_input, all_output, all_names)
)

print(f"  ✓ Diagonal samples: {len(dia_input)} / {len(all_input)}")


[2/5] Filtering diagonal load samples...
  ✓ Diagonal samples: 2105 / 6315


In [4]:
# ==========================================
# 3. LOAD AND ALIGN CSV SCALARS
# ==========================================
print("\n[3/5] Loading and aligning CSV scalar data...")

csv_path = 'Data/bracket_labels.csv'
df = pd.read_csv(csv_path)

# Scalar targets, in the column order used for dia_scalars[:, 0], [:, 1], [:, 2].
scalar_cols = ['max_dia_stress(MPa)', 'mass(kg)', '1st_mode_freq(Hz)']

# Create lookup dictionary: item_name (as str) -> [stress, mass, freq].
# Built column-wise instead of with df.iterrows(): iterrows() converts every row to a
# single common dtype, so a numeric item_name would be stringified as e.g. '123.0'
# and never match; it is also much slower on large frames.
# If item_name is duplicated in the CSV, the last row wins.
csv_lookup = {
    str(item_name): row_values
    for item_name, row_values in zip(df['item_name'], df[scalar_cols].to_numpy().tolist())
}

# Match NPZ names with CSV
dia_scalars = []
valid_indices = []
unmatched_names = []

for idx, name in enumerate(dia_names):
    name_str = str(name)
    # Strip the load-case tokens to recover the CSV item name. Note that str.replace
    # removes these tokens wherever they occur in the name, not only as a prefix.
    clean_name = name_str.replace('dia_', '').replace('hor_', '').replace('ver_', '')

    scalars = csv_lookup.get(clean_name)
    if scalars is None:
        unmatched_names.append(name_str)
    else:
        dia_scalars.append(scalars)
        valid_indices.append(idx)

# Explicit shape/dtype so the arrays stay well-formed even when nothing matched:
# np.array([]) would otherwise be a float64 array of shape (0,), which cannot be
# used as an index array and has no scalar axis.
dia_scalars = np.array(dia_scalars, dtype=np.float32).reshape(-1, len(scalar_cols))
valid_indices = np.array(valid_indices, dtype=np.intp)

# Filter to matched samples only
dia_input = dia_input[valid_indices]
dia_output = dia_output[valid_indices]
dia_names = dia_names[valid_indices]

print(f"  ✓ Successfully matched: {len(dia_scalars)} samples")
if unmatched_names:
    print(f"  ⚠ Unmatched: {len(unmatched_names)} samples (excluded)")

# Missing labels would propagate NaN into the train min/max used for normalization,
# so surface them here (the data itself is left unchanged).
n_nonfinite = int((~np.isfinite(dia_scalars)).any(axis=1).sum())
if n_nonfinite:
    print(f"  ⚠ Non-finite scalar labels in {n_nonfinite} matched samples")


[3/5] Loading and aligning CSV scalar data...
  ✓ Successfully matched: 2105 samples


In [5]:
# ==========================================
# 4. TRAIN/TEST SPLIT
# ==========================================
print("\n[4/5] Creating train/test split (80/20)...")

# Split sample *positions* (not the arrays themselves) so that the same partition
# can be applied to every parallel array below. Fixed seed keeps the split reproducible.
train_idx, test_idx = train_test_split(
    np.arange(len(dia_scalars)), test_size=0.2, random_state=42
)

# Inputs: full feature set per point -> (N_train, 5000, 9) / (N_test, 5000, 9)
train_input_full, test_input_full = dia_input[train_idx], dia_input[test_idx]

# Inputs restricted to the first three channels (XYZ coordinates)
# -> (N_train, 5000, 3) / (N_test, 5000, 3)
train_input_xyz, test_input_xyz = train_input_full[..., :3], test_input_full[..., :3]

# Field outputs -> (N_train, 5000, 4) / (N_test, 5000, 4)
train_field_output, test_field_output = dia_output[train_idx], dia_output[test_idx]

# Scalar targets -> (N_train, 3) / (N_test, 3)
train_scalars, test_scalars = dia_scalars[train_idx], dia_scalars[test_idx]

# Sample names, kept for traceability
train_names, test_names = dia_names[train_idx], dia_names[test_idx]

print(f"  ✓ Train set: {len(train_idx)} samples")
print(f"  ✓ Test set: {len(test_idx)} samples")


[4/5] Creating train/test split (80/20)...
  ✓ Train set: 1684 samples
  ✓ Test set: 421 samples


In [6]:
# ==========================================
# 5. NORMALIZATION (using train statistics only)
# ==========================================
print("\n[5/5] Computing normalization statistics...")


def _minmax_scale(values, lo, hi, eps=1e-8):
    """Min-max scale `values` using per-channel bounds, without clipping.

    Computes (values - lo) / (hi - lo + eps); `lo` and `hi` broadcast against the
    last axis of `values`, so any number of channels is supported. `eps` guards
    against division by zero for constant channels. Returns a float32 array.
    """
    return ((values - lo) / (hi - lo + eps)).astype(np.float32, copy=False)


def _count_outside_unit(values):
    """Return how many entries of `values` lie outside the closed interval [0, 1]."""
    return int(np.count_nonzero((values < 0) | (values > 1)))


# --- FIELD OUTPUT NORMALIZATION ---
# Per-channel bounds over all train samples and all points.
field_min = train_field_output.min(axis=(0, 1))  # (4,)
field_max = train_field_output.max(axis=(0, 1))  # (4,)

train_field_norm = np.clip(_minmax_scale(train_field_output, field_min, field_max), 0, 1)

# Test values can fall outside the train range; they are clipped to [0, 1] below,
# so count them first to make that loss of information visible.
test_field_unclipped = _minmax_scale(test_field_output, field_min, field_max)
n_field_clipped = _count_outside_unit(test_field_unclipped)
test_field_norm = np.clip(test_field_unclipped, 0, 1)
del test_field_unclipped  # free the temporary copy

# --- SCALAR OUTPUT NORMALIZATION ---
scalar_min = train_scalars.min(axis=0)  # (3,)
scalar_max = train_scalars.max(axis=0)  # (3,)

train_scalars_norm = np.clip(_minmax_scale(train_scalars, scalar_min, scalar_max), 0, 1)

test_scalars_unclipped = _minmax_scale(test_scalars, scalar_min, scalar_max)
n_scalar_clipped = _count_outside_unit(test_scalars_unclipped)
test_scalars_norm = np.clip(test_scalars_unclipped, 0, 1)
del test_scalars_unclipped

print("  ✓ Field outputs normalized to [0, 1]")
print("  ✓ Scalar outputs normalized to [0, 1]")
if n_field_clipped:
    print(f"  ⚠ Clipped {n_field_clipped} test field values outside the train range")
if n_scalar_clipped:
    print(f"  ⚠ Clipped {n_scalar_clipped} test scalar values outside the train range")


[5/5] Computing normalization statistics...
  ✓ Field outputs normalized to [0, 1]
  ✓ Scalar outputs normalized to [0, 1]


In [8]:
# ==========================================
# SAVE ALL PREPROCESSED DATA
# ==========================================
print("\n" + "=" * 70)
print("SAVING PREPROCESSED DATA")
print("=" * 70)

output_dir = 'Data/preprocessed'
os.makedirs(output_dir, exist_ok=True)

# Create preprocessing info dictionary
preprocess_info = {
    'train_idx': train_idx,
    'test_idx': test_idx,
    'field_min': field_min,
    'field_max': field_max,
    'scalar_min': scalar_min,
    'scalar_max': scalar_max,
    'scalar_columns': ['max_dia_stress(MPa)', 'mass(kg)', '1st_mode_freq(Hz)'],
    'field_columns': ['ux (displacement)', 'uy (displacement)', 'uz (displacement)', 'von Mises stress'],
    'n_train': len(train_idx),
    'n_test': len(test_idx),
    # Derived from the saved array rather than hard-coded, so the metadata cannot
    # drift from the data if a different point-cloud resolution is loaded.
    'n_points': train_input_xyz.shape[1],
}

# Save as pickle (easy to load)
with open(f'{output_dir}/preprocess_info.pkl', 'wb') as f:
    pickle.dump(preprocess_info, f)

# Numeric arrays to write, keyed by file stem; train first, then test.
# Insertion order determines both the save order and the report order below.
arrays_to_save = {
    'train_input_xyz': train_input_xyz,
    'train_field_output': train_field_output,
    'train_field_norm': train_field_norm,
    'train_scalars': train_scalars,
    'train_scalars_norm': train_scalars_norm,
    'test_input_xyz': test_input_xyz,
    'test_field_output': test_field_output,
    'test_field_norm': test_field_norm,
    'test_scalars': test_scalars,
    'test_scalars_norm': test_scalars_norm,
}
for stem, arr in arrays_to_save.items():
    np.save(f'{output_dir}/{stem}.npy', arr)

# Save names
np.save(f'{output_dir}/train_names.npy', train_names, allow_pickle=True)
np.save(f'{output_dir}/test_names.npy', test_names, allow_pickle=True)

# Save normalization statistics separately
np.savez(f'{output_dir}/normalization_stats.npz',
         field_min=field_min, field_max=field_max,
         scalar_min=scalar_min, scalar_max=scalar_max)

print(f"\n✓ Saved to: {output_dir}/")
for stem, arr in arrays_to_save.items():
    print(f"  - {stem}.npy ({arr.shape})")
print("  - train_names.npy, test_names.npy")
print("  - preprocess_info.pkl")
print("  - normalization_stats.npz")


SAVING PREPROCESSED DATA

✓ Saved to: Data/preprocessed/
  - train_input_xyz.npy ((1684, 5000, 3))
  - train_field_output.npy ((1684, 5000, 4))
  - train_field_norm.npy ((1684, 5000, 4))
  - train_scalars.npy ((1684, 3))
  - train_scalars_norm.npy ((1684, 3))
  - test_input_xyz.npy ((421, 5000, 3))
  - test_field_output.npy ((421, 5000, 4))
  - test_field_norm.npy ((421, 5000, 4))
  - test_scalars.npy ((421, 3))
  - test_scalars_norm.npy ((421, 3))
  - train_names.npy, test_names.npy
  - preprocess_info.pkl
  - normalization_stats.npz


In [9]:
# ==========================================
# PRINT SUMMARY STATISTICS
# ==========================================
print("\n" + "=" * 70, "DATA SUMMARY", "=" * 70, sep="\n")

# Sample counts for each split and their sum.
print(
    "\nSAMPLE COUNTS:\n"
    f"  Training: {len(train_idx)}\n"
    f"  Testing:  {len(test_idx)}\n"
    f"  Total:    {len(train_idx) + len(test_idx)}"
)

# Per-channel bounds of the field outputs (the train-derived field_min / field_max).
print("\nFIELD OUTPUT STATISTICS (ORIGINAL SCALE):")
field_names = ['ux (disp)', 'uy (disp)', 'uz (disp)', 'von Mises stress']
for name, lo, hi in zip(field_names, field_min, field_max):
    print(f"\n  {name}:")
    print(f"    Min:  {lo:12.6f}")
    print(f"    Max:  {hi:12.6f}")
    print(f"    Range: {hi - lo:12.6f}")

# Per-column bounds plus mean/std of the train scalars.
print("\nSCALAR OUTPUT STATISTICS (ORIGINAL SCALE):")
scalar_names = ['Max Stress (MPa)', 'Mass (kg)', '1st Freq (Hz)']
for i, name in enumerate(scalar_names):
    train_column = train_scalars[:, i]
    print(f"\n  {name}:")
    print(f"    Min:  {scalar_min[i]:12.6f}")
    print(f"    Max:  {scalar_max[i]:12.6f}")
    print(f"    Mean (train): {train_column.mean():12.6f}")
    print(f"    Std (train):  {train_column.std():12.6f}")

print("\n" + "=" * 70, "✅ PREPROCESSING COMPLETE!", "=" * 70, sep="\n")
print(
    "\nUsage in training notebooks:",
    "  from data_loader import load_preprocessed_data",
    "  data = load_preprocessed_data()",
    sep="\n",
)
print("=" * 70 + "\n")


DATA SUMMARY

SAMPLE COUNTS:
  Training: 1684
  Testing:  421
  Total:    2105

FIELD OUTPUT STATISTICS (ORIGINAL SCALE):

  ux (disp):
    Min:     -0.737986
    Max:      0.113362
    Range:     0.851348

  uy (disp):
    Min:     -0.276773
    Max:      0.274422
    Range:     0.551195

  uz (disp):
    Min:     -0.039458
    Max:      0.743882
    Range:     0.783340

  von Mises stress:
    Min:      0.000000
    Max:    700.386475
    Range:   700.386475

SCALAR OUTPUT STATISTICS (ORIGINAL SCALE):

  Max Stress (MPa):
    Min:    286.685944
    Max:   1067.449707
    Mean (train):   543.181885
    Std (train):    118.700417

  Mass (kg):
    Min:      0.561002
    Max:      2.407380
    Mean (train):     1.234920
    Std (train):      0.381467

  1st Freq (Hz):
    Min:    752.470276
    Max:   6850.030762
    Mean (train):  3221.244141
    Std (train):   1404.028076

✅ PREPROCESSING COMPLETE!

Usage in training notebooks:
  from data_loader import load_preprocessed_data
  data 