# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import os
import numpy as np
import shutil
from pathlib import Path
from tqdm import tqdm  # Progress bar
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- Dataset locations -------------------------------------------------------
# AIA arrays live under AIA_BASE/<split>; the SXR files with matching names
# are looked up under SXR_BASE/<split>.
AIA_BASE = "/mnt/data/PAPER_DATA_B/AIA"
SXR_BASE = "/mnt/data/PAPER_DATA_B/SXR"
AIA_SUBDIRS = ["train", "val", "test"]

# Directory searched for a 335 array with the same file name as each AIA file.
SDO335_PATH = "/mnt/data/SDO335ITI"

# Samples without a 335 counterpart are moved here (AIA/ and SXR/ subfolders).
MISSING_DIR = "/mnt/data/PAPER_DATA_B/missing_335"

# Make sure both quarantine subfolders exist before any file is moved.
for _missing_kind in ("AIA", "SXR"):
    os.makedirs(f"{MISSING_DIR}/{_missing_kind}", exist_ok=True)

def _save_npy_atomically(target, array):
    """Write ``array`` to ``target`` through a sibling temp file and ``os.replace``.

    The existing file at ``target`` is only replaced once the new content has
    been written completely, so an interrupted run cannot leave a truncated
    array behind.
    """
    target = Path(target)
    # The temp name does not end in ".npy", so a later "*.npy" glob never
    # picks up a leftover partial file.
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        # An open handle is passed because np.save appends ".npy" to file
        # names that lack the extension.
        with open(tmp_path, "wb") as handle:
            np.save(handle, array)
        os.replace(tmp_path, target)
    finally:
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass  # Normal case: os.replace already consumed the temp file.


def process_file(args):
    """Append the matching 335 array to one AIA file, or move the sample aside.

    Args:
        args: Tuple ``(aia_file, sdo335_path, sxr_dir, missing_dir)``.
            ``aia_file`` is a ``Path`` to the AIA ``.npy`` file; the other
            three are directories (``str`` or ``Path``). A single tuple is
            used so the call can be submitted to an executor as one argument.

    Returns:
        ``None`` when the AIA file was rewritten with 7 channels. Otherwise a
        human-readable message: the file was skipped (first axis is not 6),
        the shapes did not match, an error occurred, or no 335 file with the
        same name exists and the AIA file (plus the SXR file of the same name,
        if present) was moved into ``missing_dir/AIA`` and ``missing_dir/SXR``.

    The function never raises for ordinary errors; they are returned as text
    so the parent process can print them.
    """
    aia_file, sdo335_path, sxr_dir, missing_dir = args
    aia_filename = aia_file.name  # Example: 2023-03-30T07:36:00.npy
    sdo335_file = Path(sdo335_path) / aia_filename
    sxr_file = Path(sxr_dir) / aia_filename
    try:
        if not sdo335_file.exists():
            # No 335 counterpart: move the AIA file and, if present, the SXR
            # file of the same name. Each move is best-effort; failures are
            # appended to the returned message.
            msg = f"Missing 335 data for {aia_file}, moving to {missing_dir}"
            try:
                shutil.move(str(aia_file), f"{missing_dir}/AIA/{aia_filename}")
            except Exception as e:
                msg += f"\nError moving AIA {aia_file}: {e}"
            if sxr_file.exists():
                try:
                    shutil.move(str(sxr_file), f"{missing_dir}/SXR/{aia_filename}")
                except Exception as e:
                    msg += f"\nError moving SXR {sxr_file}: {e}"
            return msg

        try:
            aia_data = np.load(aia_file)
            data_335 = np.load(sdo335_file)
            # Only untouched 6-channel files are extended; a file that already
            # has 7 channels (e.g. from a previous run) is reported and skipped.
            if aia_data.shape[0] != 6:
                return f"Skipping {aia_file}: expected 6 channels, found {aia_data.shape[0]}"
            # Drop a leading singleton axis only when the 335 array has the
            # same rank as the AIA array; a plain 2-D plane whose first
            # dimension happens to be 1 must not be collapsed.
            if data_335.ndim == aia_data.ndim and data_335.shape[0] == 1:
                data_335 = data_335[0]
            if aia_data.shape[1:] != data_335.shape:
                return f"Shape mismatch for {aia_file}: AIA shape={aia_data.shape}, 335 shape={data_335.shape}"
            # Every slot is filled below, so no zero-initialisation is needed.
            # The 335 values are cast to the AIA dtype on assignment.
            aia_data_new = np.empty((7,) + aia_data.shape[1:], dtype=aia_data.dtype)
            aia_data_new[:6] = aia_data
            aia_data_new[6] = data_335
            _save_npy_atomically(aia_file, aia_data_new)
        except Exception as e:
            return f"Error updating {aia_file}: {e}"
        return None  # Success
    except Exception as main_e:
        return f"General error with {aia_file}: {main_e}"

# Driver: for every split, run process_file over all AIA files in parallel and
# print each non-success message as it arrives.
#
# The __main__ guard is required for process pools on platforms that start
# workers by re-importing this module (spawn); without it each worker would
# re-run this loop. It is a no-op when run as a script or notebook cell.
if __name__ == "__main__":
    for subdir in AIA_SUBDIRS:
        aia_dir = Path(AIA_BASE) / subdir
        sxr_dir = Path(SXR_BASE) / subdir
        # Materialise the file list before any work starts: workers move and
        # rewrite files in this directory while the pool is running.
        inputs = [
            (aia_file, SDO335_PATH, sxr_dir, MISSING_DIR)
            for aia_file in aia_dir.glob("*.npy")
        ]
        with ProcessPoolExecutor() as executor:
            # Map each future back to its AIA path so a worker failure can be
            # attributed to a file.
            futures = {executor.submit(process_file, arg): arg[0] for arg in inputs}
            for fut in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {subdir}", unit="file"):
                try:
                    res = fut.result()
                except Exception as exc:
                    # process_file returns its own errors as text, so this only
                    # fires for pool-level failures (e.g. a worker that died).
                    res = f"Worker failed for {futures[fut]}: {exc}"
                if res is not None:
                    print(res)

