# First Model: CNN

## Import packages

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import itertools
import os
import glob 
import seaborn as sns
import tensorflow as tf
import multiprocessing as mp
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model
from astropy.stats import sigma_clip
from tqdm import tqdm
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed

# Notebook-wide display settings: show every DataFrame column and use the
# dark seaborn theme with the 'muted' colour palette.
pd.set_option('display.max_columns', None)
sns.set_theme(style='dark')
palette = sns.color_palette('muted')

# Record the TensorFlow version in the notebook output.
print('TF version:', tf.__version__)

## Load and calibrate the data

In [None]:
# Competition input directory and scratch directory for intermediate arrays.
path_folder = '/kaggle/input/ariel-data-challenge-2025/' 
path_out = '/kaggle/tmp/data_light_raw/'

# Create the scratch directory on first run and report which case occurred.
if os.path.exists(path_out):
    print(f"Directory {path_out} already exists.")
else:
    os.makedirs(path_out)
    print(f"Directory {path_out} created.")

# Number of observations that are processed and saved together.
CHUNKS_SIZE = 4

In [None]:
def ADC_convert(signal, gain=0.4369, offset=-1000):
    """Revert the detector's analog-to-digital conversion.

    The input is cast to float64 (the caller's array is left untouched),
    divided by ``gain`` and then shifted by ``offset``. The default gain and
    offset are hard-coded here.

    Returns the converted float64 array.
    """
    restored = signal.astype(np.float64)
    return restored / gain + offset

def mask_hot_dead(signal, dead, dark):
    """Return ``signal`` as a masked array with dead and hot pixels masked.

    Hot pixels are the entries of ``dark`` rejected by a 5-sigma clip
    (at most 5 iterations). Both 2-D pixel maps are repeated along the
    first axis of ``signal`` so that every frame gets the same mask.
    """
    n_frames = signal.shape[0]
    hot_map = sigma_clip(dark, sigma=5, maxiters=5).mask

    hot_cube = np.tile(hot_map, (n_frames, 1, 1))
    dead_cube = np.tile(dead, (n_frames, 1, 1))

    # Mask dead pixels first; the second call adds the hot pixels on top.
    without_dead = np.ma.masked_where(dead_cube, signal)
    return np.ma.masked_where(hot_cube, without_dead)

def apply_linear_corr(linear_corr, clean_signal):
    """Apply the per-pixel polynomial linearity correction in place.

    Parameters
    ----------
    linear_corr : array, shape (n_coeffs, height, width)
        Polynomial coefficients of every pixel, lowest order first (they are
        flipped here so the highest order comes first).
    clean_signal : array, shape (n_frames, height, width)
        Frames to correct. Overwritten with the corrected values.

    Returns
    -------
    The same ``clean_signal`` object, after correction.
    """
    coeffs = np.flip(linear_corr, axis=0)
    n_cols = clean_signal.shape[2]

    # Evaluate the polynomials with Horner's scheme (the recurrence used by
    # np.polyval), one detector row at a time: this replaces one np.poly1d
    # per pixel by a handful of array operations per row while keeping the
    # temporaries small.
    for row in range(clean_signal.shape[1]):
        values = clean_signal[:, row, :]
        corrected = np.zeros_like(values)
        for coeff in coeffs[:, row, :n_cols]:
            corrected = corrected * values + coeff
        clean_signal[:, row, :] = corrected
    return clean_signal

def clean_dark(signal, dead, dark, dt):
    """Subtract the dark frame, scaled by each frame's ``dt``, from ``signal``.

    ``dark`` is masked where ``dead`` is set, repeated once per frame and
    multiplied by the per-frame value in ``dt`` before being subtracted in
    place. Returns ``signal``.
    """
    n_frames = signal.shape[0]
    masked_dark = np.ma.masked_where(dead, dark)
    dark_cube = np.tile(masked_dark, (n_frames, 1, 1))
    # Shape (n_frames, 1, 1) so each frame is scaled by its own dt.
    per_frame_dt = dt[:, np.newaxis, np.newaxis]
    signal -= dark_cube * per_frame_dt
    return signal

def get_cds(signal):
    """Return the difference between consecutive frame pairs along axis 1.

    For a 4-D input, frame ``2k`` is subtracted from frame ``2k + 1``, which
    halves the length of axis 1.
    """
    odd_frames = signal[:, 1::2, :, :]
    even_frames = signal[:, ::2, :, :]
    return odd_frames - even_frames

def bin_obs(cds_signal, binning):
    """Sum groups of ``binning`` consecutive frames along axis 1.

    The last two axes of the 4-D input are swapped first. Frames left over
    after the last complete group are discarded. The result is a new float64
    array of shape ``(n_obs, n_frames // binning, dim3, dim2)``.
    """
    swapped = cds_signal.transpose(0, 1, 3, 2)
    n_obs, n_frames, n_rows, n_cols = swapped.shape
    n_bins = n_frames // binning

    binned = np.zeros((n_obs, n_bins, n_rows, n_cols))
    for b in range(n_bins):
        window = swapped[:, b * binning:(b + 1) * binning, :, :]
        binned[:, b, :, :] = window.sum(axis=1)
    return binned

def correct_flat_field(flat, dead, signal):
    """Divide every frame of ``signal`` by the flat field, in place.

    ``flat`` and ``dead`` are 2-D maps; both are transposed to match the
    frame layout of ``signal``, the flat is masked where ``dead`` is set and
    repeated once per frame. Returns ``signal``.
    """
    n_frames = signal.shape[0]
    masked_flat = np.ma.masked_where(dead.transpose(1, 0), flat.transpose(1, 0))
    flat_cube = np.tile(masked_flat, (n_frames, 1, 1))
    signal /= flat_cube
    return signal

def get_index(files, CHUNKS_SIZE):
    """Collect the planet ids found in ``files`` and split them into chunks.

    Parameters
    ----------
    files : iterable of str
        Paths of the form ``.../<planet_id>/<file_name>``. Only files named
        ``AIRS-CH0_signal_0.parquet`` contribute an id (the name of their
        parent directory, converted to ``int``).
    CHUNKS_SIZE : int
        Target number of ids per chunk.

    Returns
    -------
    list of numpy.ndarray
        The sorted ids split with ``np.array_split`` into
        ``len(ids) // CHUNKS_SIZE`` chunks. When the number of ids is not a
        multiple of ``CHUNKS_SIZE`` some chunks hold more than
        ``CHUNKS_SIZE`` ids. Fewer than ``CHUNKS_SIZE`` ids yield a single
        chunk, and no ids yield an empty list.
    """
    # Compare the whole file name rather than indexing its '_'-separated
    # parts, which raised IndexError on names with fewer than three parts.
    planet_ids = sorted(
        int(os.path.basename(os.path.dirname(path)))
        for path in files
        if os.path.basename(path) == 'AIRS-CH0_signal_0.parquet'
    )
    if not planet_ids:
        return []

    index = np.array(planet_ids)
    # credit to DennisSakva
    # np.array_split rejects 0 sections, so always ask for at least one.
    n_chunks = max(1, len(index) // CHUNKS_SIZE)
    return np.array_split(index, n_chunks)

In [None]:
# Every file that sits one directory level below the train folder.
train_dir = path_folder + 'train/'
files = glob.glob(os.path.join(train_dir, '*/*'))

In [None]:
# Chunked planet ids of the training set; print the size of the first chunk.
index = get_index(files, CHUNKS_SIZE)
first_chunk = index[0]
print(len(first_chunk))

In [None]:
axis_info = pd.read_parquet(os.path.join(path_folder, 'axis_info.parquet'))

# Switches for the optional calibration steps of the pipeline.
DO_MASK, DO_THE_NL_CORR, DO_DARK, DO_FLAT = False, False, False, False
TIME_BINNING = True

# Column window kept from every AIRS frame, and its width.
cut_inf = 39
cut_sup = 321
l = cut_sup - cut_inf

In [None]:
def load_calibration_data_batch(path_folder, index_chunk, cut_inf, cut_sup, dataset):
    """Read the calibration tables of every observation in ``index_chunk``.

    For each id the flat, dark, dead and linear_corr parquet files of both
    instruments are read from
    ``<path_folder>/<dataset>/<id>/<instrument>_calibration_0/``, cast to
    float32 and reshaped. AIRS tables are additionally cropped to the columns
    ``cut_inf:cut_sup`` along their last axis.

    Returns a dict ``{id: {'airs_flat', 'airs_dark', 'airs_dead',
    'airs_linear', 'fgs_flat', 'fgs_dark', 'fgs_dead', 'fgs_linear'}}``.
    """
    # (key prefix, instrument directory prefix, frame shape, crop columns?)
    instruments = (
        ('airs', 'AIRS-CH0', (32, 356), True),
        ('fgs', 'FGS1', (32, 32), False),
    )
    # (key suffix, parquet file stem, extra leading dimensions)
    tables = (
        ('flat', 'flat', ()),
        ('dark', 'dark', ()),
        ('dead', 'dead', ()),
        ('linear', 'linear_corr', (6,)),
    )

    calibration_data = {}
    for idx in index_chunk:
        per_observation = {}
        for prefix, instrument, frame_shape, crop in instruments:
            for suffix, stem, leading in tables:
                relative = f'{dataset}/{idx}/{instrument}_calibration_0/{stem}.parquet'
                table = pd.read_parquet(os.path.join(path_folder, relative))
                values = table.values.astype(np.float32).reshape(leading + frame_shape)
                if crop:
                    values = values[..., cut_inf:cut_sup]
                per_observation[f'{prefix}_{suffix}'] = values
        calibration_data[idx] = per_observation

    return calibration_data

In [None]:
def process_single_observation(args):
    """Load and calibrate the AIRS-CH0 and FGS1 frames of one train observation.

    Parameters
    ----------
    args : tuple
        ``(i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
        calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK)`` packed into a
        single tuple so the function can be submitted to an executor.
        ``l`` is accepted for compatibility but not used.

    Returns
    -------
    tuple
        ``(i, airs_result, fgs_result)``: the position of the observation in
        its chunk, the AIRS frames cropped to columns ``cut_inf:cut_sup``,
        and the FGS1 frames. Both have gone through ADC conversion and the
        calibration steps enabled by the ``DO_*`` flags.
    """
    (i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
     calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK) = args

    idx = index_chunk[i]
    calibration = calibration_data[idx]

    def _calibrate(frames, dead, dark, linear_corr, dt):
        """Run the enabled mask / linearity / dark steps, in that order."""
        if DO_MASK:
            frames = mask_hot_dead(frames, dead, dark)
        if DO_THE_NL_CORR:
            frames = apply_linear_corr(linear_corr, frames)
        if DO_DARK:
            frames = clean_dark(frames, dead, dark, dt)
        return frames

    # AIRS Processing
    df = pd.read_parquet(os.path.join(path_folder, f'train/{idx}/AIRS-CH0_signal_0.parquet'))
    signal = df.values.astype(np.float32).reshape((df.shape[0], 32, 356))
    del df
    signal = ADC_convert(signal)

    # Take an explicit copy before editing: the array behind a pandas object
    # can be read-only (Copy-on-Write) and must not be shared between calls.
    dt_airs = axis_info['AIRS-CH0-integration_time'].dropna().to_numpy(copy=True)
    dt_airs[1::2] += 0.1

    airs_result = _calibrate(
        signal[:, :, cut_inf:cut_sup],
        calibration['airs_dead'],
        calibration['airs_dark'],
        calibration['airs_linear'],
        dt_airs,
    )
    del signal

    # FGS1 Processing
    df = pd.read_parquet(os.path.join(path_folder, f'train/{idx}/FGS1_signal_0.parquet'))
    fgs_signal = df.values.astype(np.float32).reshape((df.shape[0], 32, 32))
    del df
    fgs_signal = ADC_convert(fgs_signal)

    # 0.1 for even frames and 0.2 for odd frames.
    dt_fgs1 = np.ones(len(fgs_signal)) * 0.1
    dt_fgs1[1::2] += 0.1

    fgs_result = _calibrate(
        fgs_signal,
        calibration['fgs_dead'],
        calibration['fgs_dark'],
        calibration['fgs_linear'],
        dt_fgs1,
    )

    return i, airs_result, fgs_result

In [None]:
def process_single_observation_test(args):
    """Load and calibrate the AIRS-CH0 and FGS1 frames of one test observation.

    Parameters
    ----------
    args : tuple
        ``(i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
        calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK)`` packed into a
        single tuple so the function can be submitted to an executor.
        ``l`` is accepted for compatibility but not used.

    Returns
    -------
    tuple
        ``(i, airs_result, fgs_result)``: the position of the observation in
        its chunk, the AIRS frames cropped to columns ``cut_inf:cut_sup``,
        and the FGS1 frames. Both have gone through ADC conversion and the
        calibration steps enabled by the ``DO_*`` flags.
    """
    (i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
     calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK) = args

    idx = index_chunk[i]
    calibration = calibration_data[idx]

    def _calibrate(frames, dead, dark, linear_corr, dt):
        """Run the enabled mask / linearity / dark steps, in that order."""
        if DO_MASK:
            frames = mask_hot_dead(frames, dead, dark)
        if DO_THE_NL_CORR:
            frames = apply_linear_corr(linear_corr, frames)
        if DO_DARK:
            frames = clean_dark(frames, dead, dark, dt)
        return frames

    # AIRS Processing
    df = pd.read_parquet(os.path.join(path_folder, f'test/{idx}/AIRS-CH0_signal_0.parquet'))
    signal = df.values.astype(np.float32).reshape((df.shape[0], 32, 356))
    del df
    signal = ADC_convert(signal)

    # Take an explicit copy before editing: the array behind a pandas object
    # can be read-only (Copy-on-Write) and must not be shared between calls.
    dt_airs = axis_info['AIRS-CH0-integration_time'].dropna().to_numpy(copy=True)
    dt_airs[1::2] += 0.1

    airs_result = _calibrate(
        signal[:, :, cut_inf:cut_sup],
        calibration['airs_dead'],
        calibration['airs_dark'],
        calibration['airs_linear'],
        dt_airs,
    )
    del signal

    # FGS1 Processing
    df = pd.read_parquet(os.path.join(path_folder, f'test/{idx}/FGS1_signal_0.parquet'))
    fgs_signal = df.values.astype(np.float32).reshape((df.shape[0], 32, 32))
    del df
    fgs_signal = ADC_convert(fgs_signal)

    # 0.1 for even frames and 0.2 for odd frames.
    dt_fgs1 = np.ones(len(fgs_signal)) * 0.1
    dt_fgs1[1::2] += 0.1

    fgs_result = _calibrate(
        fgs_signal,
        calibration['fgs_dead'],
        calibration['fgs_dark'],
        calibration['fgs_linear'],
        dt_fgs1,
    )

    return i, airs_result, fgs_result

In [None]:
# Calibrate the training observations chunk by chunk and save one pair of
# .npy files (AIRS, FGS1) per chunk in path_out.
for n, index_chunk in enumerate(tqdm(index)):
    # Size everything from the chunk itself: np.array_split can return chunks
    # longer than CHUNKS_SIZE, and sizing by CHUNKS_SIZE would silently drop
    # the extra observations.
    chunk_len = len(index_chunk)

    # Load all calibration data once at the beginning
    calibration_data = load_calibration_data_batch(path_folder, index_chunk, cut_inf, cut_sup, 'train')

    # Pre-allocate output arrays (11250 AIRS frames / 135000 FGS1 frames per observation)
    AIRS_CH0_clean = np.zeros((chunk_len, 11250, 32, l), dtype=np.float32)
    FGS1_clean = np.zeros((chunk_len, 135000, 32, 32), dtype=np.float32)

    # At most two worker threads at a time.
    num_workers = min(2, chunk_len)

    # One argument tuple per observation of the chunk
    args_list = [
        (i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
         calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK)
        for i in range(chunk_len)
    ]

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_single_observation, args) for args in args_list]

        # Each result carries its own position i, so it can be stored as
        # soon as it completes; no intermediate list or sort is needed.
        for future in tqdm(as_completed(futures), total=chunk_len, desc="Processing observations"):
            i, airs_result, fgs_result = future.result()
            AIRS_CH0_clean[i] = airs_result
            FGS1_clean[i] = fgs_result

    # 5. Get Correlated Double Sampling
    AIRS_cds = get_cds(AIRS_CH0_clean)
    FGS1_cds = get_cds(FGS1_clean)

    del AIRS_CH0_clean, FGS1_clean

    # 6. (Optional) Time Binning (to reduce space)
    if TIME_BINNING:
        AIRS_cds_binned = bin_obs(AIRS_cds, binning=30)
        FGS1_cds_binned = bin_obs(FGS1_cds, binning=30*12)
    else:
        # Apply the same axis swap that bin_obs performs.
        AIRS_cds = AIRS_cds.transpose(0,1,3,2)
        AIRS_cds_binned = AIRS_cds
        FGS1_cds = FGS1_cds.transpose(0,1,3,2)
        FGS1_cds_binned = FGS1_cds

    del AIRS_cds, FGS1_cds

    # 7. Flat Field Correction - use pre-loaded calibration data
    if DO_FLAT:
        for i in range(chunk_len):
            calibration = calibration_data[index_chunk[i]]
            AIRS_cds_binned[i] = correct_flat_field(
                calibration['airs_flat'], calibration['airs_dead'], AIRS_cds_binned[i])
            FGS1_cds_binned[i] = correct_flat_field(
                calibration['fgs_flat'], calibration['fgs_dead'], FGS1_cds_binned[i])

    # Save data
    np.save(os.path.join(path_out, f'AIRS_clean_train_{n}.npy'), AIRS_cds_binned)
    np.save(os.path.join(path_out, f'FGS1_train_{n}.npy'), FGS1_cds_binned)
    del AIRS_cds_binned, FGS1_cds_binned, calibration_data

In [None]:
# Remember the chunk size used for the training files, then process the test
# observations one per chunk.
TRAIN_CHUNKS_SIZE = CHUNKS_SIZE
CHUNKS_SIZE = 1
test_files = glob.glob(os.path.join(path_folder + 'test/', '*/*'))
test_index = get_index(test_files, CHUNKS_SIZE)

# Calibrate the test observations chunk by chunk and save one pair of .npy
# files (AIRS, FGS1) per chunk in path_out.
for n, index_chunk in enumerate(tqdm(test_index)):
    # Size everything from the chunk itself rather than from CHUNKS_SIZE so
    # the arrays always match the ids that are actually processed.
    chunk_len = len(index_chunk)

    # Load all calibration data once at the beginning
    calibration_data = load_calibration_data_batch(path_folder, index_chunk, cut_inf, cut_sup, 'test')

    # Pre-allocate output arrays (11250 AIRS frames / 135000 FGS1 frames per observation)
    AIRS_CH0_clean = np.zeros((chunk_len, 11250, 32, l), dtype=np.float32)
    FGS1_clean = np.zeros((chunk_len, 135000, 32, 32), dtype=np.float32)

    # At most two worker threads at a time.
    num_workers = min(2, chunk_len)

    # One argument tuple per observation of the chunk
    args_list = [
        (i, index_chunk, path_folder, cut_inf, cut_sup, l, axis_info,
         calibration_data, DO_MASK, DO_THE_NL_CORR, DO_DARK)
        for i in range(chunk_len)
    ]

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_single_observation_test, args) for args in args_list]

        # Each result carries its own position i, so it can be stored as
        # soon as it completes; no intermediate list or sort is needed.
        for future in tqdm(as_completed(futures), total=chunk_len, desc="Processing observations"):
            i, airs_result, fgs_result = future.result()
            AIRS_CH0_clean[i] = airs_result
            FGS1_clean[i] = fgs_result

    # 5. Get Correlated Double Sampling
    AIRS_cds = get_cds(AIRS_CH0_clean)
    FGS1_cds = get_cds(FGS1_clean)

    del AIRS_CH0_clean, FGS1_clean

    # 6. (Optional) Time Binning (to reduce space)
    if TIME_BINNING:
        AIRS_cds_binned = bin_obs(AIRS_cds, binning=30)
        FGS1_cds_binned = bin_obs(FGS1_cds, binning=30*12)
    else:
        # Apply the same axis swap that bin_obs performs.
        AIRS_cds = AIRS_cds.transpose(0,1,3,2)
        AIRS_cds_binned = AIRS_cds
        FGS1_cds = FGS1_cds.transpose(0,1,3,2)
        FGS1_cds_binned = FGS1_cds

    del AIRS_cds, FGS1_cds

    # 7. Flat Field Correction - use pre-loaded calibration data
    if DO_FLAT:
        for i in range(chunk_len):
            calibration = calibration_data[index_chunk[i]]
            AIRS_cds_binned[i] = correct_flat_field(
                calibration['airs_flat'], calibration['airs_dead'], AIRS_cds_binned[i])
            FGS1_cds_binned[i] = correct_flat_field(
                calibration['fgs_flat'], calibration['fgs_dead'], FGS1_cds_binned[i])

    # Save data
    np.save(os.path.join(path_out, f'AIRS_clean_test_{n}.npy'), AIRS_cds_binned)
    np.save(os.path.join(path_out, f'FGS1_test_{n}.npy'), FGS1_cds_binned)
    del AIRS_cds_binned, FGS1_cds_binned, calibration_data

In [None]:
def load_data(file, chunk_size, nb_files):
    """Stack the chunk files ``<file>_0.npy`` ... ``<file>_<nb_files-1>.npy``.

    Each file is expected to hold ``chunk_size`` rows of 4-D data; file ``k``
    fills rows ``k * chunk_size`` to ``(k + 1) * chunk_size`` of the result.
    The trailing dimensions are taken from the first file.

    Returns a float64 array with ``nb_files * chunk_size`` rows.
    """
    first = np.load(f'{file}_0.npy')
    trailing_dims = tuple(first.shape[axis] for axis in (1, 2, 3))
    stacked = np.zeros((nb_files * chunk_size,) + trailing_dims)

    stacked[:chunk_size] = first
    for k in range(1, nb_files):
        start = k * chunk_size
        stacked[start:start + chunk_size] = np.load(f'{file}_{k}.npy')
    return stacked

# One file per training chunk was written; read them all back.
train_file_count = len(index)
data_train_AIRS = load_data(path_out + 'AIRS_clean_train', TRAIN_CHUNKS_SIZE, train_file_count)
data_train_FGS = load_data(path_out + 'FGS1_train', TRAIN_CHUNKS_SIZE, train_file_count)

print(data_train_AIRS.shape)
print(data_train_FGS.shape)

In [None]:
# One file is written per test chunk, so read all of them rather than only
# file 0; otherwise a single observation's prediction would be reused for
# every planet in the submission.
test_file_count = len(test_index)
data_test_AIRS = load_data(path_out + 'AIRS_clean_test', CHUNKS_SIZE, test_file_count)
data_test_FGS = load_data(path_out + 'FGS1_test', CHUNKS_SIZE, test_file_count)

print(data_test_AIRS.shape)
print(data_test_FGS.shape)

In [None]:
# Ground-truth table, indexed by planet id.
df_train = pd.read_csv(path_folder + 'train.csv')
df_train.set_index('planet_id', inplace=True)

planet_ids = np.concatenate(index)

# Keep only the planets that were processed AND put the label rows in the
# same order as the stacked data arrays (the order of planet_ids). A plain
# isin() filter keeps the CSV order, which would misalign features and
# labels if the CSV is not sorted by planet id.
ordered_ids = pd.Index(planet_ids)
df_train = df_train.loc[ordered_ids[ordered_ids.isin(df_train.index)]]

print(df_train.shape)
df_train.head()

## Split into train and validation sets

In [None]:
# Number of rows that go to the training split (80% of the labelled planets).
train_fraction = .8
n = round(train_fraction * len(df_train))
n

In [None]:
# Sequential split: the first n rows train the model, the rest validate it.
train_AIRS, val_AIRS = data_train_AIRS[:n], data_train_AIRS[n:]
print(len(train_AIRS), len(val_AIRS))

train_FGS, val_FGS = data_train_FGS[:n], data_train_FGS[n:]
print(len(train_FGS), len(val_FGS))

train_labels, val_labels = df_train.iloc[:n, :], df_train.iloc[n:, :]
print(train_labels.shape, val_labels.shape)

## Define the model

In [None]:
# Show the shapes of the model inputs and of the label table.
for split in (train_AIRS, train_labels):
    print(split.shape)

In [None]:
# Take the input shape from the prepared data instead of hard-coding it: with
# the preprocessing above (11250 reads -> 5625 CDS frames -> 187 bins of 30)
# a fixed (375, 282, 32) would not match the arrays passed to fit/predict.
inputs = Input(shape=train_AIRS.shape[1:], name='inputs')

# Convolutional feature extractor
x = Conv2D(32, (3, 3), activation='relu')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = Flatten()(x)
x = Dense(64, activation='relu')(x)

# Two output heads with 283 values each: the mean, and a log-variance that is
# turned into a strictly positive standard deviation via exp(0.5 * value).
mean_output = Dense(283, activation='linear', name='mean')(x)
log_std_output = Dense(283, activation='linear', name='log_std')(x)
std_output = Lambda(lambda log_var: tf.exp(0.5 * log_var), name='std')(log_std_output)

# Concatenate outputs for submission: [means | stds]
outputs = Concatenate(name='outputs')([mean_output, std_output])

model = Model(inputs=inputs, outputs=outputs)
model.summary()

## Compile and train the model

In [None]:
def nll_loss(y_true, y_pred):
    """Mean Gaussian negative log-likelihood.

    ``y_pred`` holds the 283 predicted means in its first 283 columns and the
    283 predicted standard deviations in the remaining columns.
    """
    mu = y_pred[:, :283]
    std = y_pred[:, 283:]
    log_term = 0.5 * tf.math.log(2 * np.pi * std**2)
    residual_term = 0.5 * ((y_true - mu)**2 / std**2)
    return tf.reduce_mean(log_term + residual_term)

model.compile(optimizer='adam', loss=nll_loss)

In [None]:
# Train on the AIRS data only, monitoring the held-out split.
validation_set = (val_AIRS, val_labels.values)
model.fit(
    train_AIRS,
    train_labels.values,
    validation_data=validation_set,
    epochs=10,
    batch_size=32,
    verbose=0,
)

## Generate predictions

In [None]:
# Predict on the test data: the first 283 columns are the means, the
# remaining columns the standard deviations (forced non-negative).
predictions = model.predict(data_test_AIRS)
means = predictions[:, :283]
stds = np.abs(predictions[:, 283:])

# test_index is a list of per-chunk id arrays; flatten it when non-empty.
test_index_flat = (
    np.concatenate(test_index)
    if isinstance(test_index, list) and len(test_index) > 0
    else test_index
)

# Start from the sample submission so that the column layout matches it.
df_sample = pd.read_csv(path_folder + 'sample_submission.csv')
df_submission = df_sample.copy()
df_submission['planet_id'] = test_index_flat

# Column 0 is planet_id, followed by 283 mean and 283 std columns.
df_submission.iloc[:, 1:284] = means.astype(np.float32)
df_submission.iloc[:, 284:567] = stds.astype(np.float32)

# Replace inf and NaN values by 0.0
df_submission = df_submission.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Cast every prediction column (everything except planet_id) to float32
df_submission = df_submission.astype(
    {col: np.float32 for col in df_submission.columns[1:]}
)

# Verify
print(df_submission.shape)
df_submission.head()

In [None]:
# Write the submission file without the DataFrame index.
submission_path = '/kaggle/working/submission.csv'
df_submission.to_csv(submission_path, index=False)