# Data Cleaner
Clean the data by dropping rows and columns that contain only NaN values and by replacing anomalously large values (spikes).

In [1]:
import os
import xarray as xr
import numpy as np
from tqdm import tqdm

In [2]:
# Folder that is searched for the per-band "<band>_time_series.nc" inputs.
input_dir = 'time_series_losangeles'

# Folder that receives the cleaned files; created up front if it is missing.
output_dir = 'time_series_cleaned_losangeles'
os.makedirs(output_dir, exist_ok=True)

# Values above this limit are treated as likely anomalies (spikes).
spike_threshold = 2000



In [3]:
def _despike_in_place(rad_data, spike_threshold, desc):
    """Replace values above ``spike_threshold`` in ``rad_data``, in place.

    A spike is replaced with the value of the same pixel at the previous
    timestep (forward fill). A spike in the very first timestep has no
    predecessor, so it is replaced with the first later value of that pixel
    that is not a spike. Pixels whose entire series exceeds the threshold
    have no clean value to borrow and are left unchanged.

    Args:
        rad_data: 3-D array indexed as (time, row, column); modified in place.
        spike_threshold: Values strictly greater than this are spikes.
        desc: Label shown on the tqdm progress bar.

    Returns:
        The number of values that were replaced.
    """
    n_steps = rad_data.shape[0]
    if n_steps == 0:
        return 0

    # NaN compares False against the threshold, so NaNs are never flagged.
    spike_mask = rad_data > spike_threshold

    # Nothing sensible can be substituted where every timestep is a spike;
    # unflag those pixels so they are neither touched nor counted.
    spike_mask[:, spike_mask.all(axis=0)] = False
    replaced = int(spike_mask.sum())

    # Fix the first timestep before forward filling, otherwise a spike at
    # t=0 would be copied into every spike that directly follows it.
    rows, cols = np.nonzero(spike_mask[0])
    if rows.size:
        first_clean = np.argmax(~spike_mask[:, rows, cols], axis=0)
        rad_data[0, rows, cols] = rad_data[first_clean, rows, cols]

    # Must run in time order: t - 1 has already been cleaned when t is handled,
    # so runs of consecutive spikes all receive the last clean value.
    for t in tqdm(range(1, n_steps), desc=desc):
        spikes = spike_mask[t]
        if spikes.any():
            rad_data[t][spikes] = rad_data[t - 1][spikes]

    return replaced


def DataCleaner(input_dir, output_dir, spike_threshold):
    """Clean the per-band time series files and write them to ``output_dir``.

    For each band ``C01`` .. ``C16`` the file ``<band>_time_series.nc`` is read
    from ``input_dir`` (bands without a file are skipped silently, files
    without a ``Rad`` variable are skipped with a message). Rows and columns
    in which every pixel is NaN at every timestep are dropped, values above
    ``spike_threshold`` are replaced (see ``_despike_in_place``), and the
    result is saved under the same file name in ``output_dir``.

    Args:
        input_dir: Directory containing the input NetCDF files.
        output_dir: Existing directory that receives the cleaned files.
        spike_threshold: ``Rad`` values strictly greater than this are
            treated as anomalous spikes.

    Raises:
        ValueError: If ``Rad`` is not a 3-D variable.
    """
    bands = [f"C{b:02}" for b in range(1, 17)]

    for band in bands:
        path = os.path.join(input_dir, f"{band}_time_series.nc")
        if not os.path.exists(path):
            continue

        print(f"Cleaning {band}")
        # The context manager closes the file on every exit path, including
        # the early `continue` below.
        with xr.open_dataset(path) as ds:
            if "Rad" not in ds:
                print(f"No 'Rad' in {band}, skipping.")
                continue

            # NOTE(review): assumes Rad is laid out as (time, y, x) with
            # dimensions named "y" and "x" -- confirm against the input files.
            rad_data = ds["Rad"].values
            if rad_data.ndim != 3:
                raise ValueError(
                    f"Expected 3-D 'Rad' in {band}, got shape {rad_data.shape}"
                )

            # Keep only rows/columns that hold at least one non-NaN value.
            nan_mask = np.all(np.isnan(rad_data), axis=0)  # (Y, X)
            valid_y = ~np.all(nan_mask, axis=1)
            valid_x = ~np.all(nan_mask, axis=0)

            # Boolean indexing returns a fresh array, so the in-place edits
            # below cannot alter the data held by `ds`.
            rad_data = rad_data[:, valid_y, :][:, :, valid_x]  # (T, new_Y, new_X)

            spike_count = _despike_in_place(
                rad_data, spike_threshold, desc=f'{band} - Despiking'
            )

            # Apply the same spatial subset to the whole dataset, then swap in
            # the cleaned values.
            ds_clean = ds.isel(x=np.flatnonzero(valid_x), y=np.flatnonzero(valid_y))
            ds_clean["Rad"].values = rad_data

            # Write while the source file is still open: other variables in
            # ds_clean may not have been read from disk yet.
            out_path = os.path.join(output_dir, f"{band}_time_series.nc")
            ds_clean.to_netcdf(out_path)

        print(f"Saved cleaned {band} to {out_path}. {spike_count} spikes replaced.")

In [4]:
# Run the cleaner over every band using the settings defined above.
DataCleaner(
    input_dir=input_dir,
    output_dir=output_dir,
    spike_threshold=spike_threshold,
)

Cleaning C01


C01 - Despiking: 100%|█████████████████| 14326/14326 [00:01<00:00, 11006.03it/s]


Saved cleaned C01 to time_series_cleaned_losangeles/C01_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C02


C02 - Despiking: 100%|██████████████████| 14323/14323 [00:05<00:00, 2613.20it/s]


Saved cleaned C02 to time_series_cleaned_losangeles/C02_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C03


C03 - Despiking: 100%|█████████████████| 14324/14324 [00:01<00:00, 11246.86it/s]


Saved cleaned C03 to time_series_cleaned_losangeles/C03_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C04


C04 - Despiking: 100%|████████████████| 14299/14299 [00:00<00:00, 102578.30it/s]


Saved cleaned C04 to time_series_cleaned_losangeles/C04_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C05


C05 - Despiking: 100%|█████████████████| 14321/14321 [00:01<00:00, 10808.60it/s]


Saved cleaned C05 to time_series_cleaned_losangeles/C05_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C06


C06 - Despiking: 100%|█████████████████| 14298/14298 [00:00<00:00, 99830.30it/s]


Saved cleaned C06 to time_series_cleaned_losangeles/C06_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C07


C07 - Despiking: 100%|█████████████████| 14300/14300 [00:00<00:00, 93921.93it/s]


Saved cleaned C07 to time_series_cleaned_losangeles/C07_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C08


C08 - Despiking: 100%|█████████████████| 14296/14296 [00:00<00:00, 99099.55it/s]


Saved cleaned C08 to time_series_cleaned_losangeles/C08_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C09


C09 - Despiking: 100%|████████████████| 14303/14303 [00:00<00:00, 114353.55it/s]


Saved cleaned C09 to time_series_cleaned_losangeles/C09_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C10


C10 - Despiking: 100%|████████████████| 14298/14298 [00:00<00:00, 100824.92it/s]


Saved cleaned C10 to time_series_cleaned_losangeles/C10_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C11


C11 - Despiking: 100%|█████████████████| 14304/14304 [00:00<00:00, 94180.93it/s]


Saved cleaned C11 to time_series_cleaned_losangeles/C11_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C12


C12 - Despiking: 100%|████████████████| 14303/14303 [00:00<00:00, 105110.40it/s]


Saved cleaned C12 to time_series_cleaned_losangeles/C12_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C13


C13 - Despiking: 100%|█████████████████| 14294/14294 [00:00<00:00, 87127.87it/s]


Saved cleaned C13 to time_series_cleaned_losangeles/C13_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C14


C14 - Despiking: 100%|████████████████| 14303/14303 [00:00<00:00, 114082.38it/s]


Saved cleaned C14 to time_series_cleaned_losangeles/C14_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C15


C15 - Despiking: 100%|████████████████| 14303/14303 [00:00<00:00, 113511.86it/s]


Saved cleaned C15 to time_series_cleaned_losangeles/C15_time_series.nc.0 spikes replaced.
0 spikes replaced.
Cleaning C16


C16 - Despiking: 100%|████████████████| 14296/14296 [00:00<00:00, 100734.94it/s]


Saved cleaned C16 to time_series_cleaned_losangeles/C16_time_series.nc.0 spikes replaced.
0 spikes replaced.
