In [None]:
import pandas as pd

In [None]:
# Load the prediction table; later cells read its 'timestamp', 'groundtruth'
# and 'predictions' columns. No parse_dates argument is passed, so the
# 'timestamp' column is not converted to datetimes at load time.
sxr = pd.read_csv('/mnt/data/batch_results/ViT/ViT_predictions.csv')


In [None]:
# Simple plot to see the data: ground truth and prediction over time on a
# logarithmic flux axis.

# `go` is not imported anywhere else in the visible notebook, so import it
# here to keep this cell runnable after a kernel restart.
import plotly.graph_objects as go

fig = go.Figure()

# One line trace per series, sharing the timestamp axis.
for column, label, color in (
    ('groundtruth', 'Ground Truth', 'blue'),
    ('predictions', 'Predicted', 'red'),
):
    fig.add_trace(go.Scatter(
        x=sxr['timestamp'],
        y=sxr[column],
        mode='lines',
        name=label,
        line=dict(color=color, width=1)
    ))

fig.update_layout(
    title='Ground Truth vs Predicted SXR Flux',
    xaxis_title='Timestamp',
    yaxis_title='SXR Flux',
    legend=dict(x=0, y=1),
    template='plotly_white',
    height=500,
    width=1000
)

# Set yscale to log
fig.update_yaxes(type='log')

fig.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

# Where the rendered outlier images are written; created up front so the
# save calls further down do not fail on a missing directory.
OUT_DIR = "/mnt/data/OUTLIERS"
os.makedirs(OUT_DIR, exist_ok=True)

# Directory that is searched for `<timestamp>.npy` AIA arrays.
AIA_DIR = "/mnt/data/NO-OVERLAP/AIA/test"

def save_aia_image_for_timestamp(ts, out_path):
    """Render the AIA array stored for ``ts`` as PNG images.

    The array is looked up as ``AIA_DIR/<timestamp>.npy`` with the timestamp
    formatted as ``YYYY-MM-DDTHH:MM:SS``. For every channel one image is
    written next to ``out_path`` with ``_<wavelength>`` inserted before the
    file extension, and a six-panel overview of all channels is written to
    ``out_path`` itself.

    Parameters
    ----------
    ts : str, numpy.datetime64 or pandas.Timestamp
        Observation time. Strings may use a space instead of ``T`` and may
        omit the seconds (``YYYY-MM-DD HH:MM``).
    out_path : str
        Target path of the overview image (normally ending in ``.png``).

    Returns
    -------
    None
        Progress is reported with ``print``. When no array exists for the
        timestamp a message is printed and nothing is written.

    Notes
    -----
    The first six planes of the loaded array are assumed to be 2-D images in
    the channel order 94, 131, 171, 193, 211, 304 -- TODO confirm against the
    code that writes the ``.npy`` files.
    """
    # Convert timestamp to the string used in the file names. numpy
    # datetimes go through pandas so that sub-second units do not leak
    # fractional seconds into the name.
    if isinstance(ts, np.datetime64):
        ts = pd.Timestamp(ts)
    if isinstance(ts, pd.Timestamp):
        ts = ts.strftime("%Y-%m-%dT%H:%M:%S")
    else:
        ts = str(ts)
        if " " in ts and "T" not in ts:
            ts = ts.replace(" ", "T")
        if len(ts) == 16:
            # "YYYY-MM-DDTHH:MM" -> append the missing seconds
            ts = ts + ":00"

    aia_path = os.path.join(AIA_DIR, f"{ts}.npy")
    if not os.path.exists(aia_path):
        print(f"No AIA file found for {ts}")
        return

    aia_img = np.load(aia_path)
    # Planes 0-5 correspond to 94, 131, 171, 193, 211, 304
    aia_channels = [94, 131, 171, 193, 211, 304]

    # Split once so only the real extension is touched when building the
    # per-channel names (str.replace would rewrite every ".png" occurrence
    # and silently reuse out_path when there is none).
    root, ext = os.path.splitext(out_path)

    scaled_channels = []
    for i, ch in enumerate(aia_channels):
        img = aia_img[i]
        # Use tanh scaling for better contrast. Guard the normalisation so an
        # all-zero or all-NaN plane does not divide by 0/NaN.
        scale = np.nanmax(np.abs(img))
        if not np.isfinite(scale) or scale == 0:
            scale = 1.0
        img_tanh = np.tanh(img / scale)
        scaled_channels.append(img_tanh)

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.imshow(img_tanh, cmap='gray')
        # \u00c5 is the Angstrom sign; written as an escape so it survives
        # re-encoding of the notebook.
        ax.set_title(f"AIA {ch}\u00c5 image at {ts}")
        ax.axis('off')
        fig.tight_layout()
        # Save with channel in filename
        ch_out_path = f"{root}_{ch}{ext}"
        fig.savefig(ch_out_path, bbox_inches='tight', dpi=150)
        plt.close(fig)
        print(f"Saved: {ch_out_path}")

    # Overview at out_path: all channels side by side. Previously an empty
    # figure was saved here, producing a blank image.
    fig, axes = plt.subplots(2, 3, figsize=(12, 8))
    for ax, ch, img_tanh in zip(axes.ravel(), aia_channels, scaled_channels):
        ax.imshow(img_tanh, cmap='gray')
        ax.set_title(f"{ch}\u00c5")
        ax.axis('off')
    fig.suptitle(f"AIA image at {ts}")
    fig.tight_layout()
    fig.savefig(out_path, bbox_inches='tight', dpi=150)
    plt.close(fig)
    print(f"Saved: {out_path}")

# Only keep points where prediction and ground truth are off by at least a
# factor of OUTLIER_RATIO in either direction (ratio >= OUTLIER_RATIO or
# <= 1/OUTLIER_RATIO). Both values must be > 0 to avoid division by zero.
OUTLIER_RATIO = 8

mask = (
    (sxr['groundtruth'] > 0) &
    (sxr['predictions'] > 0) &
    (
        (sxr['predictions'] / sxr['groundtruth'] >= OUTLIER_RATIO) |
        (sxr['groundtruth'] / sxr['predictions'] >= OUTLIER_RATIO)
    )
)
# Report a count rather than dumping the whole boolean Series.
print(f"{int(mask.sum())} of {len(mask)} rows are off by >= {OUTLIER_RATIO}x")
sxr_outlier = sxr[mask].reset_index(drop=True)

# Save AIA images for each outlier timestamp
for _, row in sxr_outlier.iterrows():
    ts = row['timestamp']
    # Make a safe filename: same "T" separator / seconds normalisation that
    # save_aia_image_for_timestamp applies, with ':' replaced further down.
    ts_str = str(ts)
    if " " in ts_str and "T" not in ts_str:
        ts_str = ts_str.replace(" ", "T")
    if len(ts_str) == 16:
        ts_str = ts_str + ":00"
    # Add info about the error for easier inspection
    gt = row['groundtruth']
    pred = row['predictions']
    # The mask above guarantees gt > 0, so the ratio is always defined.
    ratio = pred / gt
    out_path = os.path.join(
        OUT_DIR,
        f"aia_{ts_str.replace(':','-')}_gt_{gt:.2e}_pred_{pred:.2e}_ratio_{ratio:.1f}.png"
    )
    save_aia_image_for_timestamp(ts, out_path)

In [None]:
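# Example outlier image to inspect (file name follows the pattern written by the export cell above):
# /mnt/data/OUTLIERS/aia_2024-11-08T03-09-00_gt_1.06e-05_pred_1.27e-06_ratio_0.1_211.png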

In [None]:
# Load the raw AIA array for 2024-11-08T03:09:00 (the timestamp of the outlier
# image noted above) for manual inspection.
a = np.load("/mnt/data/NO-OVERLAP/AIA/test/2024-11-08T03:09:00.npy")

In [None]:
# Show plane 4 of the loaded array; the channel lists in this notebook map
# index 4 to the 211 Angstrom channel.
plt.imshow(a[4])

In [None]:
# Mean of plane 4 (211 Angstrom per the notebook's channel order) of the
# array currently bound to `a`.
a[4].mean()

In [None]:
# Rebind `a` to the array stored for 2023-08-08T03:09:00 -- presumably a
# reference frame to compare against the 2024-11-08 outlier (TODO confirm).
a = np.load("/mnt/data/NO-OVERLAP/AIA/test/2023-08-08T03:09:00.npy")

In [None]:
# Same plane-4 mean as before; when the cells are run in order `a` now holds
# the 2023-08-08 array, so this is the comparison value.
a[4].mean()

In [None]:
import glob
import numpy as np
import multiprocessing as mp
from tqdm import tqdm

# AIA wavelengths, listed in the order of their plane index in each array
wavelengths = [94, 131, 171, 193, 211, 304]
# Lookup table: wavelength -> plane index
wavelength_indices = dict(zip(wavelengths, range(len(wavelengths))))

# All .npy files directly inside the test directory, in lexicographic order
test_data_dir = "/mnt/data/NO-OVERLAP/AIA/test/"
npy_files = glob.glob(test_data_dir + "*.npy")
npy_files.sort()

def compute_means(file_path):
    """Return the per-wavelength mean of one saved array.

    The file is read with ``np.load``; for every entry of
    ``wavelength_indices`` the plane at that index is averaged with
    ``ndarray.mean``.

    Returns a dict mapping wavelength to the mean of its plane.
    """
    planes = np.load(file_path)
    # Plane order per wavelength_indices: 0=94, 1=131, 2=171, 3=193, 4=211, 5=304
    return {
        wavelength: planes[plane_idx].mean()
        for wavelength, plane_idx in wavelength_indices.items()
    }

def find_outlier_indices(means_matrix, center, spread, n_sigma):
    """Return the row indices of ``means_matrix`` that deviate from ``center``.

    A row is flagged when, in at least one column, its value lies more than
    ``n_sigma * spread`` away from ``center``. NaN entries never compare as
    greater and therefore never flag a row on their own.

    Parameters
    ----------
    means_matrix : 2-D array, one row per file and one column per wavelength
    center, spread : 1-D arrays with one entry per column
    n_sigma : float, threshold in units of ``spread``

    Returns
    -------
    list of int
    """
    deviates = np.abs(means_matrix - center) > n_sigma * spread
    return np.flatnonzero(deviates.any(axis=1)).tolist()


# Use tqdm to monitor progress of multiprocessing.
# NOTE(review): compute_means is defined in the notebook itself; worker
# processes can only see it with a fork-style start method -- confirm on the
# target platform.
with mp.Pool(mp.cpu_count()) as pool:
    all_means = list(tqdm(pool.imap(compute_means, npy_files), total=len(npy_files), desc="Processing files"))

# Convert to array: shape (num_files, num_wavelengths). The reshape keeps the
# matrix two-dimensional even when no files were found.
means_matrix = np.array(
    [[means[w] for w in wavelengths] for means in all_means]
).reshape(-1, len(wavelengths))

mean_per_wavelength = np.nanmean(means_matrix, axis=0)
std_per_wavelength = np.nanstd(means_matrix, axis=0)

# First pass: flag anything beyond 2 standard deviations in any wavelength.
outlier_indices = find_outlier_indices(
    means_matrix, mean_per_wavelength, std_per_wavelength, n_sigma=2
)

print(f"Total images: {len(npy_files)}")
print(f"Outlier images: {len(outlier_indices)}")
print("Outlier files and their means (by wavelength):")
for idx in outlier_indices:
    print(f"{npy_files[idx]}: ", end="")
    print({w: means_matrix[idx][i] for i, w in enumerate(wavelengths)})




In [None]:
# Recompute the outlier list with a stricter threshold: a file is flagged when
# any of its per-wavelength means lies more than 5 standard deviations from
# the mean over all files.
threshold = 5 * std_per_wavelength
outlier_indices = [
    row_idx
    for row_idx, row in enumerate(means_matrix)
    if np.any(np.abs(row - mean_per_wavelength) > threshold)
]

print(f"Total images: {len(npy_files)}")
print(f"Outlier images: {len(outlier_indices)}")
print("Outlier files and their means (by wavelength):")
for row_idx in outlier_indices:
    per_wavelength = dict(zip(wavelengths, means_matrix[row_idx]))
    print(f"{npy_files[row_idx]}: ", end="")
    print(per_wavelength)

In [None]:
# Assuming you have a DataFrame named `sxr` with a column 'timestamp' or similar that matches the file timestamps

# Extract timestamps from outlier file names
import os
import pandas as pd

# Helper to extract the timestamp string from the path of an .npy file.
def extract_timestamp_from_filename(filename):
    """Return the timestamp part of an AIA ``.npy`` file name.

    The directory and the ``.npy`` extension are dropped, as is a leading
    ``AIA_`` prefix when present. Only the real prefix/extension are removed;
    the same substrings elsewhere in the name are left alone.

    Examples
    --------
    ``.../2024-11-08T03:09:00.npy`` -> ``2024-11-08T03:09:00``
    ``.../AIA_20250101_123456.npy`` -> ``20250101_123456``

    Parameters
    ----------
    filename : str
        Path or bare file name.

    Returns
    -------
    str
    """
    base = os.path.basename(filename)
    stem, ext = os.path.splitext(base)
    if ext != ".npy":
        # Not an .npy name: keep whatever followed the last dot.
        stem = base
    prefix = "AIA_"
    if stem.startswith(prefix):
        stem = stem[len(prefix):]
    return stem

# Get outlier timestamps (as strings taken from the file names)
outlier_timestamps = [extract_timestamp_from_filename(npy_files[idx]) for idx in outlier_indices]

# Remove outliers from sxr DataFrame.
# Both sides are parsed to datetimes before matching: a raw string comparison
# silently matches nothing when the CSV writes timestamps differently from
# the file names (e.g. a space instead of "T", or no seconds).
# NOTE(review): assumes both sources are timezone-naive -- confirm.
# If your timestamp column is named differently, change 'timestamp' below
outlier_times = pd.to_datetime(outlier_timestamps)
sxr_times = pd.to_datetime(sxr['timestamp'])
sxr_clean = sxr[~sxr_times.isin(outlier_times)].copy()

# Optionally, overwrite sxr with the cleaned version
# sxr = sxr_clean

print(f"Removed {len(sxr) - len(sxr_clean)} outlier rows from sxr DataFrame.")


In [None]:
# Display the DataFrame produced by the outlier-removal cell.
sxr_clean

In [None]:
import matplotlib.pyplot as plt

# Log-log scatter of predictions against ground truth for the cleaned data,
# with a dashed diagonal spanning the ground-truth range as the 1:1 reference.
gt_lo = sxr_clean['groundtruth'].min()
gt_hi = sxr_clean['groundtruth'].max()
diagonal = [gt_lo, gt_hi]

scatter_fig, scatter_ax = plt.subplots(figsize=(6, 6))
scatter_ax.scatter(sxr_clean['groundtruth'], sxr_clean['predictions'], alpha=0.5, s=5)
scatter_ax.plot(diagonal, diagonal, color='red', linestyle='--', label='1:1 Line')

scatter_ax.set_xlabel('Ground Truth')
scatter_ax.set_ylabel('Predictions')
scatter_ax.set_title('1:1 Plot of Predictions vs Ground Truth (Clean Data)')
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')
scatter_ax.legend()
scatter_ax.grid(True)

scatter_fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.colors import LogNorm

def plot_regression_comparison_sxr(sxr_clean, save_path=None):
    """
    Plot predicted against ground-truth SXR flux as a log-log 2D histogram.

    The figure contains a 1:1 "perfect prediction" line, a log-binned 2D
    histogram of (ground truth, prediction) pairs, one shaded band per flare
    class showing the mean absolute error measured in log10 space, secondary
    top/right axes labelled with flare-class thresholds, and a colour bar
    with the bin counts.

    Parameters
    ----------
    sxr_clean : pandas.DataFrame
        Must provide numeric 'groundtruth' and 'predictions' columns. Rows in
        which either value is non-finite or <= 0 are ignored because they
        cannot be placed on logarithmic axes.
    save_path : str or None, optional
        When given, the figure is written to this path; otherwise it is
        shown with ``plt.show()``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no row has finite, positive values in both columns.
    """
    # Family requested for every text element of the figure.
    font_family = 'Barlow'

    # Optionally set up Barlow font if available. Deliberately best effort:
    # if the font file cannot be read the rcParams are left untouched.
    try:
        import matplotlib.font_manager as fm
        barlow_font = fm.FontProperties(fname="/usr/share/fonts/truetype/barlow/Barlow-Regular.ttf")
        plt.rcParams['font.family'] = barlow_font.get_name()
    except Exception:
        pass

    # Flux (W/m^2) at which each GOES class label starts. One tick per
    # threshold: the previous table put two differently-labelled ticks on
    # every shared class boundary.
    flare_thresholds = {
        'A1.0': 1e-8,
        'B1.0': 1e-7,
        'C1.0': 1e-6,
        'M1.0': 1e-5,
        'X1.0': 1e-4,
        'X10.0': 1e-3,
    }

    # Ground-truth flux range over which the MAE is evaluated per class.
    mae_class_ranges = {
        'A': (1e-8, 1e-7),
        'B': (1e-7, 1e-6),
        'C': (1e-6, 1e-5),
        'M': (1e-5, 1e-4),
        'X': (1e-4, 1e-2),
    }
    mae_band_color = "#FFAAA5"

    # Keep only pairs that can be drawn on log axes.
    y_true = np.asarray(sxr_clean['groundtruth'].values, dtype=float)
    y_pred = np.asarray(sxr_clean['predictions'].values, dtype=float)
    valid = np.isfinite(y_true) & np.isfinite(y_pred) & (y_true > 0) & (y_pred > 0)
    y_true = y_true[valid]
    y_pred = y_pred[valid]
    if y_true.size == 0:
        raise ValueError(
            "no rows with finite, positive 'groundtruth' and 'predictions' to plot"
        )

    def add_flare_class_axes(ax, min_val, max_val):
        """Helper function to add flare class secondary axes"""
        ax_top = ax.twiny()
        ax_top.set_xlim(ax.get_xlim())
        ax_top.set_xscale('log')
        ax_top.patch.set_alpha(0.0)

        ax_right = ax.twinx()
        ax_right.set_ylim(ax.get_ylim())
        ax_right.set_yscale('log')
        ax_right.patch.set_alpha(0.0)

        # Only thresholds inside the plotted data range get a tick.
        visible = [
            (flux, label)
            for label, flux in flare_thresholds.items()
            if min_val <= flux <= max_val
        ]
        if not visible:
            return
        flare_positions = [flux for flux, _ in visible]
        flare_labels = [label for _, label in visible]

        ax_top.set_xticks(flare_positions)
        ax_top.set_xticklabels(flare_labels, fontsize=12, color='black', fontfamily=font_family)
        ax_top.tick_params(colors='black')
        ax_top.xaxis.set_minor_locator(mticker.LogLocator(base=10, subs='auto', numticks=100))
        ax_top.tick_params(which='minor', colors='black')

        ax_right.set_yticks(flare_positions)
        ax_right.set_yticklabels(flare_labels, fontsize=12, color='black', fontfamily=font_family)
        ax_right.tick_params(colors='black')
        ax_right.yaxis.set_minor_locator(mticker.LogLocator(base=10, subs='auto', numticks=100))
        ax_right.tick_params(which='minor', colors='black')

    def draw_mae_contours(plot_ax):
        """Shade, per flare class, the band within one log10-MAE of the 1:1 line"""
        legend_done = False
        for min_flux, max_flux in mae_class_ranges.values():
            in_class = (y_true >= min_flux) & (y_true < max_flux)
            if not np.any(in_class):
                continue
            # Calculate MAE in log space (base 10)
            log_mae = np.mean(np.abs(np.log10(y_true[in_class]) - np.log10(y_pred[in_class])))
            x_class = np.logspace(np.log10(min_flux), np.log10(max_flux), 100)
            # The error is in decades, so convert back with 10**, not exp().
            factor = 10.0 ** log_mae
            band_kwargs = {'alpha': 0.75, 'color': mae_band_color}
            if not legend_done:
                # Label the first band drawn so the legend entry exists even
                # when there are no X-class samples.
                band_kwargs['label'] = 'MAE'
                legend_done = True
            plot_ax.fill_between(x_class, x_class / factor, x_class * factor, **band_kwargs)

    min_val = min(np.min(y_true), np.min(y_pred))
    max_val = max(np.max(y_true), np.max(y_pred))
    log_bins = np.logspace(np.log10(min_val), np.log10(max_val), 100)
    shared_norm = LogNorm(vmin=1, vmax=None)

    fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
    fig.patch.set_alpha(0.0)

    # 1:1 line
    ax1.plot([min_val, max_val], [min_val, max_val],
             label='Perfect Prediction', color='#A00503', linestyle='-', linewidth=1, zorder=5)

    # 2D histogram
    h1 = ax1.hist2d(y_true, y_pred, bins=[log_bins, log_bins],
                    cmap='inferno', norm=shared_norm, alpha=1)

    # Draw MAE contours
    draw_mae_contours(ax1)

    ax1.set_facecolor('#FFEEE6')
    ax1.patch.set_alpha(1.0)

    ax1.set_xlabel(r'Ground Truth Flux (W/m$^{2}$)', fontsize=14, color='black', fontfamily=font_family)
    ax1.set_ylabel(r'Predicted Flux (W/m$^{2}$)', fontsize=14, color='black', fontfamily=font_family)
    ax1.tick_params(labelsize=12, colors='black')

    for label in ax1.get_xticklabels():
        label.set_fontfamily(font_family)
    for label in ax1.get_yticklabels():
        label.set_fontfamily(font_family)

    legend = ax1.legend(loc='upper left', frameon=True, fancybox=True, shadow=True,
                        prop={'family': font_family, 'size': 12})
    legend.get_frame().set_facecolor('#FFEEE6')
    legend.get_frame().set_alpha(0.9)
    for text in legend.get_texts():
        text.set_color('black')
        text.set_fontsize(12)
        text.set_fontfamily(font_family)

    ax1.set_axisbelow(True)
    ax1.grid(True, alpha=0.3, color='black', linestyle='-', linewidth=0.5)
    ax1.set_xscale('log')
    ax1.set_yscale('log')

    ax1.xaxis.set_minor_locator(mticker.LogLocator(base=10, subs='auto', numticks=100))
    ax1.yaxis.set_minor_locator(mticker.LogLocator(base=10, subs='auto', numticks=100))
    ax1.tick_params(which='minor', colors='black')
    ax1.grid(True, which='minor', alpha=0.15, linewidth=0.25, linestyle='--', color='black')

    # Added after the main axes are log-scaled so the twins copy final limits.
    add_flare_class_axes(ax1, min_val, max_val)

    # h1[3] is the image returned by hist2d; pad leaves room for the
    # right-hand flare-class labels.
    cbar = fig.colorbar(h1[3], ax=ax1, orientation='vertical', pad=.1)
    cbar.ax.yaxis.set_tick_params(labelsize=12, colors='black')
    cbar.set_label("Count", fontsize=14, color='black', fontfamily=font_family)
    cbar.ax.tick_params(colors='black')
    cbar.ax.yaxis.set_minor_locator(mticker.LogLocator(base=10, subs='auto', numticks=100))
    cbar.ax.tick_params(which='minor', colors='black')
    cbar.ax.set_facecolor('#1a1a3a')
    cbar.ax.patch.set_alpha(1.0)
    for label in cbar.ax.get_yticklabels():
        label.set_fontfamily(font_family)

    if save_path is not None:
        # Actually write the file before reporting it as saved.
        fig.savefig(save_path, dpi=500, bbox_inches='tight', facecolor='none')
        print(f"Saved regression comparison plot to {save_path}")
    else:
        plt.show()

# Example usage: draw the comparison for the outlier-filtered frame. No
# save_path is passed, so the figure is shown rather than written to disk.
plot_regression_comparison_sxr(sxr_clean)

In [None]:
import os
import shutil

# Source: the AIA test directory. Destination: folder collecting rejected files.
# NOTE(review): the destination is named "SXR" although the source holds AIA
# arrays -- confirm this is the intended target.
src_dir = "/mnt/data/NO-OVERLAP/AIA/test"
dst_dir = "/mnt/data/NO-OVERLAP/AIA/bad_data/SXR"

# Create the destination up front; harmless when it already exists
os.makedirs(dst_dir, exist_ok=True)

# Relocate every entry of src_dir whose name begins with the date 2024-11-08
for fname in os.listdir(src_dir):
    if not fname.startswith("2024-11-08"):
        continue
    src_path, dst_path = (os.path.join(folder, fname) for folder in (src_dir, dst_dir))
    print(f"Moving {src_path} -> {dst_path}")
    shutil.move(src_path, dst_path)
