In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the daily SHARP metadata dump into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it. Consider a configurable data directory.
data = pd.read_csv("D:/GitHub/solar-forecasting/data/sharp_metadata_dump_daily.csv")

In [None]:
# Preview the first rows of the raw SHARP table.
data.head()

In [None]:
# Drop exact duplicate rows (repeated writes of the same record), then print
# the remaining (rows, columns) shape.
data = data.drop_duplicates()
print(data.shape)

In [None]:
# Show column dtypes and non-null counts.
data.info()

The majority of `NOAA_ARS` values are `MISSING`, so the column is dropped.

In [None]:
# NOAA_ARS contains info about related or overlapping active regions and is often not populated (MISSING).
# The gaps are too extensive to repair, so the column is dropped in the following cell.
# Count each distinct NOAA_ARS value to show how dominant 'MISSING' is.
noaa_ars_counts = data['NOAA_ARS'].value_counts()

print(noaa_ars_counts)

In [None]:
# Drop the NOAA_ARS column.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column has already been
# removed no longer raises KeyError.
data = data.drop(columns='NOAA_ARS', errors='ignore')

In [None]:
# Count true nulls per column. Placeholder strings such as 'MISSING' are still
# ordinary strings at this point, so they are not counted here.
data.isnull().sum()

In [None]:
# Turn the placeholder strings 'MISSING' and 'NaN' into real missing values
# (np.nan), then re-count nulls per column.
placeholder_tokens = ['MISSING', 'NaN']
data = data.replace(placeholder_tokens, np.nan)
data.isnull().sum()

In [None]:
# Inspect the distribution of QUALITY flag values before filtering.
# Only science-grade records are retained in the following cell; other records may be corrupted and unsuitable for training.
# Corruption can occur through instrument failure or environmental factors such as cosmic rays striking equipment, degrading the image data.
data['QUALITY'].value_counts()


In [None]:
# Retain only 'science grade' records (QUALITY == 0).
# .copy() makes quality_data an independent frame, so the column assignments
# made to it in later cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning on a slice of `data`.
quality_data = data[data['QUALITY'] == 0].copy()
quality_data.shape

In [None]:
# Parse T_REC into datetime values: strip the '_TAI' suffix from the text,
# then parse what remains with an explicit format.
t_rec_text = quality_data['T_REC'].str.replace('_TAI', '')
quality_data['T_REC'] = pd.to_datetime(t_rec_text, format='%Y.%m.%d_%H:%M:%S')

In [None]:
# Size / pixel-count fields are not suited to linear interpolation, so their
# nulls are filled with the median of the same field within the same HARPNUM.
median_features = ['SIZE_ACR', 'SIZE', 'NPIX', 'NACR']

for size_col in median_features:
    # Row-aligned median of this column for each row's HARPNUM.
    per_harp_median = quality_data.groupby('HARPNUM')[size_col].transform('median')
    current_values = quality_data[size_col]
    quality_data[size_col] = current_values.fillna(per_harp_median)

# Remaining nulls per column after the median repair.
quality_data.isnull().sum()


In [None]:
# Features repaired by linear interpolation (time-dependent magnetic field /
# flux style measurements), listed in the order used throughout the notebook.
linear_interpolation_features = [
    'TOTUSJH',
    'TOTUSJZ',
    'SAVNCPP',
    'USFLUX',
    'ABSNJZH',
    'TOTPOT',
    'MEANPOT',
    'MEANJZH',
    'SHRGT45',
    'MEANSHR',
    'MEANJZD',
    'MEANALP',
    'MEANGBT',
    'MEANGBL',
    'MEANGAM',
    'MEANGBZ',
    'MEANGBH',
]

In [None]:
# Build a copy of the quality-filtered data in which every linear-interpolation
# feature is numeric; values that cannot be parsed become NaN (errors='coerce').
quality_data_LI = quality_data.assign(**{
    feature_name: pd.to_numeric(quality_data[feature_name], errors='coerce')
    for feature_name in linear_interpolation_features
})

In [None]:
# Order rows by HARPNUM (solar active region) and, within each region, by timestamp,
# so the per-region interpolation in the following cell runs in time order.
df_sorted = quality_data_LI.sort_values(['HARPNUM', 'T_REC']).copy()

In [None]:
# Fill nulls in each linear-interpolation feature, one HARPNUM at a time.
# Interior gaps are linearly interpolated; limit_direction='both' means gaps at
# the start or end of a region's series are filled as well.
def _fill_gaps_linearly(series):
    """Return `series` with nulls filled by linear interpolation in both directions."""
    return series.interpolate(method='linear', limit_direction='both')

harp_ids = df_sorted['HARPNUM']
for flux_col in linear_interpolation_features:
    df_sorted[flux_col] = df_sorted[flux_col].groupby(harp_ids).transform(_fill_gaps_linearly)


In [None]:
# Review the null counts per column after interpolation.
df_sorted.isnull().sum()

In [None]:
# Drop the rows that still contain a null after the repairs above.
# Note: dropna() with no arguments removes a row if ANY column is null,
# not only the interpolated features.
quality_data_no_null = df_sorted.dropna()

In [None]:
# Confirm every column now has zero nulls.
quality_data_no_null.isnull().sum()

In [None]:
# (rows, columns) remaining after null removal.
quality_data_no_null.shape

In [None]:
# Preview the cleaned, sorted data.
quality_data_no_null.head()

In [None]:
# Number of distinct solar active regions (HARPNUM values) in the cleaned data.
quality_data_no_null['HARPNUM'].nunique()

In [None]:
# Split the cleaned data by HARPNUM (patch ID): each key maps to the DataFrame
# holding all rows of that patch.
grouped = quality_data_no_null.groupby('HARPNUM')

harp_dict = {patch_id: patch_rows for patch_id, patch_rows in grouped}



In [None]:
# Number of patches in the dictionary.
len(harp_dict)

In [None]:
# Sequence length and the accepted spacing between consecutive records.
# 30 records at the nominal 12-minute cadence span 6 hours; a gap is accepted
# when it lies strictly between the lower and upper bounds (12 min +/- 1 min).
sequence_length = 30
nominal_cadence = pd.Timedelta(minutes=12)
cadence_tolerance = pd.Timedelta(minutes=1)
cadence_upper = nominal_cadence + cadence_tolerance
cadence_lower = nominal_cadence - cadence_tolerance


In [None]:
# Build a sequence dictionary: HARPNUM -> list of valid sequences.
# A sequence is `sequence_length` consecutive rows whose successive T_REC gaps
# all lie strictly between cadence_lower and cadence_upper. Windows containing
# a gap outside that range are rejected.
sequence_dict = {}

for harp_ID, sample in harp_dict.items():
    ordered = sample.sort_values('T_REC').reset_index(drop=True)
    last_start = len(ordered) - sequence_length
    accepted = []

    cursor = 0
    while cursor <= last_start:
        window = ordered.iloc[cursor:cursor + sequence_length]
        gaps = window['T_REC'].diff().dropna()
        on_cadence = ((gaps > cadence_lower) & (gaps < cadence_upper)).all()

        if on_cadence:
            accepted.append(window.reset_index(drop=True))
            # Accepted windows do not overlap: resume right after this one.
            cursor += sequence_length
        else:
            # Slide forward by one row and try again.
            cursor += 1

    # Patches without any valid sequence get no entry.
    if accepted:
        sequence_dict[harp_ID] = accepted



In [None]:
# Number of patches that produced at least one valid sequence.
len(sequence_dict)

In [None]:
# Sequences per patch, and the total across all patches, to gauge data quantity.
lengths = [len(patch_sequences) for patch_sequences in sequence_dict.values()]
total_sequences = sum(lengths)



In [None]:
# Total number of valid sequences across all patches.
total_sequences

In [None]:
# Inspect the HARPNUM keys present in the sequence dictionary.
sequence_dict.keys()

In [None]:
# Display every sequence stored for HARPNUM 2.
# NOTE(review): raises KeyError if HARPNUM 2 produced no valid sequence; confirm against the data.
sequence_dict[2]

In [None]:
# For every sequence, record the timestamp and NOAA active-region number of its
# final observation. The final timestamp marks the start of the forecast window.
# iloc[-1] is used instead of a hard-coded iloc[29] so this stays correct if
# sequence_length is ever changed.
flare_sequences = []

for harpnum, sequences in sequence_dict.items():
    for i, sequence in enumerate(sequences):
        final_row = sequence.iloc[-1]
        flare_sequences.append({
            'HARPNUM': harpnum,
            'NOAA_id': final_row['NOAA_AR'],
            # Position of the sequence within this HARPNUM's list (unique per HARPNUM only).
            'Sequence_Number': i,
            'Last_Timestamp': final_row['T_REC'],
        })

# Convert to DataFrame: one row per sequence.
flare_sequences_df = pd.DataFrame(flare_sequences)
flare_sequences_df


In [None]:
# Count sequences per NOAA ID to find those without a usable NOAA ID
# (the key required to merge with the GOES flare event data).
flare_sequences_df['NOAA_id'].value_counts()

In [None]:
# Drop sequences whose NOAA_id is 0 (presumably meaning no NOAA active region was assigned to the patch).
# Without a NOAA number they cannot be joined to the GOES flare event data.
flare_sequences_df_cleaned = flare_sequences_df[flare_sequences_df['NOAA_id'] != 0]
flare_sequences_df_cleaned


In [None]:
# Write flare sequences to csv in the current working directory (the DataFrame index is written as the first column).
# The table can be used to match each sequence to a flare event (or no flare) within 24h of the last timestamp.
flare_sequences_df_cleaned.to_csv("flare_sequences.csv")


In [None]:
# Import GOES flare event data from the current working directory.
flare_events = pd.read_csv("flare_events.csv")

In [None]:
# Preview the sequence table that will be joined to the flare events.
flare_sequences_df_cleaned.head()

In [None]:
# Column dtypes and non-null counts of the sequence table.
flare_sequences_df_cleaned.info()

In [None]:
# (rows, columns) of the sequence table after removing NOAA_id == 0.
flare_sequences_df_cleaned.shape

In [None]:
# Sequences per NOAA ID after cleaning.
flare_sequences_df_cleaned['NOAA_id'].value_counts()

In [None]:
# Preview the first rows of the GOES flare event data.
flare_events.head()

In [None]:
# Parse the three GOES event time columns into datetime values.
for time_col in ('start_time', 'peak_time', 'end_time'):
    flare_events[time_col] = pd.to_datetime(flare_events[time_col])


In [None]:
# Count flare events per class letter.
flare_events['class_letter'].value_counts()

In [None]:
# Work on copies so the source frames are left untouched by the merge.
seq = flare_sequences_df_cleaned.copy()
events = flare_events.copy()

# Left-join each sequence to the selected GOES event fields on the NOAA number.
# Every sequence is paired with every event recorded for its NOAA region,
# irrespective of time, so the result holds many rows per sequence that must be
# filtered afterwards. Sequences with no event keep a single row of nulls.
# The right-hand join key duplicates NOAA_id and is dropped.
event_columns = ['noaa_active_region', 'start_time', 'class_letter', 'intensity_W/m^2']
merged = pd.merge(
    seq,
    events[event_columns],
    how='left',
    left_on='NOAA_id',
    right_on='noaa_active_region',
).drop(columns='noaa_active_region')


In [None]:
# Preview the joined sequence/event rows.
merged.head()

In [None]:
merged.shape
# Next: sequences with no flare event within 24h of their final timestamp are labelled class 'N' (no flare);
# their intensity is first set to 0.0 and later replaced by a nominal 1e-9.

In [None]:
# Flag rows whose flare starts inside the forecast window
# [Last_Timestamp, Last_Timestamp + 24h). Rows without a start_time compare
# as False and therefore count as outside the window.
window_start = merged['Last_Timestamp']
window_end = window_start + pd.Timedelta(hours=24)
flare_start = merged['start_time']
within_24h_mask = flare_start.ge(window_start) & flare_start.lt(window_end)

# Rows outside the window are relabelled as 'no flare': intensity 0.0, class "N".
outside_window = ~within_24h_mask
merged.loc[outside_window, 'intensity_W/m^2'] = 0.0
merged.loc[outside_window, 'class_letter'] = "N"

In [None]:
# start_time has served its purpose (window test) and is removed.
merged = merged.drop(columns='start_time')


In [None]:
# Preview the relabelled rows.
merged.head()

In [None]:
# Remove exact duplicate rows. After the relabelling above, every out-of-window event for a
# sequence has become the same 'no flare' row, so at most one such row remains per sequence,
# alongside any in-window flare rows. The strongest row per sequence is selected in a later cell.
merged = merged.drop_duplicates()
merged.head()

In [None]:
# Keep, for each sequence, the row with the highest flare intensity.
# A sequence is identified by (HARPNUM, Sequence_Number): Sequence_Number is
# only unique within a HARPNUM, and the later label lookup uses the same pair.
# Grouping by NOAA_id instead would merge sequences from different HARPNUMs
# that share a NOAA number, leaving some of them without a label.
strongest_row_labels = merged.groupby(['HARPNUM', 'Sequence_Number'])['intensity_W/m^2'].idxmax()
max_flares = merged.loc[strongest_row_labels]
# Restore the original row order.
max_flares = max_flares.sort_index()
max_flares.head()


In [None]:
# (rows, columns) of the per-sequence label table.
max_flares.shape

In [None]:
# 'No flare' rows carry intensity 0; give them a nominal 1e-9 so a logarithm
# can be taken afterwards.
intensity = max_flares['intensity_W/m^2']
max_flares['intensity_W/m^2'] = intensity.where(intensity != 0, 1e-9)
max_flares.head()


In [None]:
# Inspect the distribution of flare intensities.
max_flares['intensity_W/m^2'].value_counts()

In [None]:
# Add the base-10 logarithm of the intensity as a new column; this is the value
# later used as the regression target.
max_flares['log10_intensity'] = np.log10(max_flares['intensity_W/m^2'])
max_flares.head()

In [None]:
# Remove rows labelled class A (a small number of events barely above background).
# NOTE(review): the rows are dropped outright rather than relabelled as 'N'. A sequence whose
# strongest in-window flare is class A therefore has no label and is skipped when the
# training arrays are built. Confirm this is the intended treatment.
max_flares = max_flares[max_flares["class_letter"] != "A"]

In [None]:
# #Distribution is highly imbalanced with ~76% of sequences yielding no flare.

# Visualise distribution of flare intensities in dataset.
# plt.figure(figsize=(10, 5))
# sns.kdeplot(max_flares['log10_intensity'], fill=True)
# plt.xlabel('log₁₀(Flare Intensity)')
# plt.title('Smoothed Distribution of Flare Intensities')
# plt.grid(True)
# plt.show()



In [None]:
# Widen pandas' display limits so sequence frames can be inspected in full.
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.width = 1000

In [None]:
# Take the first rows of the second sequence of HARPNUM 1 as a sample for inspection.
# NOTE(review): requires HARPNUM 1 to exist with at least two sequences; otherwise
# this raises KeyError/IndexError. Confirm against the data.
a = sequence_dict[1][1].head()
a

In [None]:
# Column dtypes of the sample, to see which columns are numeric.
print(a.dtypes)

In [None]:
# Keep only the numeric columns of the sample as candidate model features
# (this drops timestamps and any other non-numeric fields).
# NOTE(review): select_dtypes(include='number') also keeps numeric ID/flag columns
# (e.g. HARPNUM, NOAA_AR, QUALITY if stored as numbers); confirm these are intended as features.

sequence_array = a.select_dtypes(include='number')
sequence_array


In [None]:
# Build the model inputs and labels as parallel lists:
#   X_list        - one (sequence_length, n_features) numpy array per sequence
#   y_list        - log10 flare intensity per sequence (regression target)
#   y_class_label - flare class letter per sequence (for stratification/evaluation)
X_list = []
y_list = []
y_class_label = []

# One pass over max_flares builds a (HARPNUM, Sequence_Number) -> label map,
# replacing a full boolean-mask scan of max_flares for every single sequence.
label_lookup = {}
for key_harp, key_seq, log_intensity, letter in zip(
    max_flares['HARPNUM'],
    max_flares['Sequence_Number'],
    max_flares['log10_intensity'],
    max_flares['class_letter'],
):
    # setdefault keeps the first row seen for a key (same choice as taking the first match).
    label_lookup.setdefault((key_harp, key_seq), (log_intensity, letter))

for harpnum, sequences in sequence_dict.items():
    for i, sequence_df in enumerate(sequences):
        label = label_lookup.get((harpnum, i))
        if label is None:
            continue  # Skip sequences that have no label

        # Select only numeric columns as features.
        # NOTE(review): numeric ID/flag columns (e.g. HARPNUM, NOAA_AR, QUALITY if
        # stored as numbers) are kept by this selection; confirm that is intended.
        sequence_array = sequence_df.select_dtypes(include='number').to_numpy()

        # Guard against malformed sequences; tied to sequence_length rather than a literal 30.
        if sequence_array.shape[0] != sequence_length:
            print(f"Sequence != {sequence_length} {harpnum}-{i}")
            continue

        X_list.append(sequence_array)
        y_list.append(label[0])
        y_class_label.append(label[1])


In [None]:
# Show how many labelled sequences fall into each flare class.
pd.Series(y_class_label).value_counts()


In [None]:
# Stratified split into roughly 70:15:15 train : validate : test, stratifying on
# the flare class letter in both steps.
# NOTE(review): the split is made at sequence level, so sequences from the same
# HARPNUM can land in different splits.
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.15
# 0.176 of the remaining 85% is roughly 15% of the full dataset.
VAL_FRACTION_OF_REMAINDER = 0.176
SPLIT_SEED = 42

# Step 1: hold out the test set.
(X_temp, X_test,
 y_temp, y_test,
 class_temp, class_test) = train_test_split(
    X_list, y_list, y_class_label,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y_class_label,
)

# Step 2: divide the remainder into train and validation sets.
(X_train, X_val,
 y_train, y_val,
 class_train, class_val) = train_test_split(
    X_temp, y_temp, class_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    random_state=SPLIT_SEED,
    stratify=class_temp,
)

# Sizes of inputs and targets for train, validation and test, in that order.
for split_part in (X_train, y_train, X_val, y_val, X_test, y_test):
    print(len(split_part))


In [None]:
# Class weights for a weighted loss function, to counter the strong class imbalance.
# Weights are inversely proportional to class frequency in the TRAINING set only
# and are normalised to sum to 1.

from collections import Counter

# Define class list explicitly to maintain consistent order
flare_classes = ['N','B', 'C', 'M', 'X']

# Count occurrences in training set only
class_counts = Counter(class_train)

# Total number of training examples
total = sum(class_counts.values())

# Inverse-frequency weights. A class that does not occur in the training split
# gets weight 0.0 instead of raising ZeroDivisionError (Counter returns 0 for it).
class_weights = {
    cls: (total / class_counts[cls]) if class_counts[cls] else 0.0
    for cls in flare_classes
}

# Normalize so sum(weights) = 1 (skipped when every weight is 0, e.g. empty training set)
norm = sum(class_weights.values())
if norm > 0:
    class_weights = {cls: w / norm for cls, w in class_weights.items()}

# Display weights
for cls in flare_classes:
    print(f"{cls}: weight = {class_weights[cls]:.4f}")


In [None]:
# Min-max scale the features ahead of model training.
# The scaler is fitted on the training sequences only (all time steps stacked
# into one 2-D array) and then applied sequence by sequence.
# NOTE(review): the fitted scaler is not written to disk in this cell; persist it
# separately if the live data pipeline needs to reuse it.

from sklearn.preprocessing import MinMaxScaler

X_train_stacked = np.concatenate(X_train, axis=0)

scaler = MinMaxScaler().fit(X_train_stacked)

# Scale every training sequence with the fitted scaler.
X_train_scaled = list(map(scaler.transform, X_train))

# Sanity check: show the first scaled training sequence.
X_train_scaled[0]

In [None]:
# Scale the validation and test sets with the scaler that was fitted on the training set
# (transform only, no refitting), so val/test statistics do not influence the scaling.

X_val_scaled = [scaler.transform(seq) for seq in X_val]

X_test_scaled = [scaler.transform(seq) for seq in X_test]

In [None]:
# Initialise the tensors used for LSTM training.

import torch
import numpy as np


def _to_float32_tensor(values):
    """Return `values` as a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Stack each list of per-sequence arrays into one 3-D array.
X_train_array, X_val_array, X_test_array = (
    np.array(scaled_split)
    for scaled_split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Inputs as float32 tensors.
X_train_tensor, X_val_tensor, X_test_tensor = map(
    _to_float32_tensor, (X_train_array, X_val_array, X_test_array)
)

# Regression targets as float32 tensors.
y_train_tensor, y_val_tensor, y_test_tensor = map(
    _to_float32_tensor, (y_train, y_val, y_test)
)


In [None]:
# Bundle the tensors into one .pt file for the model training notebook.
# NOTE(review): the output path is absolute and machine-specific.

# Per-sample loss weights for the training set: each sample takes its class's weight.
per_sample_weight = [class_weights[cls] for cls in class_train]
sample_weights = torch.tensor(per_sample_weight, dtype=torch.float32)

dataset_bundle = {
    'X_train': X_train_tensor,
    'y_train': y_train_tensor,
    'X_val': X_val_tensor,
    'y_val': y_val_tensor,
    'X_test': X_test_tensor,
    'y_test': y_test_tensor,
    'sample_weights': sample_weights,
}
torch.save(dataset_bundle, 'D:/GitHub/solar-forecasting/data/preprocessed_data_v2.pt')
