In [None]:
#Import libraries
import csv
import os
from datetime import datetime, timedelta
from os.path import exists

import drms
import joblib
import numpy as np
import pandas as pd
import torch
from astropy.time import Time
from drms import DrmsQueryError
from sklearn.preprocessing import MinMaxScaler



In [None]:
#Initisalise drms client and display available data series
c = drms.Client()
c.series(r'hmi\.sharp_')

# Set a series
si = c.info('hmi.sharp_720s')

In [None]:
#Initialise SHARP metadata features.
fields = [
    "T_REC", 
    "HARPNUM", 
    "NOAA_NUM", 
    "NOAA_ARS", 
    "NOAA_AR", 
    "QUALITY", 
    "TOTUSJH", 
    "TOTUSJZ", 
    "SAVNCPP", 
    "USFLUX", 
    "ABSNJZH", 
    "TOTPOT",
    "SIZE_ACR", 
    "NACR", 
    "MEANPOT", 
    "SIZE", 
    "MEANJZH", 
    "SHRGT45", 
    "MEANSHR",
    "MEANJZD", 
    "MEANALP", 
    "MEANGBT", 
    "MEANGBL", 
    "MEANGAM", 
    "MEANGBZ", 
    "MEANGBH", 
    "NPIX"
]

query_string = ",".join(fields)

In [None]:
#Define log and error files for download

log_file = "D:/GitHub/solar-forecasting/logs/query_pipeline_log.csv"
error_file = "D:/GitHub/solar-forecasting/logs/query_pipeline_error.csv"

def log_success(log_file, t1_str, t2_str, n_rows):
    write_header = not exists(log_file)
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['start_time', 'end_time', 'rows_written'])
        writer.writerow([t1_str, t2_str, n_rows])

def log_error(error_file, t1_str, t2_str, error):
    write_header = not exists(error_file)
    with open(error_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['start_time', 'end_time', 'error_message'])
        writer.writerow([t1_str, t2_str, str(error)])

In [None]:
#Download data to csv, searching iteratively for the most recent available day's data.

time_diff = timedelta(minutes=1440)

first_write = True
end = Time.now()
start = end - time_diff
while True:
    
    print(f"start = {start}")
    print(f"end = {end}")

    print(f"Downloading data")
    t_2_str = end.strftime("%Y.%m.%d_%H:%M:%S_TAI") 
    t_1_str = start.strftime("%Y.%m.%d_%H:%M:%S_TAI") 
    #t_1_str = "2025.07.03_11:12:00_TAI"

    print(t_2_str)
    print(t_1_str)
    try:
        extract = c.query(f'hmi.sharp_720s[1-13459][{t_1_str}-{t_2_str}]', key=query_string)

        if not extract.empty:
            extract.to_csv("D:/GitHub/solar-forecasting/data/sharp_metadata_pipeline_2.csv", mode='a', index=False, header=first_write)
            print(f"Wrote {len(extract)} rows for {t_1_str} - {t_2_str}")
            log_success(log_file, start, end, len(extract))
            if first_write:
                first_write = False
                start = start - time_diff
                end = end - time_diff
                continue
            else:
                break
        else:
            print(f"No records available for {t_1_str} - {t_2_str}")
            log_success(log_file, start, end, 0)
            print("retry")
            start = start - time_diff
            end = end - time_diff

    except (DrmsQueryError, TimeoutError) as e:
        print(f"JSOC query failed for {t_1_str}-{t_2_str}: {e}")
        log_error(error_file, start, end, e)
        print("retry")
        start = start - time_diff
        end = end - time_diff


In [None]:
# Load the accumulated SHARP metadata CSV that the download step appends to.
data = pd.read_csv("D:/GitHub/solar-forecasting/data/sharp_metadata_pipeline_2.csv")

In [None]:
# Remove fully identical rows; the CSV is written in append mode, so repeated
# downloads of the same window can leave duplicates behind.
data = data.drop_duplicates()

In [None]:
# (rows, columns) after de-duplication.
data.shape

The majority of NOAA_ARS values are MISSING, so the column is dropped.

In [None]:
# Count how often each distinct NOAA_ARS value occurs.
noaa_ars_counts = data['NOAA_ARS'].value_counts()

print(noaa_ars_counts)

In [None]:
# Drop the NOAA_ARS column. The frame is reassigned instead of mutated in place,
# and errors='ignore' keeps the cell safe to re-run after the column is gone.
data = data.drop(columns='NOAA_ARS', errors='ignore')

In [None]:
# Number of null values per column.
data.isnull().sum()

In [None]:
# Turn the textual placeholders 'MISSING' and 'NaN' into real missing values so
# that isnull() counts them. numpy is already imported in the first cell.
data = data.replace(['MISSING', 'NaN'], np.nan)
data.isnull().sum()

In [None]:
# Show how many records carry each QUALITY value.
# NOTE(review): the original comment said only good-QUALITY data should be
# retained, but the QUALITY filter further down is commented out - confirm intent.
data['QUALITY'].value_counts()


In [None]:
# The QUALITY filter is currently disabled, so every record is kept. The
# expression that was commented out here, for reference:
#   [(data['QUALITY'] == 0)]  # | (data['QUALITY'] == 65536)]
# An explicit copy is taken so later column assignments on quality_data do not
# also modify `data`, and re-running from this cell restores the original frame.
quality_data = data.copy()
quality_data.shape

In [None]:
# Number of null values per column in quality_data.
quality_data.isnull().sum()

In [None]:
# Parse T_REC strings of the form 'YYYY.MM.DD_HH:MM:SS_TAI' into datetimes.
# Skipped when the column is already datetime so the cell can be re-run; the
# .str accessor would otherwise raise on a datetime column.
if not pd.api.types.is_datetime64_any_dtype(quality_data['T_REC']):
    quality_data['T_REC'] = pd.to_datetime(
        quality_data['T_REC'].str.replace('_TAI', '', regex=False),
        format='%Y.%m.%d_%H:%M:%S',
    )

In [None]:


# Set pandas display options to show all rows (for large columns).
# NOTE: set_option changes are global and stay in effect for every later cell.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns (if needed)

# Print the whole T_REC column (replace 'T_REC' with the column you want to print).
print(quality_data['T_REC'])

In [None]:
#For Null values that do not suit linear interpolation, repair using the median value of the same HARPNUM.
median_features = ['SIZE_ACR', 'SIZE', 'NPIX', 'NACR']

for feature in median_features:
    # Coerce to numeric first, as is done for the interpolation features: a column
    # read as text cannot be aggregated with a median. Numeric columns are unchanged.
    quality_data[feature] = pd.to_numeric(quality_data[feature], errors='coerce')
    medians = quality_data.groupby('HARPNUM')[feature].transform('median')
    quality_data[feature] = quality_data[feature].fillna(medians)

quality_data.isnull().sum()


In [None]:
#The remaining features are repaired with linear interpolation, which is logical for time dependent magnetic flux measurements.
linear_interpolation_features = (
    "TOTUSJH TOTUSJZ SAVNCPP USFLUX ABSNJZH TOTPOT "
    "MEANPOT MEANJZH SHRGT45 MEANSHR MEANJZD MEANALP "
    "MEANGBT MEANGBL MEANGAM MEANGBZ MEANGBH"
).split()

In [None]:
# Build a working copy in which every interpolation feature is numeric;
# entries that cannot be parsed as numbers become NaN.
numeric_columns = {
    name: pd.to_numeric(quality_data[name], errors='coerce')
    for name in linear_interpolation_features
}
quality_data_LI = quality_data.assign(**numeric_columns)

In [None]:
# Order records by HARPNUM and then by T_REC, so each HARP's rows are
# chronological before the per-HARP interpolation.
df_sorted = quality_data_LI.sort_values(['HARPNUM', 'T_REC']).copy()

In [None]:
def _interpolate_gaps(values):
    """Linearly interpolate the missing entries of one series, filling in both directions."""
    return values.interpolate(method='linear', limit_direction='both')


# Repair one feature at a time, grouped by HARPNUM so values are never
# interpolated across different HARPs.
for feature in linear_interpolation_features:
    per_harp = df_sorted.groupby('HARPNUM')[feature]
    df_sorted[feature] = per_harp.transform(_interpolate_gaps)


In [None]:
# Null values still present per column after interpolation.
df_sorted.isnull().sum()

In [None]:
#Drop small number of remaining NULLs which could not be repaired using linear interpolation.
#quality_data_no_null = df_sorted.dropna()

In [None]:
#quality_data_no_null.isnull().sum()

In [None]:
# (rows, columns) of quality_data.
quality_data.shape

In [None]:
# Preview the first rows of quality_data.
quality_data.head()

In [None]:
# Number of distinct HARPNUM values in quality_data.
len(quality_data['HARPNUM'].unique())

In [None]:
#Segment data into a dictionary with HARPNUM (patch ID) as key and that patch's records (a DataFrame) as value.
# df_sorted is grouped rather than quality_data: df_sorted is the frame holding
# the numeric-coerced, linearly interpolated features, which were otherwise
# computed but never used by the sequence-building steps.
grouped = df_sorted.groupby('HARPNUM')

harp_dict = {harpnum: group for harpnum, group in grouped}



In [None]:
# Number of HARPs in harp_dict.
len(harp_dict.keys())

In [None]:
# Each sequence is 30 consecutive records: 6 hours of 12-minute cadence.
sequence_length = 30
# Bounds used to decide whether consecutive records are one cadence step apart.
cadence_lower, cadence_upper = pd.Timedelta('11min'), pd.Timedelta('13min')


In [None]:
# For every HARP, walk its time-ordered records and collect non-overlapping
# windows of sequence_length rows whose consecutive time steps all lie strictly
# between cadence_lower and cadence_upper. HARPs without any such window are
# left out of sequence_dict.
sequence_dict = {}

for harp_ID, sample in harp_dict.items():
    sample = sample.sort_values('T_REC').reset_index(drop=True)
    last_start = len(sample) - sequence_length

    valid_sequences = []
    start_idx = 0
    while start_idx <= last_start:
        seq = sample.iloc[start_idx : start_idx + sequence_length]
        time_deltas = seq['T_REC'].diff().dropna()
        on_cadence = (time_deltas > cadence_lower) & (time_deltas < cadence_upper)

        if on_cadence.all():
            valid_sequences.append(seq.reset_index(drop=True))
            # Continue right after the accepted window so windows never overlap.
            start_idx += sequence_length
        else:
            # Slide forward by a single record and try again.
            start_idx += 1

    if valid_sequences:
        sequence_dict[harp_ID] = valid_sequences



In [None]:
# Number of HARPs that produced at least one valid sequence.
len(sequence_dict.keys())

In [None]:
# HARPNUMs that produced at least one valid sequence.
print(sequence_dict.keys())

In [None]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)

# Inspect the column dtypes of one example sequence. The example is taken from
# whatever HARPs are present: the data follows the current time, so a fixed
# HARPNUM / sequence index raises KeyError or IndexError once that patch is gone.
if sequence_dict:
    example_harpnum = next(iter(sequence_dict))
    a = sequence_dict[example_harpnum][0].head()
    print(a.dtypes)
else:
    print("No valid sequences were extracted; nothing to inspect.")

In [None]:
# Record the end time (T_REC of the final row) of every extracted sequence.
new_rows = [
    {
        "Harpnum": harpnum,
        # iloc[-1] instead of a fixed position 29, so the end time stays correct
        # if sequence_length is ever changed.
        "T_REC": sequence["T_REC"].iloc[-1],
        "seq_number": index,
    }
    for harpnum, sequences in sequence_dict.items()
    for index, sequence in enumerate(sequences)
]

# One row per sequence: owning HARP, end time, and position in that HARP's list.
sequence_end_time = pd.DataFrame(new_rows, columns=["Harpnum", "T_REC", "seq_number"])

# Ensure 'T_REC' is in datetime format
sequence_end_time['T_REC'] = pd.to_datetime(sequence_end_time['T_REC'])

if sequence_end_time.empty:
    print("Warning: no valid sequences were found, so recent_sequence_dict is empty.")

# Latest end time over all sequences, and the sequences that end exactly then.
max_end_time = sequence_end_time['T_REC'].max()
max_end_time_records = sequence_end_time[sequence_end_time['T_REC'] == max_end_time]

# Keep, per HARP, the sequence ending at the latest end time; HARPs without one
# are reported and skipped.
recent_sequence_dict = {}
for harpnum, sequences in sequence_dict.items():
    matches = max_end_time_records.loc[
        max_end_time_records['Harpnum'] == harpnum, 'seq_number'
    ]
    if matches.empty:
        print(f"Warning: {harpnum} not found in max_end_time_records.")
        continue
    recent_sequence_dict[harpnum] = sequences[int(matches.iloc[0])]

# Print the resulting dictionary with the most recent sequences
#print(recent_sequence_dict[13424])


In [None]:
# Convert each most-recent sequence into a 2D array of feature values,
# leaving out the identifier, timestamp and quality columns.
non_feature_columns = ['T_REC', 'HARPNUM', 'NOAA_NUM', 'NOAA_AR', 'QUALITY']

X_list = []
for harpnum, sequence in recent_sequence_dict.items():
    sequence_array = sequence.drop(columns=non_feature_columns).to_numpy()
    # Checked against sequence_length rather than a hard-coded 30 so the guard
    # follows the configured window size.
    if sequence_array.shape[0] != sequence_length:
        print(f"Sequence != {sequence_length} {harpnum}")
        continue  # skip sequences that don't match the expected length

    X_list.append(sequence_array)

print(len(X_list))


In [None]:
# HARPNUMs that have a sequence ending at the latest end time.
print(recent_sequence_dict.keys())

In [None]:
# Latest sequence end time found across all HARPs.
print(max_end_time)

In [None]:
# Display the unscaled feature arrays (one per retained sequence).
X_list

In [None]:
# Load the pre-fitted scaler (from training data).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load a scaler.pkl from a trusted source.
scaler = joblib.load('D:/GitHub/solar-forecasting/data/scaler.pkl')

# Transform every sequence with the loaded scaler (transform only, never re-fit)
# and stack the results into a single array.
X_real_time_scaled = np.array([scaler.transform(seq) for seq in X_list])

# Print the number of values in one row of the first scaled sequence. Guarded,
# because indexing an empty batch would raise IndexError.
if len(X_real_time_scaled) > 0:
    print(len(X_real_time_scaled[0][1]))
else:
    print("No sequences available to scale.")



In [None]:
# Display the scaled sequences.
X_real_time_scaled

In [None]:
# Initialise the input tensor.
# NOTE(review): the original comment said "for use in STM training" - presumably
# the LSTM model; confirm.

# X_real_time_scaled is already a numpy array, so np.asarray reuses it instead
# of making another copy.
X_array = np.asarray(X_real_time_scaled)

# Convert to a float32 PyTorch tensor
X_tensor = torch.tensor(X_array, dtype=torch.float32)


In [None]:
# Persist the input tensor under the key 'X_tensor'.
pipeline_payload = {'X_tensor': X_tensor}
torch.save(pipeline_payload, 'D:/GitHub/solar-forecasting/data/data_pipeline.pt')