# Parsing the features being used in the GNN

## IMF

In [1]:
import pandas as pd
import glob
import numpy as np
from datetime import datetime, timedelta

In [None]:

# Folder containing the raw OMNI .lst exports
path = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/imf_data/raw"

# Collect every .lst file. Sorted, because glob returns files in arbitrary
# (filesystem-dependent) order, which would make the row order of the
# concatenated table differ between runs and machines.
lst_files = sorted(glob.glob(f"{path}/*.lst"))

print(len(lst_files), "files found.")

# Fixed-width layout of the .lst files: one name and one width per column
col_names = [
    "Year", "Day", "Hour", "Minute",
    "Bmag_avg", "BX", "BY_GSE", "BZ_GSE",
    "BY_GSM", "BZ_GSM", "RMS_B_scalar", "RMS_B_vector",
    "Speed", "Vx", "Vy", "Vz",
    "Proton_density", "Proton_temp",
    "AE_index", "AL_index", "AU_index"
]

col_widths = [4, 4, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 9, 6, 6, 6]

# Sentinel ("fill") values that mark missing samples; replaced by NaN in
# every column.
# NOTE(review): the list has no 999.99 (would fit the 7-wide density column)
# and no 99999 (would fit the 6-wide index columns) — confirm against the
# format description of these files whether those sentinels also occur.
# NOTE(review): because the replacement is applied to all columns, a genuine
# measurement equal to one of these numbers (e.g. a temperature of 9999)
# is also turned into NaN — confirm this is acceptable.
fill_values = [
    9999, 9999.0,
    9999.9, 9999.99, 99999.9,
    99999.99, 999999.9, 9999999, 9999999.0
]

# Raw column name -> output column name
rename_map = {
    "BX": "bx_gse_gsm",
    "BY_GSE": "by_gse",
    "BZ_GSE": "bz_gse",
    "RMS_B_vector": "b_vector_rms_sd",
    "Vx": "vx_kms",
    "Vy": "vy_kms",
    "Vz": "vz_kms",
    "Proton_density": "proton_density",
    "Proton_temp": "proton_temperature",
    "AL_index": "al_index",
    "AU_index": "au_index",
}

# Columns (and order) kept in the per-file output
output_cols = [
    "datetime",
    "bx_gse_gsm", "by_gse", "bz_gse",
    "b_vector_rms_sd", "vx_kms", "vy_kms", "vz_kms",
    "proton_density", "proton_temperature",
    "al_index", "au_index"
]


def build_datetime(frame):
    """Build timestamps from the Year / Day / Hour / Minute columns.

    Vectorised replacement for a row-by-row ``datetime(...)`` construction:
    the timestamp is 1 January of ``Year`` plus ``Day - 1`` days, plus the
    hour and minute offsets.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns "Year", "Day" (1-based day of year),
        "Hour" and "Minute".

    Returns
    -------
    pandas.Series
        datetime64[ns] values aligned with ``frame.index``. A row yields NaT
        when any of the four fields is missing or non-numeric, when Hour is
        outside 0-23 or Minute outside 0-59, or when Year falls outside the
        range representable at nanosecond resolution.
    """
    parts = frame[["Year", "Day", "Hour", "Minute"]].apply(
        pd.to_numeric, errors="coerce"
    )
    valid = (
        parts.notna().all(axis=1)
        & parts["Year"].between(1678, 2261)   # datetime64[ns] bounds
        & parts["Hour"].between(0, 23)
        & parts["Minute"].between(0, 59)
    )

    # Invalid rows get a placeholder year so the integer cast cannot fail;
    # they are masked back to NaT at the end.
    year = parts["Year"].where(valid, 1970).astype("int64")
    year_start = (
        (year.to_numpy() - 1970)
        .astype("datetime64[Y]")
        .astype("datetime64[ns]")
    )

    offsets = (
        pd.to_timedelta(parts["Day"] - 1, unit="D")
        + pd.to_timedelta(parts["Hour"], unit="h")
        + pd.to_timedelta(parts["Minute"], unit="min")
    )
    stamps = pd.Series(year_start, index=frame.index) + offsets
    return stamps.where(valid)


# One cleaned frame per input file; combined in a later cell
dfs = []

for file in lst_files:
    df = (
        pd.read_fwf(file, widths=col_widths, names=col_names)
        .replace(fill_values, np.nan)       # fill values -> NaN
        .assign(datetime=build_datetime)    # Year/Day/Hour/Minute -> timestamp
        .rename(columns=rename_map)
    )[output_cols]                          # keep only the requested columns

    dfs.append(df)

In [None]:
# Stack the per-file frames row-wise into one table with a fresh 0..N-1 index
all_data = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [3]:
# (rows, columns) of the combined IMF table
all_data.shape

(18430560, 12)

In [4]:
# Write the combined IMF table (rows with NaN included) to disk without the index
compiled_csv = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/imf_data/compiled/omni_imf_data_1990_2025.csv"
all_data.to_csv(compiled_csv, index=False)

In [5]:
# Keep only rows in which every column has a value
dropped = all_data.dropna(axis="index", how="any")

In [6]:
# (rows, columns) remaining after dropping rows that contain any NaN
dropped.shape

(12742750, 12)

In [7]:
# Write the NaN-free IMF table to disk without the index
cleaned_csv = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/imf_data/compiled/omni_imf_data_1990_2025_cleaned.csv"
dropped.to_csv(cleaned_csv, index=False)

## Solar and Geomagnetic

In [13]:
import pandas as pd

# --- 1) Read the file ---
# Adjust the path to your file:
path = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/hp30_ap30_1985_2025.txt"

cols = ["YYYY","MM","DD","hh_start","hh_mid","days","days_mid","hp30","ap30","D"]

geo_df = pd.read_csv(
    path,
    sep=r"\s+",              # whitespace-separated (replaces the deprecated delim_whitespace=True)
    comment="#",             # skip header/comment lines
    header=None,
    names=cols,
    engine="python",
    na_values={"hp30": [-1.000], "ap30": [-1, -1.0]}  # treat 'missing' markers as NaN
)

# --- 2) Build a datetime column ---
# The file provides two hour columns:
#   - 'hh_start'  = start of the 30-min interval in hours
#   - 'hh_mid'    = mid time of the interval in hours
# The START time is used here, so each row is stamped with the beginning of
# its 30-minute bin (HH:00 / HH:30). Swap in 'hh_mid' for bin-centred stamps.
dt = pd.to_datetime(
    dict(year=geo_df["YYYY"], month=geo_df["MM"], day=geo_df["DD"]),
    errors="coerce"          # an invalid calendar date becomes NaT instead of raising
) + pd.to_timedelta(geo_df["hh_start"], unit="h")

# Times are UT per the file description. Make them timezone-aware (UTC).
geo_df["datetime"] = dt.dt.tz_localize("UTC")

# --- 3) Keep only the requested columns ---
# Sorted by time and re-indexed 0..N-1.
geo_df = geo_df[["datetime", "hp30", "ap30"]].sort_values("datetime").reset_index(drop=True)

  geo_df = pd.read_csv(


In [14]:
# Preview the first rows of the half-hourly hp30/ap30 table
geo_df.head()

Unnamed: 0,datetime,hp30,ap30
0,1985-01-01 00:00:00+00:00,4.0,27
1,1985-01-01 00:30:00+00:00,3.333,18
2,1985-01-01 01:00:00+00:00,3.667,22
3,1985-01-01 01:30:00+00:00,3.0,15
4,1985-01-01 02:00:00+00:00,4.333,32


In [15]:
# path to your txt file
file_path = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/all_radio_flux_1950_2025.txt"

# Read the whitespace-separated table; lines starting with '#' are skipped and
# the first remaining line supplies the column names.
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
solar_df = pd.read_csv(file_path, sep=r"\s+", comment="#")

# Create the datetime column from the year / month / day columns.
# Bracket access is used so a column name can never be confused with a
# DataFrame attribute.
solar_df["datetime"] = pd.to_datetime(
    dict(year=solar_df["year"], month=solar_df["month"], day=solar_df["day"])
)

# keep only the most accurate (final) observed values — i.e. columns without suffixes
# based on file info: f30, f15, f10.7, f8, f3.2 are the direct observed values
cols_to_keep = ["datetime", "f30", "f15", "f10.7", "f8", "f3.2"]

# .copy() so later column assignments do not act on a view of solar_df
solar_df_clean = solar_df[cols_to_keep].copy()

  solar_df = pd.read_csv(file_path, delim_whitespace=True, comment="#")


In [18]:
# Display the daily radio-flux table (datetime plus the five selected flux columns)
solar_df_clean

Unnamed: 0,datetime,f30,f15,f10.7,f8,f3.2
0,1951-11-01,70.1,86.3,105.6,109.3,285.3
1,1951-11-02,69.0,84.3,103.0,107.0,283.4
2,1951-11-03,68.2,83.3,104.4,106.3,282.9
3,1951-11-04,65.5,79.7,105.4,103.7,282.0
4,1951-11-05,70.5,86.9,112.0,109.6,283.5
...,...,...,...,...,...,...
26991,2025-09-24,123.7,168.0,185.8,164.0,356.1
26992,2025-09-25,124.7,168.9,171.4,163.9,355.9
26993,2025-09-26,119.6,158.8,165.9,153.7,349.7
26994,2025-09-27,120.5,158.7,164.2,153.6,352.5


In [20]:
# Step 1 + 2: give both tables a tz-aware (UTC) datetime key and order them by
# it — merge_asof needs both inputs sorted on the key.
geo_df = geo_df.assign(
    datetime=pd.to_datetime(geo_df["datetime"], utc=True)
).sort_values(by="datetime")
solar_df_clean = solar_df_clean.assign(
    datetime=pd.to_datetime(solar_df_clean["datetime"], utc=True)
).sort_values(by="datetime")

# Step 3: for every geomagnetic row, attach the latest solar row whose
# datetime is at or before it (direction="backward").
solar_geomag_df = pd.merge_asof(
    left=geo_df.sort_values(by="datetime"),
    right=solar_df_clean.sort_values(by="datetime"),
    on="datetime",
    direction="backward",
)

In [26]:
# Write the merged solar + geomagnetic table to disk without the index
solar_geomag_csv = "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/solar_geomag_data.csv"
solar_geomag_df.to_csv(solar_geomag_csv, index=False)

In [None]:
import pandas as pd

# NOTE(review): this cell has no execution count and reads a relative
# "dst_data.txt"; it looks like an earlier draft of the cell under
# "Adding dst", which reads dst_data.lst with the same layout — confirm and
# consider removing it.

# Read the whitespace-separated text file (adjust filename as needed).
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
df = pd.read_csv(
    "dst_data.txt",
    sep=r"\s+",
    header=None,
    names=["year", "day", "hour", "dst"],
)

# Convert year + day-of-year + hour to datetime (no timezone):
# 1 January of the year, plus (day - 1) days, plus the hour offset.
df["datetime"] = (
    pd.to_datetime(df["year"].astype(str), format="%Y")
    + pd.to_timedelta(df["day"] - 1, unit="D")
    + pd.to_timedelta(df["hour"], unit="h")
)

# Keep only datetime and dst
df = df[["datetime", "dst"]]

# Show first few rows (bare last expression -> rich notebook display)
df.head()


## Adding Dst (it was left out of the solar/geomagnetic merge above)

In [16]:
import pandas as pd

# Read the whitespace-separated dst file (adjust filename as needed).
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
dst_df = pd.read_csv(
    "/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/dst_data.lst",
    sep=r"\s+",
    header=None,
    names=["year", "day", "hour", "dst"],
)

# Convert year + day-of-year + hour to datetime (no timezone):
# 1 January of the year, plus (day - 1) days, plus the hour offset.
dst_df["datetime"] = (
    pd.to_datetime(dst_df["year"].astype(str), format="%Y")
    + pd.to_timedelta(dst_df["day"] - 1, unit="D")
    + pd.to_timedelta(dst_df["hour"], unit="h")
)

# Keep only datetime and dst
dst_df = dst_df[["datetime", "dst"]]


  dst_df = pd.read_csv("/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/dst_data.lst",


In [17]:
# Display the hourly dst table (datetime, dst)
dst_df

Unnamed: 0,datetime,dst
0,1985-01-01 00:00:00,-24
1,1985-01-01 01:00:00,-23
2,1985-01-01 02:00:00,-21
3,1985-01-01 03:00:00,-25
4,1985-01-01 04:00:00,-27
...,...,...
350659,2025-01-01 19:00:00,-136
350660,2025-01-01 20:00:00,-133
350661,2025-01-01 21:00:00,-148
350662,2025-01-01 22:00:00,-148


In [18]:
# Reload the merged solar + geomagnetic table written earlier in this notebook
solar_geomag_path = '/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/solar_geomag_data.csv'
large_df = pd.read_csv(solar_geomag_path)

In [19]:
# Parse both datetime keys and strip any timezone so the two keys have the
# same (tz-naive) type for the merge.
dst_df['datetime'] = pd.to_datetime(dst_df['datetime']).dt.tz_localize(None)
large_df['datetime'] = pd.to_datetime(large_df['datetime']).dt.tz_localize(None)

# As-of join on the sorted keys: each row of large_df receives the dst value
# from the latest dst row at or before its datetime.
merged_df = pd.merge_asof(
    left=large_df.sort_values(by='datetime'),
    right=dst_df.sort_values(by='datetime'),
    on='datetime',
    direction='backward',
)

In [24]:
# Output column order, and the exclusive upper time bound for kept rows
final_columns = ['datetime', 'hp30', 'ap30', 'dst', 'f30', 'f15', 'f10.7', 'f8', 'f3.2']
cutoff = pd.Timestamp('2025-01-01')

# Select rows strictly before the cutoff and the output columns in one step
final = merged_df.loc[merged_df['datetime'] < cutoff, final_columns]

In [None]:
# Display the final table: hp30, ap30, dst and the five radio-flux columns per timestamp
final

Unnamed: 0,datetime,hp30,ap30,dst,f30,f15,f10.7,f8,f3.2
0,1985-01-01 00:00:00,4.000,27,-24,46.4,54.1,68.3,76.3,254.1
1,1985-01-01 00:30:00,3.333,18,-24,46.4,54.1,68.3,76.3,254.1
2,1985-01-01 01:00:00,3.667,22,-23,46.4,54.1,68.3,76.3,254.1
3,1985-01-01 01:30:00,3.000,15,-23,46.4,54.1,68.3,76.3,254.1
4,1985-01-01 02:00:00,4.333,32,-21,46.4,54.1,68.3,76.3,254.1
...,...,...,...,...,...,...,...,...,...
701275,2024-12-31 21:30:00,3.667,22,-18,142.0,200.0,210.4,206.8,353.6
701276,2024-12-31 22:00:00,3.667,22,-23,142.0,200.0,210.4,206.8,353.6
701277,2024-12-31 22:30:00,5.000,48,-23,142.0,200.0,210.4,206.8,353.6
701278,2024-12-31 23:00:00,4.000,27,-22,142.0,200.0,210.4,206.8,353.6


In [26]:
# Write the final solar + geomagnetic + dst table to disk without the index
final_csv = '/Users/elliotdable/Documents/PhD/research/fpi_ml/data/geomag_data/solar_geomag_dst_data.csv'
final.to_csv(final_csv, index=False)