In [None]:
%load_ext autoreload
%autoreload 2

import os
import warnings

import numpy as np
import pandas as pd

from utils.download_data import download_themis_data
from utils.process_data_model import *

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Every product the later cells read from disk, mapped to its path under
# .../themis/tha/ on the server (fgm and mom are served from l2, state from l1).
# Downloading all three here means the notebook no longer has to be edited and
# re-run by hand once per product before the date-intersection cell can work.
THEMIS_PRODUCT_PATHS = {
    "fgm": "l2/fgm",
    "mom": "l2/mom",
    "state": "l1/state",
}

for product, remote_path in THEMIS_PRODUCT_PATHS.items():
    # One local folder per product; the name is what the later cells expect.
    download_dir = f"data/tha/{product}_2007_2025"

    for year in range(2007, 2026):
        # Request the full calendar year, one year-directory at a time.
        start_date = f"{year}0101"
        end_date = f"{year}1231"
        base_url = f"https://themis.ssl.berkeley.edu/data/themis/tha/{remote_path}/{year}/"

        download_themis_data(base_url, start_date, end_date, download_dir)

In [None]:
def _dates_in(directory):
    """Collect the date tokens present in the file names of ``directory``.

    The token is the fourth underscore-separated field of each file name.
    Hidden entries (names starting with '.') are ignored. Names with fewer
    than four fields are skipped as well instead of raising IndexError, so a
    stray file in the download folder does not abort the notebook.

    Returns:
        set[str]: the distinct tokens found.
    """
    dates = set()
    # No sorting needed: the tokens go straight into a set.
    for file_name in os.listdir(directory):
        if file_name.startswith('.'):
            continue
        fields = file_name.split('_')
        if len(fields) > 3:
            dates.add(fields[3])
    return dates


fgm_dates = _dates_in("data/tha/fgm_2007_2025")
state_dates = _dates_in("data/tha/state_2007_2025")
mom_dates = _dates_in("data/tha/mom_2007_2025")

# Only keep dates for which all three products have a file.
valid_dates = sorted(mom_dates & fgm_dates & state_dates)

In [None]:
# Build one raw table per product for satellite 'tha', passing the shared date
# list so all three are built from the same set of days.
# Persisting these tables is handled by the following cell, so the writes are
# not repeated here (they used to run twice on every full execution).
raw_fgm_matrix = process_fgm_model("data/tha/fgm_2007_2025", dates=valid_dates, satellite='tha')
raw_state_matrix = process_state_model("data/tha/state_2007_2025", dates=valid_dates, satellite='tha')
raw_iplasma_matrix = process_mom_model("data/tha/mom_2007_2025", dates=valid_dates, satellite='tha')

In [None]:
# to_parquet does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('data/model_data', exist_ok=True)

# Persist each raw table without its row index.
raw_fgm_matrix.to_parquet('data/model_data/raw_fgm_matrix_tha.parquet', index=False)
raw_state_matrix.to_parquet('data/model_data/raw_state_matrix_tha.parquet', index=False)
raw_iplasma_matrix.to_parquet('data/model_data/raw_iplasma_matrix_tha.parquet', index=False)

In [None]:
# Bring the state and ion-moment columns onto the FGM 'Time' column.
# NOTE(review): interpolate_columns is a project helper; its result is unpacked
# as a mapping below, presumably column name -> values at the FGM times — confirm.
interpolated_state = interpolate_columns(
    raw_state_matrix,
    raw_fgm_matrix['Time'],
    ['GSM_x', 'GSM_y'],
)
interpolated_iplasma = interpolate_columns(
    raw_iplasma_matrix,
    raw_fgm_matrix['Time'],
    ['I_velocity_x', 'I_velocity_y', 'I_velocity_z'],
)

# Merge both mappings; on a key clash the ion-moment entry wins.
interpolated_data = {}
interpolated_data.update(interpolated_state)
interpolated_data.update(interpolated_iplasma)

# Side-by-side concat against the FGM table with a fresh 0..n-1 index.
# NOTE(review): assumes the interpolated values line up row-for-row with the
# FGM rows — confirm against interpolate_columns.
raw_matrix = pd.concat(
    [raw_fgm_matrix.reset_index(drop=True), pd.DataFrame(interpolated_data)],
    axis=1,
)

In [None]:
# Persist the merged table (FGM plus interpolated columns) without its row index.
raw_matrix.to_parquet(
    path='data/model_data/raw_matrix_tha.parquet',
    index=False,
)

In [None]:
# Quick sanity check of the merged table: column names, dtypes, non-null counts
# and memory footprint.
raw_matrix.info()

In [None]:
# Length scale for the GSM_x cut below.
# NOTE(review): presumably the Earth radius in km; the commonly quoted mean
# value is ~6371 km — confirm that 6300 is intentional.
RE = 6300

# Selection, in order:
#   1. keep rows with GSM_x below -9 * RE and |GSM_y| smaller than |GSM_x|;
#   2. drop rows missing any velocity or field component;
#   3. keep the last row for each duplicated 'Time';
#   4. order by 'Time' and renumber the index.
final_matrix = (
    raw_matrix
    .loc[
        lambda m: (m['GSM_x'] < -9 * RE)
        & (m['GSM_y'].abs() < m['GSM_x'].abs())
    ]
    .copy()
    .dropna(subset=['I_velocity_x', 'I_velocity_y', 'I_velocity_z', 'Bx', 'By', 'Bz'])
    .drop_duplicates(subset=['Time'], keep='last')
    .sort_values(by=['Time'])
    .reset_index(drop=True)
)

In [None]:
# Row-wise magnitude of the velocity component perpendicular to B, from
#   |V_perp|^2 = |V|^2 - (V . B)^2 / |B|^2
V = final_matrix[['I_velocity_x', 'I_velocity_y', 'I_velocity_z']].to_numpy()
B = final_matrix[['Bx', 'By', 'Bz']].to_numpy()

V_dot_B = (V * B).sum(axis=1)
V_mag_sq = (V**2).sum(axis=1)
B_mag_sq = (B**2).sum(axis=1)

# NOTE(review): a row with B == 0 divides 0 by 0 here and ends up as NaN in the
# output column — confirm that is the intended treatment.
V_parallel_sq = V_dot_B**2 / B_mag_sq
V_perp_sq = V_mag_sq - V_parallel_sq

# Rounding can leave the difference slightly negative; floor at 0 before the root.
final_matrix['|V_perp|'] = np.sqrt(V_perp_sq.clip(min=0))

In [None]:
# Persist the filtered table, including the '|V_perp|' column, without its row index.
final_matrix.to_parquet(
    path='data/model_data/final_matrix_tha_perp.parquet',
    index=False,
)