In [None]:
import numpy as np
import pandas as pd

import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): this filter is global, so deprecation notices raised by pandas/numpy in any
# later cell are hidden as well -- consider narrowing or removing it when upgrading libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the input matrix that every later cell works from.
FINAL_MATRIX_PATH = 'data/model_data/final_matrix_tha_perp.parquet'
final_matrix = pd.read_parquet(FINAL_MATRIX_PATH)

In [24]:
# Detection parameters.
# V_MAX_THRESHOLD: a sample seeds an event when |V_perp| reaches this value
#                  (alternative value kept for reference: 100).
# V_MIN_THRESHOLD: an event keeps growing while |V_perp| stays above this value.
# TIME_EXTENSION:  padding, in seconds, added before and after each detected event.
V_MAX_THRESHOLD = 80
V_MIN_THRESHOLD = 30
TIME_EXTENSION = 5 * 60

# Boolean flag per sample: True where |V_perp| is at or above the upper threshold.
# NOTE(review): the column keeps the name 'v100_event' even though the threshold is 80.
final_matrix['v100_event'] = final_matrix['|V_perp|'].ge(V_MAX_THRESHOLD)

In [25]:
# Display the frame (pandas truncates the rendering) to check the new `v100_event` column.
final_matrix

Unnamed: 0,Time,Bx,By,Bz,GSM_x,GSM_y,I_velocity_x,I_velocity_y,I_velocity_z,|V|,|V_perp|,v100_event
0,1.199000e+09,0.820385,4.555980,8.969902,-56700.025987,-36249.633253,5.700628,-6.303988,-1.313697,8.600188,7.833573,False
1,1.199000e+09,0.806978,4.196406,9.091022,-56700.246719,-36254.372434,7.522322,-14.056296,2.217093,16.095971,15.762129,False
2,1.199000e+09,0.876657,4.654987,8.925768,-56700.458684,-36259.100898,16.959676,-7.360745,1.274464,18.532011,18.515004,False
3,1.199000e+09,0.856859,4.322873,9.165164,-56700.670648,-36263.829361,14.915799,-1.836039,0.028878,15.028404,15.020007,False
4,1.199000e+09,0.914005,4.682649,9.099053,-56700.882612,-36268.557827,5.112792,-2.196419,0.144919,5.566498,5.550791,False
...,...,...,...,...,...,...,...,...,...,...,...,...
16721133,1.735691e+09,-54.085922,15.975058,14.809833,-69756.343230,44002.330364,4629.639585,4023.316704,1795.932679,6391.088653,5775.859322,True
16721134,1.735691e+09,-53.990715,16.126766,14.875697,-69757.633589,43999.294179,4648.021896,4038.659836,1802.691480,6415.962702,5806.953700,True
16721135,1.735691e+09,-54.014530,16.090166,14.856615,-69758.923948,43996.257995,4666.404208,4054.002968,1809.450282,6440.836795,5827.023887,True
16721136,1.735691e+09,-53.943130,16.204220,14.905483,-69760.214307,43993.221810,4684.786520,4069.346101,1816.209083,6465.710933,5855.963420,True


In [26]:
# Pull the columns used by the detection loops out into standalone NumPy arrays
# (independent copies, so the DataFrame is never touched through them).
timestamps = final_matrix['Time'].to_numpy(copy=True)
velocities = final_matrix['|V_perp|'].to_numpy(copy=True)
abs_bx = final_matrix['Bx'].abs().to_numpy(copy=True)
abs_bz = final_matrix['Bz'].abs().to_numpy(copy=True)

# Positions (ascending) of the samples that seed events.
# NOTE(review): this comparison is strict (>), while the `v100_event` column uses >=;
# a sample exactly equal to the threshold is flagged there but is not a seed here.
v100_indices = np.flatnonzero(velocities > V_MAX_THRESHOLD)

In [None]:
# Detect high-velocity event intervals.
#
# For every seed sample (velocity above V_MAX_THRESHOLD) that is not already covered by the
# previous expansion, grow an index range backward and forward while
#   - the velocity stays above V_MIN_THRESHOLD, and
#   - adjacent samples are at most MAX_SAMPLE_GAP seconds apart.
# The range is then padded by TIME_EXTENSION seconds on both sides (clamped to the data's
# first/last timestamp) and merged with the previous interval when the two overlap.
# Result: `events` / `event_df` with one (start_time, end_time, max_velocity) row per event.

# Largest time difference (seconds) between neighbouring samples that an event may span.
MAX_SAMPLE_GAP = 300

n_samples = len(timestamps)

last_end_time = None
last_start_time = None
last_max_velocity = None

events = []

# Index at which the most recent forward expansion stopped. `v100_indices` is ascending
# (it comes from np.where), so every seed strictly below this index was already absorbed
# by that expansion. A single integer therefore replaces a per-sample "visited" set.
covered_until = -1

for idx in v100_indices:
    if idx < covered_until:
        continue

    start_idx, end_idx = idx, idx
    max_velocity = velocities[idx]

    # Walk backward; the loop leaves start_idx on the first sample that fails a condition.
    while (
        start_idx > 0
        and velocities[start_idx] > V_MIN_THRESHOLD
        and (timestamps[start_idx] - timestamps[start_idx - 1]) <= MAX_SAMPLE_GAP
    ):
        max_velocity = max(max_velocity, velocities[start_idx])
        start_idx -= 1

    # Walk forward under the same conditions; the guard keeps end_idx <= n_samples - 1.
    while (
        end_idx < n_samples - 1
        and velocities[end_idx] > V_MIN_THRESHOLD
        and (timestamps[end_idx + 1] - timestamps[end_idx]) <= MAX_SAMPLE_GAP
    ):
        max_velocity = max(max_velocity, velocities[end_idx])
        end_idx += 1

    covered_until = end_idx

    # Pad the interval, without leaving the time span covered by the data.
    start_time = max(timestamps[0], timestamps[start_idx] - TIME_EXTENSION)
    end_time = min(timestamps[-1], timestamps[end_idx] + TIME_EXTENSION)

    if last_end_time is not None and start_time <= last_end_time:
        # Overlaps the pending event: extend it instead of opening a new one.
        last_end_time = max(last_end_time, end_time)
        last_max_velocity = max(last_max_velocity, max_velocity)
    else:
        # Flush the pending event (if any) and start tracking the new one.
        if last_end_time is not None:
            events.append((last_start_time, last_end_time, last_max_velocity))

        last_start_time = start_time
        last_end_time = end_time
        last_max_velocity = max_velocity

# Flush the final pending event.
if last_end_time is not None:
    events.append((last_start_time, last_end_time, last_max_velocity))

event_df = pd.DataFrame(events, columns=['start_time', 'end_time', 'max_velocity'])


In [None]:
# For each event, average |Bx| and |Bz| over a window of samples centred on the event's
# peak-velocity sample, then classify the event by how avg |Bx| compares with avg |Bz|.
#
# Expects `event_df['start_time']` / `['end_time']` to still be numeric and on the same
# scale as `timestamps` (they are passed straight to np.searchsorted).

# Number of samples taken on each side of the peak-velocity sample.
PEAK_HALF_WINDOW = 20

n_events = len(event_df)

# Accumulate into float arrays and assign the columns once at the end. Creating the columns
# as integer zeros and writing floats into them cell by cell relies on silent dtype
# upcasting, which pandas has deprecated.
avg_bx_per_event = np.zeros(n_events, dtype=float)
avg_bz_per_event = np.zeros(n_events, dtype=float)

event_bounds = zip(event_df['start_time'].to_numpy(), event_df['end_time'].to_numpy())
for event_pos, (start_time, end_time) in enumerate(event_bounds):
    # Half-open sample range [first, stop) whose timestamps fall inside the event.
    first = np.searchsorted(timestamps, start_time, side='left')
    stop = np.searchsorted(timestamps, end_time, side='right')

    # Absolute index of the fastest sample inside the event.
    max_v_idx = first + np.argmax(velocities[first:stop])

    # Window around the peak, clipped so it never leaves the event.
    window_start = max(first, max_v_idx - PEAK_HALF_WINDOW)
    window_end = min(stop - 1, max_v_idx + PEAK_HALF_WINDOW)

    avg_bx_per_event[event_pos] = np.mean(abs_bx[window_start:window_end + 1])
    avg_bz_per_event[event_pos] = np.mean(abs_bz[window_start:window_end + 1])

event_df['avg_|Bx|'] = avg_bx_per_event
event_df['avg_|Bz|'] = avg_bz_per_event

# Classify every event at once; rows matching no condition (e.g. a NaN average) get class 0.
class_conditions = [
    avg_bx_per_event < (avg_bz_per_event / 2),                # Class 1: |Bx| much smaller than |Bz|
    ((avg_bz_per_event / 2) <= avg_bx_per_event)
    & (avg_bx_per_event <= 2 * avg_bz_per_event),             # Class 2: |Bx| comparable to |Bz|
    avg_bx_per_event > (2 * avg_bz_per_event),                # Class 3: |Bx| much larger than |Bz|
]
event_df['event_class'] = np.select(class_conditions, [1, 2, 3], default=0).astype('int64')


In [None]:
# Turn the epoch-second bounds into timezone-aware (UTC) datetimes.
for time_col in ('start_time', 'end_time'):
    event_df[time_col] = pd.to_datetime(event_df[time_col], unit='s', utc=True)

# Event length in minutes, derived from the converted bounds.
duration_minutes = (event_df['end_time'] - event_df['start_time']).dt.total_seconds() / 60

# Satellite label goes first; the duration lands right after `end_time`.
# NOTE(review): the saved output of the following cell shows 'THEMIS E' while this cell
# writes 'THEMIS A' -- the stored output looks stale; re-run to confirm.
# NOTE: `insert` raises if the column already exists, so this cell cannot be run twice.
event_df.insert(0, 'satellite_name', ['THEMIS A'] * len(event_df))
event_df.insert(3, 'event_duration', duration_minutes)

In [30]:
# Display the event table (one row per detected event) as the cell's output.
event_df

Unnamed: 0,satellite_name,start_time,end_time,event_duration,max_velocity,avg_|Bx|,avg_|Bz|,event_class
0,THEMIS E,2008-01-01 08:53:47.814451694+00:00,2008-01-01 09:09:23.898902655+00:00,15.601408,147.630382,8.836877,9.387485,2
1,THEMIS E,2008-01-03 07:29:22.428395271+00:00,2008-01-03 08:01:22.759043694+00:00,32.005511,384.382663,2.778665,9.360879,1
2,THEMIS E,2008-01-03 08:11:05.055194855+00:00,2008-01-03 08:26:32.137136459+00:00,15.451366,164.915757,1.382779,9.852980,1
3,THEMIS E,2008-01-03 08:37:05.446145058+00:00,2008-01-03 08:54:38.559691429+00:00,17.551892,81.656782,1.514506,12.537084,1
4,THEMIS E,2008-01-04 08:06:57.361459970+00:00,2008-01-04 08:25:36.495209217+00:00,18.652229,196.783074,4.503519,3.166869,2
...,...,...,...,...,...,...,...,...
12475,THEMIS E,2024-12-31 20:50:01.958037853+00:00,2024-12-31 21:04:02.783228397+00:00,14.013753,81.438547,51.976910,21.421810,3
12476,THEMIS E,2024-12-31 21:40:20.300535679+00:00,2024-12-31 21:57:25.758470774+00:00,17.090966,84.642437,52.836033,22.347258,3
12477,THEMIS E,2024-12-31 21:57:39.862839222+00:00,2024-12-31 22:10:16.399245739+00:00,12.608940,83.201703,57.884041,23.711712,3
12478,THEMIS E,2024-12-31 22:27:50.065874577+00:00,2024-12-31 22:45:39.675072432+00:00,17.826820,113.877356,62.528183,24.744497,3


In [None]:
# Persist the event table without the index column.
# NOTE(review): the '80' in the file name mirrors V_MAX_THRESHOLD -- keep the two in sync.
EVENT_CSV_PATH = 'data/event_data_tha_perp_80_avg.csv'
event_df.to_csv(EVENT_CSV_PATH, index=False)

In [None]:
# Label every row of `df` with a binary event indicator and the class of the event whose
# [start_time, end_time] interval (both ends inclusive) contains the row's `Time`.

df = final_matrix.copy()

# Event bounds converted from datetimes back to epoch seconds, to compare with `Time`.
start_times = (pd.to_datetime(event_df['start_time']).astype('int64') / 10**9).values
end_times = (pd.to_datetime(event_df['end_time']).astype('int64') / 10**9).values
df_times = df['Time'].values
# Positional access to the classes; assumes event_df has its default 0..n-1 index.
event_classes = event_df['event_class'].to_numpy()

event_label = np.zeros(len(df), dtype=int)
event_class = np.zeros(len(df), dtype=int)

if np.all(df_times[1:] >= df_times[:-1]):
    # Times are non-decreasing: each event covers one contiguous run of rows, located with
    # a binary search instead of a full-length boolean mask per event.
    first_rows = np.searchsorted(df_times, start_times, side='left')    # first row >= start
    stop_rows = np.searchsorted(df_times, end_times, side='right')      # first row > end
    for first, stop, cls in zip(first_rows, stop_rows, event_classes):
        event_label[first:stop] = 1
        event_class[first:stop] = cls
else:
    # Unsorted (or NaN-containing) times: fall back to an explicit mask per event.
    for idx in range(len(event_df)):
        mask = np.logical_and(df_times >= start_times[idx], df_times <= end_times[idx])
        event_label[mask] = 1
        event_class[mask] = event_classes[idx]

In [33]:
# Attach the per-row results as new columns (class first, then the binary label).
for column_name, column_values in (('Event_class', event_class), ('Event_label', event_label)):
    df[column_name] = column_values

In [34]:
# Drop the columns not kept in the labelled output and keep the raw epoch value
# under an explicit name.
unused_columns = ['GSM_x', 'GSM_y', 'I_velocity_x', 'I_velocity_y', 'I_velocity_z', 'v100_event']
df = df.drop(columns=unused_columns).rename(columns={'Time': 'Epoch_time'})

# Re-introduce `Time` as a UTC datetime in the first position.
utc_time = pd.to_datetime(df['Epoch_time'], unit='s', utc=True)
df.insert(0, 'Time', utc_time)

In [35]:
# Display the labelled frame (with `Event_class` / `Event_label`) as the cell's output.
df

Unnamed: 0,Time,Epoch_time,Bx,By,Bz,|V|,|V_perp|,Event_class,Event_label
0,2007-12-30 07:32:57.505841494+00:00,1.199000e+09,0.820385,4.555980,8.969902,8.600188,7.833573,0,0
1,2007-12-30 07:33:00.506551981+00:00,1.199000e+09,0.806978,4.196406,9.091022,16.095971,15.762129,0,0
2,2007-12-30 07:33:03.507262707+00:00,1.199000e+09,0.876657,4.654987,8.925768,18.532011,18.515004,0,0
3,2007-12-30 07:33:06.507973193+00:00,1.199000e+09,0.856859,4.322873,9.165164,15.028404,15.020007,0,0
4,2007-12-30 07:33:09.508685589+00:00,1.199000e+09,0.914005,4.682649,9.099053,5.566498,5.550791,0,0
...,...,...,...,...,...,...,...,...,...
16721133,2025-01-01 00:17:31.590120316+00:00,1.735691e+09,-54.085922,15.975058,14.809833,6391.088653,5775.859322,3,1
16721134,2025-01-01 00:17:35.603873491+00:00,1.735691e+09,-53.990715,16.126766,14.875697,6415.962702,5806.953700,3,1
16721135,2025-01-01 00:17:39.617626667+00:00,1.735691e+09,-54.014530,16.090166,14.856615,6440.836795,5827.023887,3,1
16721136,2025-01-01 00:17:43.631380081+00:00,1.735691e+09,-53.943130,16.204220,14.905483,6465.710933,5855.963420,3,1


In [None]:
# Write the labelled matrix to disk without the index.
LABELED_MATRIX_PATH = 'data/final_matrix_tha_perp_labeled_v80.parquet'
df.to_parquet(LABELED_MATRIX_PATH, index=False)