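This notebook loads the scaled THEMIS A, D, and E datasets, splits each one into contiguous sequences wherever `Epoch_time` jumps by more than 300 between consecutive rows, shuffles the sequences from all three satellites together with a fixed seed, and concatenates them into one combined dataset that is saved for modeling. It then attaches the matching label columns (`|V_perp|`, `Event_class`, `Event_label`, `Satellite`) in the same shuffled order and saves a labeled test file.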

In [None]:
import pandas as pd
import random

In [None]:
# Scaled model inputs, one parquet file per satellite (A, D, E in that order).
scaled_paths = (
    "data/v80/final_model_data_tha_scaled.parquet",
    "data/v80/final_model_data_thd_scaled.parquet",
    "data/v80/final_model_data_the_scaled.parquet",
)
tha_df, thd_df, the_df = map(pd.read_parquet, scaled_paths)

In [None]:
def split_into_sequences(df, time_col='Epoch_time', max_gap=300):
    """Group the rows of ``df`` into runs separated by large time gaps.

    A new run starts at every row whose ``time_col`` value exceeds the
    previous row's value by more than ``max_gap``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``time_col``. Presumably already sorted by time;
        verify against the upstream pipeline.
    time_col : str
        Column whose row-to-row difference is inspected.
    max_gap : int or float
        Largest difference that still keeps two rows in the same run
        (same units as ``time_col``).

    Returns
    -------
    numpy.ndarray
        Object array with one entry per run; each entry is the list of
        index labels of the rows in that run, in row order.
    """
    # diff() is NaN on the first row, so the comparison is False there and
    # the first row simply opens run 0.
    run_id = (df[time_col].diff() > max_gap).cumsum()
    return df.groupby(run_id).apply(lambda run: list(run.index)).values


# NOTE(review): the returned labels are later passed to `.iloc` on the label
# frames, which only lines up if these frames carry a default 0..n-1 index -
# TODO confirm.
tha_seqs = split_into_sequences(tha_df)
thd_seqs = split_into_sequences(thd_df)
the_seqs = split_into_sequences(the_df)

In [None]:
# Key each sequence as "<sequence number>_<satellite letter>" so the source
# satellite can be recovered later from the key's last character.
tha_dict = dict((f'{seq_no}_A', seq) for seq_no, seq in enumerate(tha_seqs))
thd_dict = dict((f'{seq_no}_D', seq) for seq_no, seq in enumerate(thd_seqs))
the_dict = dict((f'{seq_no}_E', seq) for seq_no, seq in enumerate(the_seqs))

In [None]:
# Pool the sequences of all three satellites (A first, then D, then E).
# The satellite suffix keeps the keys distinct across the three dicts.
merged_dict = {}
for per_satellite in (tha_dict, thd_dict, the_dict):
    merged_dict.update(per_satellite)

In [None]:
# Shuffle whole sequences (not individual rows) in a reproducible order.
items = [*merged_dict.items()]

# Seed the global RNG immediately before shuffling so the order is fixed.
random.seed(18)
random.shuffle(items)

In [9]:
# Dicts preserve insertion order, so this keeps the shuffled sequence order.
randomized_dict = {seq_key: seq for seq_key, seq in items}

In [None]:
# Satellite letter (the last character of a sequence key) -> scaled frame.
df_map = dict(zip(('A', 'D', 'E'), (tha_df, thd_df, the_df)))

In [11]:
# Rebuild a single frame by stacking the sequences in their shuffled order.
# The last character of each key ('A', 'D' or 'E') names the source frame.
# The stored values are index *labels* (collected from `DataFrame.index`), so
# rows are looked up with `.loc`; `.iloc` is positional and would only agree
# with the labels when the frame has a default 0..n-1 index.
selected_rows = [
    df_map[key[-1]].loc[indices]
    for key, indices in randomized_dict.items()
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
giant_dataset = pd.concat(selected_rows, ignore_index=True)

In [None]:
# Persist the combined, sequence-shuffled frame (before label columns are attached).
giant_dataset.to_parquet("data/final_model_data_all_scaled.parquet")

In [13]:
# Row and column counts of the combined frame.
giant_dataset.shape

(40696367, 18)

In [14]:
# Class balance of Event_label_100, shown as fractions of all rows.
giant_dataset['Event_label_100'].value_counts(normalize=True)

Event_label_100
0    0.734866
1    0.265134
Name: proportion, dtype: float64

In [15]:
# Class balance of Event_label_80, shown as fractions of all rows.
giant_dataset['Event_label_80'].value_counts(normalize=True)

Event_label_80
0    0.648822
1    0.351178
Name: proportion, dtype: float64

In [None]:
LABEL_COLUMNS = ['|V_perp|', 'Event_class', 'Event_label']


def load_label_frame(path):
    """Read the label columns from the parquet file at ``path``.

    Only ``LABEL_COLUMNS`` are read from disk, the first two rows are
    dropped, and the index is reset to 0..n-1.

    Parameters
    ----------
    path : str
        Location of a labeled parquet file.

    Returns
    -------
    pandas.DataFrame
        Frame holding ``LABEL_COLUMNS`` with a fresh default index.
    """
    # Ask the parquet reader for just the needed columns instead of loading
    # the whole file and discarding the rest.
    labels = pd.read_parquet(path, columns=LABEL_COLUMNS)
    # NOTE(review): the first two rows are skipped, presumably so that row i
    # here lines up with row i of the scaled frame - confirm upstream.
    return labels.iloc[2:].reset_index(drop=True)


tha_label = load_label_frame("data/final_matrix_tha_perp_labeled_v80.parquet")
thd_label = load_label_frame("data/final_matrix_thd_perp_labeled_v80.parquet")
the_label = load_label_frame("data/final_matrix_the_perp_labeled_v80.parquet")

In [None]:
# Tag every label row with its source satellite. Assigning a scalar
# broadcasts it to all rows, so there is no need to build a Python list
# as long as the frame just to fill the column.
tha_label['Satellite'] = 'THEMIS A'
thd_label['Satellite'] = 'THEMIS D'
the_label['Satellite'] = 'THEMIS E'

In [None]:
# Satellite letter (the last character of a sequence key) -> label frame.
df_label_map = dict([
    ('A', tha_label),
    ('D', thd_label),
    ('E', the_label),
])

In [None]:
# Walk the sequences in the same shuffled order and pull the matching rows
# from each satellite's label frame, selected by position.
selected_rows = [
    df_label_map[seq_key[-1]].iloc[row_ids]
    for seq_key, row_ids in randomized_dict.items()
]

# Stack everything under a fresh 0..n-1 index.
giant_labels = pd.concat(selected_rows, ignore_index=True)

In [None]:
# Columns carried by the stacked label frame.
giant_labels.columns

Index(['|V_perp|', 'Event_class', 'Event_label', 'Satellite'], dtype='object')

In [None]:
# Number of label rows contributed by each satellite.
giant_labels['Satellite'].value_counts()

Satellite
THEMIS A    21360590
THEMIS D    19335777
THEMIS E    16721136
Name: count, dtype: int64

In [None]:
label_columns = ['|V_perp|', 'Event_class', 'Event_label', 'Satellite']

# Column assignment aligns the two frames on their index, so a difference in
# length would not raise: it would silently leave NaNs or drop label rows.
# Fail loudly instead of writing misaligned labels.
if len(giant_labels) != len(giant_dataset):
    raise ValueError(
        f"giant_labels has {len(giant_labels)} rows but giant_dataset has "
        f"{len(giant_dataset)}; refusing to attach misaligned labels"
    )

giant_dataset[label_columns] = giant_labels[label_columns]

In [None]:
# Hold out the trailing 20% of rows; everything before `split` is the
# remaining 80%.
n_rows = len(giant_dataset)
split = int(0.8 * n_rows)
giant_test = giant_dataset.iloc[split:]

In [None]:
# Write only the held-out tail (`giant_test`). Saving `giant_dataset` here
# would put every row, including the first 80%, into the test file.
giant_test.to_parquet("data/final_model_data_all_labeled_test.parquet")