In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd

In [2]:
# --- input locations -------------------------------------------------------
LABELS_DIR = "../04-data"  # directory holding the label mapping file
SOURCE_DIR = "../04-data/orbits"  # directory holding the per-orbit data files

# --- output location -------------------------------------------------------
TARGET_DIR = "merged"  # directory the merged results are written to

In [3]:
# Names of the eight boundary events.
# NOTE(review): this list is not referenced in the visible cells, and
# apply_labels reads differently named columns ("SK outer" ... "SK outer.1")
# -- confirm whether it is still needed.
EVENT_COLUMNS = [
    "SK outer in",
    "SK inner in",
    "MP outer in",
    "MP inner in",
    "MP inner out",
    "MP outer out",
    "SK inner out",
    "SK outer out",
]

# Feature columns read from every orbit file (together with "DATE") and
# summarised in the description sheets.
DATA_COLUMNS = [
    'X_MSO',
    'Y_MSO',
    'Z_MSO',
    'BX_MSO',
    'BY_MSO',
    'BZ_MSO',
    'DBX_MSO',
    'DBY_MSO',
    'DBZ_MSO',
    'RHO_DIPOLE',
    'PHI_DIPOLE',
    'THETA_DIPOLE',
    'BABS_DIPOLE',
    'BX_DIPOLE',
    'BY_DIPOLE',
    'BZ_DIPOLE',
    'RHO',
    'RXY',
    'X',
    'Y',
    'Z',
    'VX',
    'VY',
    'VZ',
    'VABS',
    'D',
    'COSALPHA',
    'EXTREMA',
]

In [4]:
def normalize(df, columns):
    """Standardize the selected columns of ``df`` in place.

    Every selected column has its own mean subtracted and is then divided by
    its own standard deviation (both computed with the pandas defaults).
    The frame is modified directly; nothing is returned.
    """
    selected = df[columns]
    centered = selected - selected.mean()
    df[columns] = centered / selected.std()

In [5]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging.

    Parameters
    ----------
    orbit_id : int
        Orbit number. It is zero-padded to four digits to build the file name
        ``messenger-XXXX.csv`` inside ``SOURCE_DIR``.

    Returns
    -------
    pandas.DataFrame or None
        The ``DATA_COLUMNS`` plus ``DATE`` (parsed as datetimes) and an added
        ``ORBIT`` column holding ``orbit_id``. ``None`` is returned when the
        orbit file does not exist; ``pd.concat`` silently skips ``None``
        entries, so missing orbits simply drop out of the merge.
    """

    file = os.path.join(SOURCE_DIR, f"messenger-{orbit_id:04d}.csv")
    if not os.path.exists(file):
        return None

    # parse_dates=True only asks pandas to parse the *index*, which left DATE
    # as plain strings; name the column explicitly so it is really parsed.
    df_orbit = pd.read_csv(file, usecols=DATA_COLUMNS + ["DATE"], parse_dates=["DATE"])
    # Disabled step. NOTE: interpolate() returns a new frame, so re-enabling
    # it requires assigning the result back.
    #df_orbit.interpolate()  # interpolate missing values
    df_orbit["ORBIT"] = orbit_id  # add orbit id
    # Disabled step: per-orbit standardization of the data columns.
    #normalize(df_orbit, DATA_COLUMNS)
    return df_orbit

In [6]:
def apply_labels(df_train, df_labels):
    """Assign labels to the training time series using event boundary labels.

    A ``LABEL`` column is added to (or overwritten in) ``df_train`` in place;
    nothing is returned. Label values:

    * 0 -- interplanetary magnetic field (default)
    * 1 -- bow shock crossing
    * 2 -- magnetosheath
    * 3 -- magnetopause crossing
    * 4 -- magnetosphere

    Parameters
    ----------
    df_train : pandas.DataFrame
        Time series with a ``DATE`` column (datetimes or datetime strings).
    df_labels : pandas.DataFrame
        One row per orbit with the boundary columns ``SK outer``,
        ``SK inner``, ``MP outer``, ``MP inner``, ``MP inner.1``,
        ``MP outer.1``, ``SK inner.1`` and ``SK outer.1``.

    Notes
    -----
    Every window is half-open: ``start <= DATE < end``. Within one label row
    the regions are written in the order 1, 2, 3, 4, so a later region wins
    where windows overlap.
    """

    # Compare real timestamps, not raw strings: string comparison is
    # lexicographic and only agrees with time order when both sides share
    # exactly the same text format. Converting once up front also avoids
    # repeating the work for every label row.
    dates = pd.to_datetime(df_train["DATE"])

    boundary_columns = ["SK outer", "SK inner", "MP outer", "MP inner",
                        "MP inner.1", "MP outer.1", "SK inner.1", "SK outer.1"]
    boundaries = df_labels[boundary_columns].apply(pd.to_datetime)

    def in_window(start, end):
        """Return a positional boolean array for ``start <= DATE < end``."""
        # A plain array (not a Series) keeps the assignment positional, so it
        # also works when df_train carries a non-unique index after concat.
        return ((dates >= start) & (dates < end)).to_numpy()

    df_train["LABEL"] = 0  # interplanetary magnetic field is default

    # "_a" names come from the unsuffixed columns, "_b" names from the ".1"
    # columns (presumably the second crossing of each boundary -- confirm
    # against the label file).
    for (sk_outer_a, sk_inner_a, mp_outer_a, mp_inner_a,
         mp_inner_b, mp_outer_b, sk_inner_b, sk_outer_b) in boundaries.itertuples(index=False, name=None):
        # bow shock crossing
        bs_selector = in_window(sk_outer_a, sk_inner_a) | in_window(sk_inner_b, sk_outer_b)
        df_train.loc[bs_selector, "LABEL"] = 1

        # magnetosheath
        msh_selector = in_window(sk_inner_a, mp_outer_a) | in_window(mp_outer_b, sk_inner_b)
        df_train.loc[msh_selector, "LABEL"] = 2

        # magnetopause crossing
        mp_selector = in_window(mp_outer_a, mp_inner_a) | in_window(mp_inner_b, mp_outer_b)
        df_train.loc[mp_selector, "LABEL"] = 3

        # magnetosphere
        msp_selector = in_window(mp_inner_a, mp_inner_b)
        df_train.loc[msp_selector, "LABEL"] = 4

In [7]:
# Read the boundary-crossing list and keep only complete rows: any row with a
# missing value is treated as an invalid orbit and dropped.
# NOTE(review): parse_dates=True only targets the index, so the boundary
# columns are presumably left exactly as read -- confirm.
labels_path = os.path.join(LABELS_DIR, "Mercury_BS_and_MP_crossings_Philpott - List 1.csv")
df_labels = pd.read_csv(labels_path, parse_dates=True).dropna()
print(df_labels["Orbit"].size)  # number of orbits that remain

4040


In [None]:
# Load every labelled orbit and stack the frames into one training frame.
# Orbits whose file is missing come back as None, which pd.concat skips.
df_train = pd.concat([prepare_orbit(orbit_id) for orbit_id in df_labels["Orbit"]])
print(df_train.iloc[43460:43465])  # spot-check a few rows by position

In [None]:
# Label every sample in place: this adds (or overwrites) the LABEL column
# of df_train using the boundary times in df_labels.
apply_labels(df_train, df_labels)
# Spot-check a slice of rows (selected by position) after labelling.
print(df_train.iloc[32245:32255])

In [None]:
# The writers below do not create missing directories, so make sure the
# output directory exists before exporting anything.
os.makedirs(TARGET_DIR, exist_ok=True)

# Full labelled training set.
df_train.to_csv(os.path.join(TARGET_DIR, "df_train.csv"))

# Summary statistics of the data columns over all samples.
df_train_description = df_train[DATA_COLUMNS].describe()
df_train_description.to_excel(os.path.join(TARGET_DIR, "df_train_total_description.xlsx"))

# The same statistics broken down per label (note: reuses the variable above).
df_train_description = df_train.groupby(["LABEL"])[DATA_COLUMNS].describe()
df_train_description.to_excel(os.path.join(TARGET_DIR, "df_train_label_description.xlsx"))

In [None]:
# Export orbits 0..149 as individual labelled CSV files.
#
# Missing orbits are skipped explicitly and real failures are reported with
# their cause instead of being swallowed by a bare ``except``. The orbit is
# kept in a loop-local frame so the merged ``df_train`` is not overwritten.
os.makedirs(TARGET_DIR, exist_ok=True)  # to_csv does not create directories

for i in range(0, 150):
    df_label = df_labels[df_labels.Orbit == i]
    if df_label.empty:
        print(f"{i}: skipped (no label row for this orbit)")
        continue

    try:
        df_orbit = prepare_orbit(i)
        if df_orbit is None:
            print(f"{i}: skipped (orbit file not found)")
            continue

        apply_labels(df_orbit, df_label)  # labels df_orbit in place
        df_orbit.to_csv(os.path.join(TARGET_DIR, "df_train_{}.csv".format(i)))
    except Exception as exc:
        # Best effort: keep going with the next orbit, but show what failed.
        print(f"{i}: failed ({exc!r})")
