In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from datetime import datetime
from sgp4.api import Satrec, SatrecArray, WGS72

import concurrent.futures

pd.set_option("display.max_columns", 999)

In [2]:
global dataset
dataset = "test" # variable for lazy loading defaultdict
input_files = [
    "train",
    "test",
    "secret_test",
]

from collections import defaultdict
# using defaultdict to lazy load dataframes.... probably should stay in notebook as shortcut only
data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}.pkl"))
tle_sup_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../tle_sup/{dataset}.pkl"))
sgp4_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_sgp4rv.pkl"))
# satrec_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_satrec.pkl"))

In [3]:
def __jday_convert(x):
    '''
    Algorithm from python-sgp4:

    from sgp4.functions import jday
    jday(x.year, x.month, x.day, x.hour, x.minute, x.second + x.microsecond * 1e-6)
    '''
    jd = (367.0 * x.year
         - 7 * (x.year + ((x.month + 9) // 12.0)) * 0.25 // 1.0
           + 275 * x.month / 9.0 // 1.0
           + x.day
         + 1721013.5)
    fr = (x.second + (x.microsecond * 1e-6) + x.minute * 60.0 + x.hour * 3600.0) / 86400.0;
    return jd, fr

In [4]:
def cyclic_repr(s,v):
    cos = np.cos(np.deg2rad(s * (360/v)))
    sin = np.sin(np.deg2rad(s * (360/v)))
    return cos,sin

In [5]:
def convert_feature_values(df):
    name = df.name
    df = df.sort_values("EPOCH")
    
    # convert ARG_OF_PERICENTER, RA_OF_ASC_NODE, and MEAN_ANOMALY to non-cyclic version
    df["ARG_OF_PERICENTER_ADJUSTED"] = np.cumsum(np.around(df.ARG_OF_PERICENTER.diff().fillna(0) / -360))*360 + df.ARG_OF_PERICENTER
    df["RA_OF_ASC_NODE_ADJUSTED"] = np.cumsum(np.around(df.RA_OF_ASC_NODE.diff().fillna(0) / -360))*360 + df.RA_OF_ASC_NODE
    
    # according to 18 SPCS there was only 1 such case BUT ITS NOT TRUE there are like 70+
    # this is because for REV_AT_EPOCH = 100,000, it's recorded as 10,000 instead of 0
    # this doesn't handle the case for multiple ground stations reporting though, if the previous is different....
    # would it be better to just remove this as an outlier just to be safe?
    # 90k +- 20 max offset based on MEAN_MOTION maximum from earlier steps
    df.loc[(df.REV_AT_EPOCH==10000) & df.REV_AT_EPOCH.diff().between(-89999,-89940),'REV_AT_EPOCH'] = 0

    # combine REV_AT_EPOCH and MEAN_ANOMALY for a non-cyclic representation
    adjusted_rev = df.REV_AT_EPOCH + np.cumsum(np.around(df.REV_AT_EPOCH.diff().fillna(0) / -100000)) * 100000
    df["REV_MEAN_ANOMALY_COMBINED"] = adjusted_rev * 360 + df.MEAN_ANOMALY
    
    # this is to handle the REV_AT_EPOCH problem inconsistency problem
    # otherwise the REV_MEAN_ANOMALY_COMBINED difference may be incorrect
    # bfill because we may start at non-zero due to previous data removal bit
    a = np.round((adjusted_rev.diff().fillna(method='bfill')/300)).fillna(0)
    df["SUBGROUP"] = np.cumsum(a).astype(int)
    
    doycos, doysin = cyclic_repr(df.EPOCH.dt.dayofyear, 366)
    df["DAY_OF_YEAR_COS"] = doycos
    df["DAY_OF_YEAR_SIN"] = doysin
    
    macos, masin = cyclic_repr(df.MEAN_ANOMALY, 360)
    df["MEAN_ANOMALY_COS"] = macos
    df["MEAN_ANOMALY_SIN"] = masin
    
    icos, isin = cyclic_repr(df.INCLINATION, 360)
    df["INCLINATION_COS"] = icos
    df["INCLINATION_SIN"] = isin
    
    rcos, rsin = cyclic_repr(df.RA_OF_ASC_NODE, 360)
    df["RA_OF_ASC_NODE_COS"] = rcos
    df["RA_OF_ASC_NODE_SIN"] = rsin
    
    df[['EPOCH_JD', 'EPOCH_FR']] = df.EPOCH.apply(__jday_convert).to_list()
    return df

In [6]:
# input is 1 groupby of satellite
def generate_X_y(df):
    idx = df.name

    df = df.reset_index(level=1).drop_duplicates(subset=['EPOCH']).sort_values("EPOCH")
    dfs = []
    for i in range(0,20):
        dfi = pd.concat([df.add_suffix("_1"),df.shift(-i).add_suffix("_2")], axis=1).dropna()
        dfs.append(dfi)
    ddf = pd.concat(dfs).reset_index(drop=True)
    # Reference variables only, DO NOT USE TO TRAIN
    __cols = [
        'NORAD_CAT_ID_1','GP_ID_1','GP_ID_2','EPOCH_1','EPOCH_2',
#         'SAT_RX_2', 'SAT_RY_2', 'SAT_RZ_2', 'SAT_VX_2', 'SAT_VY_2', 'SAT_VZ_2', # these are ground truths
    ]
    df = ddf[__cols]
    df.columns = ['__'+x for x in __cols]
    
    # X
    x_cols = [
        'EPOCH_JD_1', 'EPOCH_FR_1', 'EPOCH_JD_2', 'EPOCH_FR_2',
        'MEAN_MOTION_DOT_1', 'BSTAR_1', 'INCLINATION_1', 'RA_OF_ASC_NODE_1', 'ECCENTRICITY_1', 'ARG_OF_PERICENTER_1',
        'MEAN_ANOMALY_1', 'MEAN_MOTION_1',
        'MEAN_ANOMALY_COS_1', 'MEAN_ANOMALY_SIN_1',
        'INCLINATION_COS_1', 'INCLINATION_SIN_1',
        'RA_OF_ASC_NODE_COS_1', 'RA_OF_ASC_NODE_SIN_1',
        'SEMIMAJOR_AXIS_1', 'PERIOD_1', 'APOAPSIS_1', 'PERIAPSIS_1', 'RCS_SIZE_1',
        'SAT_RX_1', 'SAT_RY_1', 'SAT_RZ_1', 'SAT_VX_1', 'SAT_VY_1', 'SAT_VZ_1',
        'YEAR_1', 'DAY_OF_YEAR_COS_1', 'DAY_OF_YEAR_SIN_1',
        'SUNSPOTS_1D_1', 'SUNSPOTS_3D_1', 'SUNSPOTS_7D_1',
        'AIR_MONTH_AVG_TEMP_1','WATER_MONTH_AVG_TEMP_1',
    ]
    
    df['X_delta_EPOCH'] = (ddf.EPOCH_2 - ddf.EPOCH_1).astype(int) / 86400000000000 # in days
    df[['X_'+x for x in x_cols]] = ddf[x_cols]

    y_cols = ['BSTAR', 'INCLINATION', 'RA_OF_ASC_NODE', 'ECCENTRICITY', 'ARG_OF_PERICENTER', 'MEAN_MOTION', 'MEAN_ANOMALY']
    df[['y_'+y for y in y_cols]] = ddf[[y+'_2' for y in y_cols]]
    
    df['y_REV_MA_REG'] = ((ddf.REV_MEAN_ANOMALY_COMBINED_2 - ddf.REV_MEAN_ANOMALY_COMBINED_1) + ddf.MEAN_ANOMALY_1) / 360
    df['y_ARG_OF_PERICENTER_REG'] = (ddf.ARG_OF_PERICENTER_ADJUSTED_2 - ddf.ARG_OF_PERICENTER_ADJUSTED_1 + ddf.ARG_OF_PERICENTER_1) / 360
    df['y_RA_OF_ASC_NODE_REG'] = (ddf.RA_OF_ASC_NODE_ADJUSTED_2 - ddf.RA_OF_ASC_NODE_ADJUSTED_1 + ddf.RA_OF_ASC_NODE_1) / 360

    df = df[(df['X_delta_EPOCH'] < 14) & (df['X_delta_EPOCH'] > 0.04) & ((df.y_REV_MA_REG / df.X_delta_EPOCH).between(df.X_MEAN_MOTION_1*0.99,df.X_MEAN_MOTION_1*1.01))]

    return df

In [7]:
def do_the_thing(df, f):
    df = df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
    df = df.merge(sgp4_data[f], left_index=True, right_index=True)
    df = df.merge(tle_sup_data[f], left_on="GP_ID", right_index=True)
    df = df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
    df.reset_index(drop=True, inplace=True)
    return df

In [10]:
# generate smaller set from training set to test
prefix = "sample2_"

dataset = "train"
train_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 450)
sample_train_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(train_ids)]
sample_train_df = do_the_thing(sample_train_df, dataset)
sample_train_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t5_data/{prefix}train.pkl")
print(len(sample_train_df))

dataset = "test"
test_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 50)
sample_test_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(test_ids)]
sample_test_df = do_the_thing(sample_test_df, dataset)
sample_test_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t5_data/{prefix}test.pkl")
print(len(sample_test_df))

  0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/9911 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

18035161
1263171


In [9]:
for f in input_files:
    dataset = f # variable for lazy loading defaultdict
    print(f"Preparing data for: {f}")
    df = do_the_thing(data[f], dataset)
    df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t5_data/{f}.pkl")

Preparing data for: train


  0%|          | 0/12288 [00:00<?, ?it/s]

  0%|          | 0/314601 [00:00<?, ?it/s]

Preparing data for: test


  0%|          | 0/2702 [00:00<?, ?it/s]

  0%|          | 0/65145 [00:00<?, ?it/s]

Preparing data for: secret_test


  0%|          | 0/2711 [00:00<?, ?it/s]

  0%|          | 0/61486 [00:00<?, ?it/s]

In [None]:
# dataset = "test" # set the lazy loader
# sample_df = data[dataset][data[dataset].NORAD_CAT_ID.isin([20885, 7128])]#, 4756
# sample_df = sample_df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
# # sample_df = sample_df.merge(sgp4_data[dataset], left_on="GP_ID", right_index=True)
# sample_df = sample_df.merge(tle_sup_data[dataset], left_on="GP_ID", right_index=True)
# sample_df = sample_df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
# sample_df.reset_index(drop=True, inplace=True)

# sample_df


In [None]:
# dataset = "train" # variable for lazy loading defaultdict
# print(f"Preparing data for: {f}")
# df = data[f].groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
# df = df.merge(sgp4_data[f], left_index=True, right_index=True)
# df = df.merge(tle_sup_data[f], left_on="GP_ID", right_index=True)
# df = df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
# df.reset_index(drop=True, inplace=True)
# df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t5_data/{f}.pkl")