In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from datetime import datetime
from sgp4.api import Satrec, SatrecArray, WGS72

import concurrent.futures

pd.set_option("display.max_columns", 999)

In [2]:
global dataset
dataset = "test" # variable for lazy loading defaultdict

from collections import defaultdict
# using defaultdict to lazy load dataframes.... probably should stay in notebook as shortcut only
data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}.pkl"))
tle_sup_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../tle_sup/{dataset}.pkl"))
sgp4_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_sgp4rv.pkl"))
# satrec_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_satrec.pkl"))

In [3]:
def __jday_convert(x):
    '''
    Algorithm from python-sgp4:

    from sgp4.functions import jday
    jday(x.year, x.month, x.day, x.hour, x.minute, x.second + x.microsecond * 1e-6)
    '''
    jd = (367.0 * x.year
         - 7 * (x.year + ((x.month + 9) // 12.0)) * 0.25 // 1.0
           + 275 * x.month / 9.0 // 1.0
           + x.day
         + 1721013.5)
    fr = (x.second + (x.microsecond * 1e-6) + x.minute * 60.0 + x.hour * 3600.0) / 86400.0;
    return jd, fr

In [4]:
def get_satrec(bst, ecc, aop, inc, mea, mem, raa, mmdot=0, mmddot=0, norad=0, epoch=None):
    r = datetime.strptime('12/31/1949 00:00:00', '%m/%d/%Y %H:%M:%S')
    epoch_days = (epoch-r)/np.timedelta64(1, 'D')
    s = Satrec()
    s.sgp4init(
         WGS72,           # gravity model
         'i',             # 'a' = old AFSPC mode, 'i' = improved mode
         norad,               # satnum: Satellite number
         epoch_days,       # epoch: days since 1949 December 31 00:00 UT
         bst,      # bstar: drag coefficient (/earth radii)
         mmdot,   # ndot (NOT USED): ballistic coefficient (revs/day)
         mmddot,             # nddot (NOT USED): mean motion 2nd derivative (revs/day^3)
         ecc,       # ecco: eccentricity
         aop*np.pi/180, # argpo: argument of perigee (radians)
         inc*np.pi/180, # inclo: inclination (radians)
         mea*np.pi/180, # mo: mean anomaly (radians)
         mem*np.pi/(4*180), # no_kozai: mean motion (radians/minute)
         raa*np.pi/180, # nodeo: right ascension of ascending node (radians)
    )
    return s

def add_sgp4_propagation(df):
    satrec = get_satrec(bst=df.iloc[0]["X_BSTAR_1"],
                        ecc=df.iloc[0]["X_ECCENTRICITY_1"],
                        aop=df.iloc[0]["X_ARG_OF_PERICENTER_1"],
                        inc=df.iloc[0]["X_INCLINATION_1"],
                        mea=df.iloc[0]["X_MEAN_ANOMALY_1"],
                        mem=df.iloc[0]["X_MEAN_MOTION_1"],
                        raa=df.iloc[0]["X_RA_OF_ASC_NODE_1"],
                        epoch=df.iloc[0]["__EPOCH_1"])
#     satrec = satrec_data[dataset].loc[satrec_data[dataset].index == df.name,"SATREC_OBJ"].values[0] # this isn't any faster
    jd = df.X_EPOCH_JD_2.values
    fr = df.X_EPOCH_FR_2.values
    e,r,v = satrec.sgp4_array(jd,fr) # these are propagated
    df[['X_SGP4_SAT_RX', 'X_SGP4_SAT_RY', 'X_SGP4_SAT_RZ']] = r
    df[['X_SGP4_SAT_VX', 'X_SGP4_SAT_VY', 'X_SGP4_SAT_VZ']] = v
    return df

In [5]:
def cyclic_repr(s,v):
    cos = np.cos(np.deg2rad(s * (360/v)))
    sin = np.sin(np.deg2rad(s * (360/v)))
    return cos,sin

In [6]:
def convert_feature_values(df):
    name = df.name
    df = df.sort_values("EPOCH")
    
#     # convert ARG_OF_PERICENTER, RA_OF_ASC_NODE, and MEAN_ANOMALY to non-cyclic version
#     df["ARG_OF_PERICENTER_ADJUSTED"] = np.cumsum(np.around(df.ARG_OF_PERICENTER.diff().fillna(0) / -360))*360 + df.ARG_OF_PERICENTER
#     df["RA_OF_ASC_NODE_ADJUSTED"] = np.cumsum(np.around(df.RA_OF_ASC_NODE.diff().fillna(0) / -360))*360 + df.RA_OF_ASC_NODE
    
#     # according to 18 SPCS there was only 1 such case BUT ITS NOT TRUE there are like 70+
#     # this is because for REV_AT_EPOCH = 100,000, it's recorded as 10,000 instead of 0
#     # this doesn't handle the case for multiple ground stations reporting though, if the previous is different....
#     # would it be better to just remove this as an outlier just to be safe?
#     # 90k +- 20 max offset based on MEAN_MOTION maximum from earlier steps
#     df.loc[(df.REV_AT_EPOCH==10000) & df.REV_AT_EPOCH.diff().between(-89999,-89940),'REV_AT_EPOCH'] = 0

#     # combine REV_AT_EPOCH and MEAN_ANOMALY for a non-cyclic representation
#     adjusted_rev = df.REV_AT_EPOCH + np.cumsum(np.around(df.REV_AT_EPOCH.diff().fillna(0) / -100000)) * 100000
#     df["REV_MEAN_ANOMALY_COMBINED"] = adjusted_rev * 360 + df.MEAN_ANOMALY
    
#     # this is to handle the REV_AT_EPOCH problem inconsistency problem
#     # otherwise the REV_MEAN_ANOMALY_COMBINED difference may be incorrect
#     # bfill because we may start at non-zero due to previous data removal bit
#     a = np.round((adjusted_rev.diff().fillna(method='bfill')/300)).fillna(0)
#     df["SUBGROUP"] = np.cumsum(a).astype(int)
    df["SUBGROUP"] = 0
    
    doycos, doysin = cyclic_repr(df.EPOCH.dt.dayofyear, 366)
    df["DAY_OF_YEAR_COS"] = doycos
    df["DAY_OF_YEAR_SIN"] = doysin
    
    macos, masin = cyclic_repr(df.MEAN_ANOMALY, 360)
    df["MEAN_ANOMALY_COS"] = macos
    df["MEAN_ANOMALY_SIN"] = masin
    
    icos, isin = cyclic_repr(df.INCLINATION, 360)
    df["INCLINATION_COS"] = icos
    df["INCLINATION_SIN"] = isin
    
    rcos, rsin = cyclic_repr(df.RA_OF_ASC_NODE, 360)
    df["RA_OF_ASC_NODE_COS"] = rcos
    df["RA_OF_ASC_NODE_SIN"] = rsin
    
    df[['EPOCH_JD', 'EPOCH_FR']] = df.EPOCH.apply(__jday_convert).to_list()
    return df

In [7]:
# input is 1 groupby of satellite
def generate_X_y(df):
    idx = df.name

    df = df.reset_index(level=1).drop_duplicates(subset=['EPOCH']).sort_values("EPOCH")
    dfs = []
    for i in range(0,20):
        dfi = pd.concat([df.add_suffix("_1"),df.shift(-i).add_suffix("_2")], axis=1).dropna()
        dfs.append(dfi)
    ddf = pd.concat(dfs).reset_index(drop=True)
    # Reference variables only, DO NOT USE TO TRAIN
    __cols = [
        'NORAD_CAT_ID_1','GP_ID_1','GP_ID_2','EPOCH_1','EPOCH_2',
    ]
    df = ddf[__cols]
    df.columns = ['__'+x for x in __cols]
    
    # X
    x_cols = [
        'EPOCH_JD_1', 'EPOCH_FR_1', 'EPOCH_JD_2', 'EPOCH_FR_2',
        'MEAN_MOTION_DOT_1', 'BSTAR_1', 'INCLINATION_1', 'RA_OF_ASC_NODE_1', 'ECCENTRICITY_1', 'ARG_OF_PERICENTER_1',
        'MEAN_ANOMALY_1', 'MEAN_MOTION_1',
        'MEAN_ANOMALY_COS_1', 'MEAN_ANOMALY_SIN_1',
        'INCLINATION_COS_1', 'INCLINATION_SIN_1',
        'RA_OF_ASC_NODE_COS_1', 'RA_OF_ASC_NODE_SIN_1',
        'SEMIMAJOR_AXIS_1', 'PERIOD_1', 'APOAPSIS_1', 'PERIAPSIS_1', 'RCS_SIZE_1',
        'YEAR_1', 'DAY_OF_YEAR_COS_1', 'DAY_OF_YEAR_SIN_1',
        'SUNSPOTS_1D_1', 'SUNSPOTS_3D_1', 'SUNSPOTS_7D_1',
        'AIR_MONTH_AVG_TEMP_1','WATER_MONTH_AVG_TEMP_1',
        'SAT_RX_1', 'SAT_RY_1', 'SAT_RZ_1', 'SAT_VX_1', 'SAT_VY_1', 'SAT_VZ_1',
    ]
    
    df['X_delta_EPOCH'] = (ddf.EPOCH_2 - ddf.EPOCH_1).astype(int) / 86400000000000 # in days
    df[['X_'+x for x in x_cols]] = ddf[x_cols]
    df = df[(df['X_delta_EPOCH'] < 14) & (df['X_delta_EPOCH'] > 0.04)]
    df = df.groupby(by=["__GP_ID_1"]).apply(add_sgp4_propagation)

    y_cols = ['SAT_RX', 'SAT_RY', 'SAT_RZ', 'SAT_VX', 'SAT_VY', 'SAT_VZ'] # these are ground truths
    df[['y_'+y for y in y_cols]] = ddf[[y+'_2' for y in y_cols]]


    return df

In [8]:
def do_the_thing(df, f):
    df = df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
    df = df.merge(sgp4_data[f], left_index=True, right_index=True)
    df = df.merge(tle_sup_data[f], left_on="GP_ID", right_index=True)
    df = df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
    df.reset_index(drop=True, inplace=True)
    return df

In [12]:
# generate smaller set from training set to test
# prefix = "test500_"
prefix = "sample_2000_"

dataset = "train"
train_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 2000)
sample_train_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(train_ids)]
sample_train_df = do_the_thing(sample_train_df, dataset)
sample_train_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t7_data/{prefix}train.pkl")
print(len(sample_train_df))

dataset = "test"
test_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 100)
sample_test_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(test_ids)]
sample_test_df = do_the_thing(sample_test_df, dataset)
sample_test_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t7_data/{prefix}test.pkl")
print(len(sample_test_df))

  0%|          | 0/1825 [00:00<?, ?it/s]

  0%|          | 0/1825 [00:00<?, ?it/s]

113628308


  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

3508664


In [10]:
# # Save all

# input_files = [
#     "train",
#     "test",
#     "secret_test",
# ]

# for f in input_files:
#     dataset = f # variable for lazy loading defaultdict
#     print(f"Preparing data for: {f}")
#     df = do_the_thing(data[f], dataset)
#     df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t7_data/{f}.pkl")