In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from datetime import datetime
from sgp4.api import Satrec, SatrecArray, WGS72

import concurrent.futures

pd.set_option("display.max_columns", 999)

In [2]:
global dataset
dataset = "test" # variable for lazy loading defaultdict
input_files = [
    "train",
    "test",
    "secret_test",
]

from collections import defaultdict
# using defaultdict to lazy load dataframes.... probably should stay in notebook as shortcut only
data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}.pkl"))
tle_sup_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../tle_sup/{dataset}.pkl"))
sgp4_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_sgp4rv.pkl"))
# satrec_data = defaultdict(lambda: pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{dataset}_satrec.pkl"))

In [3]:
def __jday_convert(x):
    '''
    Algorithm from python-sgp4:

    from sgp4.functions import jday
    jday(x.year, x.month, x.day, x.hour, x.minute, x.second + x.microsecond * 1e-6)
    '''
    jd = (367.0 * x.year
         - 7 * (x.year + ((x.month + 9) // 12.0)) * 0.25 // 1.0
           + 275 * x.month / 9.0 // 1.0
           + x.day
         + 1721013.5)
    fr = (x.second + (x.microsecond * 1e-6) + x.minute * 60.0 + x.hour * 3600.0) / 86400.0;
    return jd, fr

In [4]:
def cyclic_repr(s,v):
    cos = np.cos(np.deg2rad(s * (360/v)))
    sin = np.sin(np.deg2rad(s * (360/v)))
    return cos,sin

In [5]:
def convert_feature_values(df):
    name = df.name
    df = df.sort_values("EPOCH")
    
    df["SUBGROUP"] = 0
    
    doycos, doysin = cyclic_repr(df.EPOCH.dt.dayofyear, 366)
    df["DAY_OF_YEAR_COS"] = doycos
    df["DAY_OF_YEAR_SIN"] = doysin
    
    macos, masin = cyclic_repr(df.MEAN_ANOMALY, 360)
    df["MEAN_ANOMALY_COS"] = macos
    df["MEAN_ANOMALY_SIN"] = masin
    
    icos, isin = cyclic_repr(df.INCLINATION, 360)
    df["INCLINATION_COS"] = icos
    df["INCLINATION_SIN"] = isin
    
    rcos, rsin = cyclic_repr(df.RA_OF_ASC_NODE, 360)
    df["RA_OF_ASC_NODE_COS"] = rcos
    df["RA_OF_ASC_NODE_SIN"] = rsin
    
    df[['EPOCH_JD', 'EPOCH_FR']] = df.EPOCH.apply(__jday_convert).to_list()
    return df

In [22]:
# input is 1 groupby of satellite
def generate_X_y(df):
    idx = df.name

    df = df.reset_index(level=1).drop_duplicates(subset=['EPOCH']).sort_values("EPOCH")
    dfs = []
    for i in range(0,20):
        dfi = pd.concat([df.add_suffix("_1"),df.shift(-i).add_suffix("_2")], axis=1).dropna()
        dfs.append(dfi)
    ddf = pd.concat(dfs).reset_index(drop=True)
    # Reference variables only, DO NOT USE TO TRAIN
    __cols = [
        'NORAD_CAT_ID_1','GP_ID_1','GP_ID_2','EPOCH_1','EPOCH_2',
    ]
    df = ddf[__cols]
    df.columns = ['__'+x for x in __cols]
    
    # X
    x_cols = [
        'EPOCH_JD_1', 'EPOCH_FR_1', 'EPOCH_JD_2', 'EPOCH_FR_2',
        'MEAN_MOTION_DOT_1', 'BSTAR_1', 'INCLINATION_1', 'RA_OF_ASC_NODE_1', 'ECCENTRICITY_1', 'ARG_OF_PERICENTER_1',
        'MEAN_ANOMALY_1', 'MEAN_MOTION_1',
        'MEAN_ANOMALY_COS_1', 'MEAN_ANOMALY_SIN_1',
        'INCLINATION_COS_1', 'INCLINATION_SIN_1',
        'RA_OF_ASC_NODE_COS_1', 'RA_OF_ASC_NODE_SIN_1',
        'SEMIMAJOR_AXIS_1', 'PERIOD_1', 'APOAPSIS_1', 'PERIAPSIS_1', 'RCS_SIZE_1',
        'SAT_RX_1', 'SAT_RY_1', 'SAT_RZ_1', 'SAT_VX_1', 'SAT_VY_1', 'SAT_VZ_1',
        'YEAR_1', 'DAY_OF_YEAR_COS_1', 'DAY_OF_YEAR_SIN_1',
        'SUNSPOTS_1D_1', 'SUNSPOTS_3D_1', 'SUNSPOTS_7D_1',
        'AIR_MONTH_AVG_TEMP_1','WATER_MONTH_AVG_TEMP_1',
    ]
    
    df['X_delta_EPOCH'] = (ddf.EPOCH_2 - ddf.EPOCH_1).astype(int) / 86400000000000 # in days
    df[['X_'+x for x in x_cols]] = ddf[x_cols]

    y_cols = ['SAT_RX', 'SAT_RY', 'SAT_RZ', 'SAT_VX', 'SAT_VY', 'SAT_VZ']
    df[['y_'+y for y in y_cols]] = ddf[[y+'_2' for y in y_cols]]
    df['y_SAT_R'] = np.sqrt((df[['y_SAT_RX', 'y_SAT_RY', 'y_SAT_RZ']]**2).sum(axis=1))
    df['y_SAT_V'] = np.sqrt((df[['y_SAT_VX', 'y_SAT_VY', 'y_SAT_VZ']]**2).sum(axis=1))

    df = df[(df['X_delta_EPOCH'] < 14) & (df['X_delta_EPOCH'] > 0.04)]

    return df

In [23]:
def do_the_thing(df, f):
    df = df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
    df = df.merge(sgp4_data[f], left_index=True, right_index=True)
    df = df.merge(tle_sup_data[f], left_on="GP_ID", right_index=True)
    df = df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
    df.reset_index(drop=True, inplace=True)
    return df

In [30]:
# generate smaller set from training set to test
prefix = "sample3_"

dataset = "train"
train_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 3000)
sample_train_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(train_ids)]
sample_train_df = do_the_thing(sample_train_df, dataset)
sample_train_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t6_data/{prefix}train.pkl")
print(len(sample_train_df))

dataset = "test"
test_ids = np.random.choice(data[dataset].NORAD_CAT_ID.unique(), 200)
sample_test_df = data[dataset][data[dataset].NORAD_CAT_ID.isin(test_ids)]
sample_test_df = do_the_thing(sample_test_df, dataset)
sample_test_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t6_data/{prefix}test.pkl")
print(len(sample_test_df))

  0%|          | 0/2660 [00:00<?, ?it/s]

  0%|          | 0/2660 [00:00<?, ?it/s]

154171374


  0%|          | 0/194 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

11496647


In [9]:
for f in input_files:
    dataset = f # variable for lazy loading defaultdict
    print(f"Preparing data for: {f}")
    df = do_the_thing(data[f], dataset)
    df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t6_data/{f}.pkl")

Preparing data for: train


  0%|          | 0/12288 [00:00<?, ?it/s]

  0%|          | 0/314601 [00:00<?, ?it/s]

In [10]:
# dataset = "test" # set the lazy loader
# sample_df = data[dataset][data[dataset].NORAD_CAT_ID.isin([20885, 7128])]#, 4756
# sample_df = sample_df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
# # sample_df = sample_df.merge(sgp4_data[dataset], left_on="GP_ID", right_index=True)
# sample_df = sample_df.merge(tle_sup_data[dataset], left_on="GP_ID", right_index=True)
# sample_df = sample_df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
# sample_df.reset_index(drop=True, inplace=True)

# sample_df


In [11]:
# dataset = "train" # variable for lazy loading defaultdict
# print(f"Preparing data for: {f}")
# df = data[f].groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
# df = df.merge(sgp4_data[f], left_index=True, right_index=True)
# df = df.merge(tle_sup_data[f], left_on="GP_ID", right_index=True)
# df = df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
# df.reset_index(drop=True, inplace=True)
# df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t6_data/{f}.pkl")