In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from datetime import datetime
from sgp4.api import Satrec, SatrecArray, WGS72

In [2]:
def __jday_convert(x):
    '''
    Split a timestamp into a Julian date (whole-day part) and a day fraction.

    Same arithmetic as python-sgp4's `jday`, i.e. equivalent to:

    from sgp4.functions import jday
    jday(x.year, x.month, x.day, x.hour, x.minute, x.second + x.microsecond * 1e-6)

    x: any object exposing year, month, day, hour, minute, second and
       microsecond attributes (e.g. a datetime).
    Returns the tuple (jd, fr); jd + fr is the full Julian date.
    '''
    # calendar date -> Julian date; `// 1.0` floors each term while staying in floats
    month_carry = (x.month + 9) // 12.0
    year_term = 7 * (x.year + month_carry) * 0.25 // 1.0
    month_term = 275 * x.month / 9.0 // 1.0
    jd = 367.0 * x.year - year_term + month_term + x.day + 1721013.5

    # time of day as a fraction of the 86400 seconds in a day
    seconds_into_day = x.second + (x.microsecond * 1e-6) + x.minute * 60.0 + x.hour * 3600.0
    fr = seconds_into_day / 86400.0
    return jd, fr

In [3]:
def cyclic_repr(s,v):
    '''
    Encode a periodic quantity as a point on the unit circle.

    s: value(s) to encode (a scalar or anything numpy arithmetic accepts)
    v: length of one full cycle, in the same units as s
    Returns (cosine, sine) of the angle s * (360 / v) degrees.
    '''
    angle_rad = np.deg2rad(s * (360/v))
    return np.cos(angle_rad), np.sin(angle_rad)

In [23]:
def _cumulative_wrap_offset(values, period):
    '''
    Offset that undoes wrap-arounds in a Series of cyclic values.

    Whenever consecutive entries drop by roughly one `period` (the value wrapped
    past its maximum) the offset grows by `period`; a jump up by roughly one
    `period` shrinks it again. Adding the offset to `values` gives a series
    without the wrap-around discontinuities.
    '''
    wraps = np.around(values.diff().fillna(0) / -period)
    return np.cumsum(wraps) * period


def convert_feature_values(df):
    '''
    Add non-cyclic orbital-element columns, a SUBGROUP id and day-of-year
    features to the records of one group (the callers group by NORAD_CAT_ID).

    df: DataFrame with at least EPOCH (datetime-like, `.dt` is used),
        ARG_OF_PERICENTER, RA_OF_ASC_NODE, MEAN_ANOMALY and REV_AT_EPOCH.

    Returns a copy sorted by EPOCH (the input frame is not modified) in which
    REV_AT_EPOCH may have been corrected (see below) and these columns are added:
      ARG_OF_PERICENTER_ADJUSTED, RA_OF_ASC_NODE_ADJUSTED
          the angles with their 360-degree wrap-arounds removed
      REV_MEAN_ANOMALY_COMBINED
          unwrapped revolution count * 360 + MEAN_ANOMALY
      SUBGROUP
          integer id that changes where the revolution count jumps abruptly
      DAY_OF_YEAR_COS, DAY_OF_YEAR_SIN
          day of year encoded on a 366-day circle
    '''
    # sort_values returns a new frame, so the assignments below never touch the caller's data
    df = df.sort_values("EPOCH")

    # convert ARG_OF_PERICENTER and RA_OF_ASC_NODE to non-cyclic versions
    df["ARG_OF_PERICENTER_ADJUSTED"] = _cumulative_wrap_offset(df.ARG_OF_PERICENTER, 360) + df.ARG_OF_PERICENTER
    df["RA_OF_ASC_NODE_ADJUSTED"] = _cumulative_wrap_offset(df.RA_OF_ASC_NODE, 360) + df.RA_OF_ASC_NODE

    # For REV_AT_EPOCH = 100,000 the data records 10,000 instead of 0; reset those rows to 0.
    # A row qualifies only if it sits ~90k below its predecessor (90k +- 20, the max offset
    # based on the MEAN_MOTION maximum from earlier steps).
    # This doesn't handle the case of multiple ground stations reporting, where the previous
    # value may differ -- it may be safer to drop such rows as outliers instead.
    misrecorded_rollover = (df.REV_AT_EPOCH == 10000) & df.REV_AT_EPOCH.diff().between(-90020, -89980)
    df.loc[misrecorded_rollover, 'REV_AT_EPOCH'] = 0

    # combine REV_AT_EPOCH (which wraps at 100,000) and MEAN_ANOMALY for a non-cyclic representation
    adjusted_rev = df.REV_AT_EPOCH + _cumulative_wrap_offset(df.REV_AT_EPOCH, 100000)
    df["REV_MEAN_ANOMALY_COMBINED"] = adjusted_rev * 360 + df.MEAN_ANOMALY

    # Handle REV_AT_EPOCH inconsistencies: rows separated by a jump in the revolution count
    # (on the scale of 300 revolutions) get different SUBGROUP ids, otherwise differences of
    # REV_MEAN_ANOMALY_COMBINED taken across the jump may be incorrect.
    # bfill because the first diff is NaN and we may start at non-zero due to earlier data removal;
    # the trailing fillna(0) covers a single-row group, where nothing can be back-filled.
    rev_jumps = np.round(adjusted_rev.diff().bfill() / 300).fillna(0)
    df["SUBGROUP"] = np.cumsum(rev_jumps).astype(int)

    doycos, doysin = cyclic_repr(df.EPOCH.dt.dayofyear, 366)
    df["DAY_OF_YEAR_COS"] = doycos
    df["DAY_OF_YEAR_SIN"] = doysin

    # Disabled experiment: synodic / sidereal month features (periods in nanoseconds).
    # NOTE(review): the original draft passed `synodic` to the sidereal encoding; `sidereal`
    # is presumably what was meant -- confirm before re-enabling.
#     synodic = df.EPOCH.astype(int) % 2551442976000000
#     sidereal = df.EPOCH.astype(int) % 2360591510400000

#     syn_m_cos, syn_m_sin = cyclic_repr(synodic, 2551442976000000)
#     df["SYNODIC_MONTH_COS"] = syn_m_cos
#     df["SYNODIC_MONTH_SIN"] = syn_m_sin

#     sr_m_cos, sr_m_sin = cyclic_repr(sidereal, 2360591510400000)
#     df["SIDEREAL_MONTH_COS"] = sr_m_cos
#     df["SIDEREAL_MONTH_SIN"] = sr_m_sin

    return df

In [33]:
# Input is a single groupby group: the rows of one (NORAD_CAT_ID, SUBGROUP) combination
def generate_X_y(df, shifts=(1, 2, 4, 6)):
    '''
    Build model rows by pairing every record of a group with later records of
    the same group.

    df: one group's DataFrame. It must contain EPOCH, NORAD_CAT_ID, GP_ID, every
        column listed in `x_cols` below, and the REV_MEAN_ANOMALY_COMBINED,
        ARG_OF_PERICENTER_ADJUSTED and RA_OF_ASC_NODE_ADJUSTED columns.
    shifts: how many records ahead the partner ("_b") record is taken from; one
        set of pairs is produced per entry. Defaults to a reduced set; use
        range(1, 11) for the fuller one.

    Returns a DataFrame with
      __*  reference columns (ids and epochs of both records) -- NOT for training
      X_*  features taken from the earlier record, plus X_delta_EPOCH, the time
           between the two records in days
      y_*  targets: the later record's unwrapped value minus the earlier unwrapped
           value plus the earlier raw value, divided by 360
    Only pairs 0.1 to 5 days apart whose revolution rate is within 10% of
    X_MEAN_MOTION are kept. The index is inherited from the intermediate frames
    and contains repeated labels; the callers in this notebook reset it.
    '''
    # de-duplicate on EPOCH and order chronologically so shift(-i) means "i records later"
    records = df.reset_index().drop_duplicates(subset=['EPOCH']).sort_values("EPOCH")

    # Put each record side by side with the record `i` steps later (suffix "_b").
    # dropna() removes rows without a partner -- and any row with a missing value.
    pairs = pd.concat(
        [pd.concat([records, records.shift(-i).add_suffix("_b")], axis=1).dropna() for i in shifts]
    )

    # Reference variables only, DO NOT USE TO TRAIN
    # .copy(): the assignments below must act on an independent frame, not a slice of `pairs`
    out = pairs[['NORAD_CAT_ID','GP_ID','GP_ID_b','EPOCH','EPOCH_b']].copy()
    out.columns = ['__NORAD_CAT_ID','__GP_ID_1','__GP_ID_2','__EPOCH_1','__EPOCH_2']
    out['__GP_ID_2'] = out['__GP_ID_2'].astype(int)  # shift() turned the ids into floats

    # Ignore these columns completely
#     'MONTH', 'DAY', # month and day should be well-represented as day_of_year
#     'REV_AT_EPOCH' # this one doesn't matter if we are predicting cartesian

    # X
    # dividing by a Timedelta gives days regardless of the stored datetime resolution
    out['X_delta_EPOCH'] = (pairs.EPOCH_b - pairs.EPOCH) / pd.Timedelta(days=1)

    # reshape(-1, 2) keeps this working when a group yields no pairs at all
    jd_fr = np.array([__jday_convert(ts) for ts in pairs.EPOCH], dtype=float).reshape(-1, 2)
    out['X_EPOCH_JD'] = jd_fr[:, 0]
    out['X_EPOCH_FR'] = jd_fr[:, 1]

    x_cols = ['MEAN_MOTION_DOT', 'BSTAR', 'INCLINATION', 'RA_OF_ASC_NODE', 'ECCENTRICITY', 'ARG_OF_PERICENTER',
              'MEAN_ANOMALY', 'MEAN_MOTION',
              'YEAR', 'DAY_OF_YEAR_COS', 'DAY_OF_YEAR_SIN',
#               'SYNODIC_MONTH_COS', 'SYNODIC_MONTH_SIN', 'SIDEREAL_MONTH_COS', 'SIDEREAL_MONTH_SIN',
              'SUNSPOTS_1D', 'SUNSPOTS_3D', 'SUNSPOTS_7D',
              'AIR_MONTH_AVG_TEMP','WATER_MONTH_AVG_TEMP',
              'SAT_RX', 'SAT_RY', 'SAT_RZ', 'SAT_VX', 'SAT_VY', 'SAT_VZ',
             ]

    for col in x_cols:
        out['X_' + col] = pairs[col]

    # y
    out['y_REV_MA_REG'] = ((pairs.REV_MEAN_ANOMALY_COMBINED_b - pairs.REV_MEAN_ANOMALY_COMBINED) + pairs.MEAN_ANOMALY) / 360
    out['y_ARG_OF_PERICENTER_REG'] = (pairs.ARG_OF_PERICENTER_ADJUSTED_b - pairs.ARG_OF_PERICENTER_ADJUSTED + pairs.ARG_OF_PERICENTER) / 360
    out['y_RA_OF_ASC_NODE_REG'] = (pairs.RA_OF_ASC_NODE_ADJUSTED_b - pairs.RA_OF_ASC_NODE_ADJUSTED + pairs.RA_OF_ASC_NODE) / 360
#     y_cols = ['SAT_RX', 'SAT_RY', 'SAT_RZ', 'SAT_VX', 'SAT_VY', 'SAT_VZ']
#     df[['y_'+y for y in y_cols]] = ddf[y_cols]

    # not sure if this day limiting thing makes sense....
    # Keep pairs 0.1-5 days apart whose revolutions per day stay within +-10% of MEAN_MOTION
    # (presumably MEAN_MOTION is in revolutions per day -- TODO confirm).
    days_apart = out['X_delta_EPOCH']
    revs_per_day = out['y_REV_MA_REG'] / days_apart
    keep = (
        (days_apart < 5)
        & (days_apart > 0.1)
        & revs_per_day.between(out['X_MEAN_MOTION'] * 0.9, out['X_MEAN_MOTION'] * 1.1)
    )
    return out[keep]

In [36]:
%%time

# Generate actual data

input_files = [
    "test",
    "train",
    "secret_test",
]

for f in input_files:
    print(f"Preparing data for: {f}")
    df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{f}.pkl")
    converted_df = df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)
    sgp4_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../3_min/{f}_sgp4rv.pkl")
    converted_df = converted_df.merge(sgp4_df, left_index=True, right_index=True)
    processed_df = converted_df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
    processed_df.reset_index(drop=True, inplace=True)
    processed_df.to_pickle(f"{os.environ['GP_HIST_PATH']}/../t2_data/{f}.pkl")

Preparing data for: test


  0%|          | 0/2702 [00:00<?, ?it/s]

  0%|          | 0/65145 [00:00<?, ?it/s]

Preparing data for: train


  0%|          | 0/12288 [00:00<?, ?it/s]

  0%|          | 0/314602 [00:00<?, ?it/s]

Preparing data for: secret_test


  0%|          | 0/2711 [00:00<?, ?it/s]

  0%|          | 0/61486 [00:00<?, ?it/s]

CPU times: user 5h 15min 40s, sys: 6min 49s, total: 5h 22min 30s
Wall time: 5h 27min 30s


In [34]:
# %%time

# Generate a smaller train/test sample from the training set, for quick experiments
#
# NOTE(review): pd.read_pickle can execute arbitrary code while loading -- only point
# GP_HIST_PATH at pickle files you produced yourself.

SAMPLE_SEED = 42      # fixed so the same satellites are drawn on every run
N_TRAIN_IDS = 200
N_TEST_IDS = 50

gp_hist_path = os.environ['GP_HIST_PATH']
min_data_dir = os.path.join(gp_hist_path, "..", "3_min")
t2_data_dir = os.path.join(gp_hist_path, "..", "t2_data")
os.makedirs(t2_data_dir, exist_ok=True)

train_df = pd.read_pickle(os.path.join(min_data_dir, "train.pkl"))
converted_df = train_df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(convert_feature_values)


# narrow down using certain inclination range only
# sample_df = converted_df[converted_df.INCLINATION.between(65,67)]
sample_df = converted_df

sgp4_df = pd.read_pickle(os.path.join(min_data_dir, "train_sgp4rv.pkl"))
sample_df = sample_df.merge(sgp4_df, left_index=True, right_index=True)

# narrow down further with random norad IDs
# replace=False: draw distinct satellites (the default samples with replacement, which can
# return the same id more than once and so yield fewer satellites than requested)
sample_rng = np.random.default_rng(SAMPLE_SEED)
all_ids = np.sort(sample_df.NORAD_CAT_ID.unique())
train_ids = sample_rng.choice(all_ids, size=min(N_TRAIN_IDS, len(all_ids)), replace=False)
# setdiff1d returns a sorted array, so the test draw does not depend on set iteration order
remaining_ids = np.setdiff1d(all_ids, train_ids)
test_ids = sample_rng.choice(remaining_ids, size=min(N_TEST_IDS, len(remaining_ids)), replace=False)

sample_train_df = sample_df[sample_df.NORAD_CAT_ID.isin(train_ids)]
sample_test_df = sample_df[sample_df.NORAD_CAT_ID.isin(test_ids)]

processed_sample_train_df = sample_train_df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
processed_sample_train_df = processed_sample_train_df.reset_index(drop=True)

processed_sample_test_df = sample_test_df.groupby(["NORAD_CAT_ID","SUBGROUP"], as_index=False).progress_apply(generate_X_y)
processed_sample_test_df = processed_sample_test_df.reset_index(drop=True)

# save samples
processed_sample_train_df.to_pickle(os.path.join(t2_data_dir, "sample_train.pkl"))
processed_sample_test_df.to_pickle(os.path.join(t2_data_dir, "sample_test.pkl"))

print(len(processed_sample_train_df))
print(len(processed_sample_test_df))

  0%|          | 0/4465 [00:00<?, ?it/s]

  0%|          | 0/762 [00:00<?, ?it/s]

2260999
561651
