# Feature Engineering

Convert the data into more ML-friendly formats.  The conversion is reversible, so that model output can later be converted back to the TLE-style format.

This conversion needs to be performed on all datasets.

Features:

| Column        | Desc  |
| :------------- | :------|
| `NORAD_CAT_ID` | Satellite identifier, not used in training, no action needed |
| `OBJECT_TYPE` | Satellite meta data, not used in training, no action needed (only in `full` version) |
| `TLE_LINE1` | Actual TLE line 1, not used in training, no action needed (only in `full` version) |
| `TLE_LINE2` | Actual TLE line 2, not used in training, no action needed (only in `full` version) |
| `MEAN_MOTION_DOT` | Some sort of scaling may be needed |
| `MEAN_MOTION_DDOT` | Some sort of scaling may be needed |
| `BSTAR` | Some sort of scaling may be needed |
| `INCLINATION` | Convert cyclic 0 .. 180 |
| `RA_OF_ASC_NODE` | Convert cyclic 0 .. 360 |
| `ECCENTRICITY` | Some scaling needed, 0 .. 0.25 |
| `ARG_OF_PERICENTER` | Convert cyclic 0 .. 360 |
| `MEAN_ANOMALY` | Convert cyclic 0 .. 360, this loops multiple times per day and most cycles are unobserved in the data |
| `MEAN_MOTION` | > 11.25 |
| `REV_AT_EPOCH` | 0-99999, but the data is sometimes inconsistent: there appears to be an offset in this value between different ground stations (a guess) |
| `EPOCH` | Time, while no scaling is needed, we will need to use this for constructing `X` and `y` |
| `GP_ID` | Unique identifier for the TLE entry, not used in training, no action needed |

While `MEAN_ANOMALY` is represented in degrees, many cycles are missing because the data is so sparse, so a combination of `REV_AT_EPOCH` + `MEAN_ANOMALY` may be a better feature than a sin/cos representation.  The other conversions can be done without grouping, but because `REV_AT_EPOCH` rolls over at 100k and is inconsistent between ground stations, we might need to handle it per satellite.



Datasets:

```
2_min/train.pkl
0_min/test.pkl
0_min/secret_test.pkl
```

Converting `min` versions only for now to save some memory and disk space.  Can be replaced with `full` if needed.

In [2]:
import pandas as pd
import numpy as np
import os

from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
version = "min" # dataset variant to load: "min" or "full"

In [4]:
# (directory prefix, file name) pairs; each file lives under
# $GP_HIST_PATH/../<prefix>_<version>/<file name>.
input_files = [
    (2, "train.pkl"),
    (0, "test.pkl"),
    (0, "secret_test.pkl")
]

# Print the resolved path of every dataset (raises KeyError if the
# GP_HIST_PATH environment variable is not set).
for n,f in input_files:
    print(f"{os.environ['GP_HIST_PATH']}/../{n}_{version}/{f}")

# Only the training set is loaded here; the two test sets are left commented out.
train_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../2_{version}/train.pkl")
# test_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../0_{version}/test.pkl")
# secret_test_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../0_{version}/secret_test.pkl")

/mistorage/mads/data/gp_history/../2_min/train.pkl
/mistorage/mads/data/gp_history/../0_min/test.pkl
/mistorage/mads/data/gp_history/../0_min/secret_test.pkl


In [5]:
# Reloads the training pickle from the same path as the read in the previous cell.
train_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../2_{version}/train.pkl")


In [7]:
# Draw 20 random rows to try the conversion on; no random_state is passed,
# so the selection differs between runs.
sample_df = train_df.sample(20)

In [28]:
def convert_feature_values(df):
    """Convert TLE orbital elements into more ML-friendly features.

    The following columns are replaced:

    * ``MEAN_ANOMALY`` and ``REV_AT_EPOCH`` are merged into
      ``REV_MEAN_ANOMALY`` (``MEAN_ANOMALY + REV_AT_EPOCH * 360``).
    * ``INCLINATION``, ``RA_OF_ASC_NODE`` and ``ARG_OF_PERICENTER`` are each
      replaced by a ``*_X`` (cosine) and ``*_Y`` (sine) pair.

    All other columns are passed through untouched and keep their order; the
    new columns are appended after them.

    Args:
        df: DataFrame containing at least the five columns listed above,
            with angles expressed in degrees.

    Returns:
        A new DataFrame with the converted columns.  ``df`` itself is not
        modified.
    """
    # Force float64: REV_AT_EPOCH * 360 reaches ~3.6e7, which is more than a
    # float32 (or a small integer dtype) can hold without losing the
    # fractional mean anomaly.
    rev_mean_anomaly = (
        df.MEAN_ANOMALY.astype('float64')
        + df.REV_AT_EPOCH.astype('float64') * 360
    )

    # Inclination only spans 0-180, so it is doubled to use the full circle.
    inclination = np.deg2rad(df.INCLINATION * 2)
    ra_of_asc_node = np.deg2rad(df.RA_OF_ASC_NODE)
    arg_of_pericenter = np.deg2rad(df.ARG_OF_PERICENTER)

    # assign() works on a copy, so the caller's frame is left untouched.
    converted = df.assign(
        REV_MEAN_ANOMALY=rev_mean_anomaly,
        INCLINATION_X=np.cos(inclination),
        INCLINATION_Y=np.sin(inclination),
        RA_OF_ASC_NODE_X=np.cos(ra_of_asc_node),
        RA_OF_ASC_NODE_Y=np.sin(ra_of_asc_node),
        ARG_OF_PERICENTER_X=np.cos(arg_of_pericenter),
        ARG_OF_PERICENTER_Y=np.sin(arg_of_pericenter),
    )

    return converted.drop(columns=[
        'MEAN_ANOMALY',
        'REV_AT_EPOCH',
        'INCLINATION',
        'RA_OF_ASC_NODE',
        'ARG_OF_PERICENTER',
    ])



def _xy_to_degrees(x, y):
    """Return the angle of the points (x, y) in degrees, wrapped into [0, 360)."""
    # arctan2 yields (-180, 180]; the modulo moves negative angles to 180-360.
    degrees = np.rad2deg(np.arctan2(y, x)) % 360.0
    # A tiny negative angle can round up to exactly 360.0 after the modulo.
    return degrees.mask(degrees >= 360.0, 0.0)


def revert_feature_values(df):
    """Convert ML-friendly features back into TLE-style orbital elements.

    This is the inverse of ``convert_feature_values``:

    * ``REV_MEAN_ANOMALY`` is split into ``REV_AT_EPOCH`` (whole revolutions)
      and ``MEAN_ANOMALY`` (remainder in degrees).
    * Each ``*_X`` / ``*_Y`` pair is turned back into an angle in degrees.
      ``RA_OF_ASC_NODE`` and ``ARG_OF_PERICENTER`` are returned in [0, 360),
      ``INCLINATION`` in [0, 180).  An inclination of exactly 180 cannot be
      told apart from 0 in the doubled-angle encoding.

    Args:
        df: DataFrame containing ``REV_MEAN_ANOMALY`` and the three
            ``*_X`` / ``*_Y`` column pairs.

    Returns:
        A new DataFrame with the TLE-style columns restored.  Known TLE
        columns come first, in the order of the feature table at the top of
        this notebook; any other columns follow in their existing order.
        ``df`` itself is not modified.
    """
    # NOTE(review): for the `full` version the position of OBJECT_TYPE /
    # TLE_LINE1 / TLE_LINE2 is taken from the feature table - confirm it
    # matches the actual pickle layout.
    tle_column_order = (
        'NORAD_CAT_ID',
        'OBJECT_TYPE',
        'TLE_LINE1',
        'TLE_LINE2',
        'MEAN_MOTION_DOT',
        'MEAN_MOTION_DDOT',
        'BSTAR',
        'INCLINATION',
        'RA_OF_ASC_NODE',
        'ECCENTRICITY',
        'ARG_OF_PERICENTER',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
        'REV_AT_EPOCH',
        'EPOCH',
        'GP_ID',
    )

    rev_mean_anomaly = df.REV_MEAN_ANOMALY

    # assign() works on a copy, so the caller's frame is left untouched.
    reverted = df.assign(
        REV_AT_EPOCH=(rev_mean_anomaly // 360).astype(int),
        MEAN_ANOMALY=rev_mean_anomaly % 360,
        # The forward conversion doubled the inclination, so halve it again.
        INCLINATION=_xy_to_degrees(df.INCLINATION_X, df.INCLINATION_Y) / 2,
        RA_OF_ASC_NODE=_xy_to_degrees(df.RA_OF_ASC_NODE_X, df.RA_OF_ASC_NODE_Y),
        ARG_OF_PERICENTER=_xy_to_degrees(
            df.ARG_OF_PERICENTER_X, df.ARG_OF_PERICENTER_Y
        ),
    ).drop(columns=[
        'REV_MEAN_ANOMALY',
        'INCLINATION_X',
        'INCLINATION_Y',
        'RA_OF_ASC_NODE_X',
        'RA_OF_ASC_NODE_Y',
        'ARG_OF_PERICENTER_X',
        'ARG_OF_PERICENTER_Y',
    ])

    # Restore the TLE column order; unknown columns are kept at the end.
    known = [col for col in tle_column_order if col in reverted.columns]
    extras = [col for col in reverted.columns if col not in tle_column_order]
    return reverted[known + extras]
    
# Run the forward conversion on the sample and show the result.
# NOTE(review): display is not imported here - presumably the IPython/Jupyter built-in.
converted = convert_feature_values(sample_df)
display(converted)
# Round-trip check (currently disabled):
# reverted = revert_feature_values(converted)
# display(reverted)

Unnamed: 0,NORAD_CAT_ID,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,ECCENTRICITY,MEAN_MOTION,EPOCH,GP_ID,REV_MEAN_ANOMALY,INCLINATION_X,INCLINATION_Y,RA_OF_ASC_NODE_X,RA_OF_ASC_NODE_Y,ARG_OF_PERICENTER_X,ARG_OF_PERICENTER_Y
46163652,26020,-1.16e-05,0.0,-0.006112,0.011252,12.656302,2018-07-15 22:28:14.679264,121826793,8283946.0,-0.968038,0.250802,-0.988801,0.149239,0.66696,0.745093
45243950,21936,2.4e-07,0.0,0.000425,0.026176,12.009752,2018-04-05 16:16:54.084000,119143305,5373121.0,-0.849176,0.528109,0.083415,0.996515,0.993296,-0.115603
54185163,21547,2.67e-06,0.0,0.00037,0.025412,13.544405,2021-01-25 12:51:00.159264,170326337,16721580.0,-0.940438,-0.339966,-0.600462,-0.799653,-0.107451,0.99421
19459729,21690,0.00011447,0.0,0.003795,0.007679,14.362972,1991-05-31 01:38:55.955616,49703119,930412.7,-0.952694,-0.30393,-0.989232,-0.146354,-0.991778,-0.127967
43094520,16266,8.5e-07,0.0,0.000398,0.00011,12.6565,2017-08-04 21:54:02.553120,113025327,16837560.0,-0.966863,0.255297,-0.940533,0.339702,0.496077,0.868278
38797839,33875,6.062e-05,0.0,0.00112,0.003511,14.648817,2015-10-24 23:15:03.244320,101262319,12774660.0,-0.99217,0.124893,-0.906656,0.42187,0.963017,-0.269439
25316683,30146,2.68e-06,0.0,0.000256,0.018897,13.799824,2008-03-31 18:05:33.268127,65693765,2196395.0,-0.96067,-0.277694,0.934545,0.355844,0.805154,-0.593066
9947642,7933,3e-08,0.0,2.2e-05,0.035765,14.010893,1997-04-17 03:19:59.091168,24651164,4012287.0,-0.950444,-0.310895,-0.997484,0.07089,-0.024366,-0.999703
34298908,24974,3.56e-06,0.0,0.000152,0.002406,14.285361,2012-06-23 18:14:11.656896,89277070,28487740.0,-0.84901,0.528376,0.749744,-0.661728,-0.815608,0.578605
10327827,12752,4.11e-06,0.0,0.000571,0.014052,13.518785,1997-08-27 05:07:40.138463,25607477,28475020.0,-0.970525,0.241001,0.934445,-0.356107,-0.234136,-0.972204


In [8]:
# Show the sampled rows with their original TLE-style columns.
sample_df

Unnamed: 0,NORAD_CAT_ID,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,REV_AT_EPOCH,EPOCH,GP_ID
46163652,26020,-1.16e-05,0.0,-0.006112,82.7375,171.4172,0.011252,48.1671,345.8395,12.656302,23010,2018-07-15 22:28:14.679264,121826793
45243950,21936,2.4e-07,0.0,0.000425,74.0611,85.2151,0.026176,353.3616,120.6084,12.009752,14925,2018-04-05 16:16:54.084000,119143305
54185163,21547,2.67e-06,0.0,0.00037,99.9374,233.097,0.025412,96.1684,295.5731,13.544405,46448,2021-01-25 12:51:00.159264,170326337
19459729,21690,0.00011447,0.0,0.003795,98.8469,188.4157,0.007679,187.3521,172.6524,14.362972,2584,1991-05-31 01:38:55.955616,49703119
43094520,16266,8.5e-07,0.0,0.000398,82.6044,160.1413,0.00011,60.2592,2.7139,12.6565,46771,2017-08-04 21:54:02.553120,113025327
38797839,33875,6.062e-05,0.0,0.00112,86.4127,155.0473,0.003511,344.3691,64.5828,14.648817,35485,2015-10-24 23:15:03.244320,101262319
25316683,30146,2.68e-06,0.0,0.000256,98.0613,20.8452,0.018897,323.6251,35.2195,13.799824,6101,2008-03-31 18:05:33.268127,65693765
9947642,7933,3e-08,0.0,2.2e-05,99.0566,175.9349,0.035765,268.6038,87.4143,14.010893,11145,1997-04-17 03:19:59.091168,24651164
34298908,24974,3.56e-06,0.0,0.000152,74.0521,318.5682,0.002406,144.6475,215.6271,14.285361,79132,2012-06-23 18:14:11.656896,89277070
10327827,12752,4.11e-06,0.0,0.000571,83.0272,339.1387,0.014052,256.4593,102.0864,13.518785,79097,1997-08-27 05:07:40.138463,25607477
