# Remove Abnormal Data from Training Set

Load DataFrame from `0_min/train.pkl`

Define multile functions that take a dataframe and return a boolean mask for entries to keep.

Combine masks in the end into a final mask, then it can be applied to `0_full` version as well if that's preferred.

In [28]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

In [2]:
if ('my_home_path' not in os.environ) and ('MY_HOME_PATH' in os.environ):
    os.environ['my_home_path'] = os.environ['MY_HOME_PATH'] # because stupid :(

In [3]:
train_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../0_min/train.pkl")

In [4]:
# just using a small subset for testing

sample_df = train_df[train_df.NORAD_CAT_ID.isin([12223, 26285])]

In [5]:
# Early TLEs are more prone to errors, cut off should be somewhere in the 80s
# TODO: putting in 1990 for now to be on the safe side

def more_recent_only(df):
    mask = df.EPOCH > "1990"
    return mask

In [6]:
# Space track LEO definition: Mean Motion > 11.25 and Eccentricity < 0.25
# This means that satellite that decay into LEO will not have non-LEO-like entries removed

def leo_check(df):
    mask = (df['MEAN_MOTION'] > 11.25) & (df['ECCENTRICITY'] < 0.25)
    return mask

In [7]:
# valid range for 'RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY' is 0..360

def degrees_range_check(df):
    degree_columns = ['RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY']
    mask = df[degree_columns].apply(lambda x:x.between(0,360), axis=0).all(axis=1)
    return mask

In [8]:
# valid INCLINATION range is 0..180

def inclination_range_check(df):
    mask = df['INCLINATION'].between(0,180)
    return mask

In [9]:
# TODO: Anything beyond 20 should be outliers... I think...
# > 16.5 you do get multiple entries from the same satellite, so those shouldn't be outliers
# train_df[train_df['MEAN_MOTION'] > 16.5].NORAD_CAT_ID.value_counts()    

def mean_motion_range_check(df):
    mask = df['MEAN_MOTION'].between(11.25,20)
    return mask

In [10]:
# Skipping first few entries may be a good idea due to initial readings being less accurate (?)
# TODO: using N=5 for now

def skip_first_n(df, n=5):
    mask = df.groupby(by="NORAD_CAT_ID", as_index=False).apply(lambda x:x.EPOCH.rank() > n).reset_index(level=0, drop=True).sort_index()
    return mask

In [19]:
anomaly_functions = [
    more_recent_only,
    leo_check,
    degrees_range_check,
    inclination_range_check,
    mean_motion_range_check,
    skip_first_n,
]

anomaly_results = []
for fn in tqdm(anomaly_functions):
    print(f"Processing function: \"{fn.__name__}\"")
    %time res = fn(train_df)
    print("==========================================================")
    res.name = fn.__name__
    anomaly_results.append(res)

  0%|          | 0/6 [00:00<?, ?it/s]

Processing function: "more_recent_only"
CPU times: user 179 ms, sys: 0 ns, total: 179 ms
Wall time: 179 ms
Processing function: "leo_check"
CPU times: user 226 ms, sys: 136 ms, total: 363 ms
Wall time: 361 ms
Processing function: "degrees_range_check"
CPU times: user 825 ms, sys: 592 ms, total: 1.42 s
Wall time: 1.41 s
Processing function: "inclination_range_check"
CPU times: user 152 ms, sys: 63.3 ms, total: 216 ms
Wall time: 213 ms
Processing function: "mean_motion_range_check"
CPU times: user 143 ms, sys: 44.5 ms, total: 187 ms
Wall time: 186 ms
Processing function: "skip_first_n"
CPU times: user 45.2 s, sys: 5.41 s, total: 50.6 s
Wall time: 50.6 s


In [26]:
# mask results

for s in anomaly_results:
    display(s.value_counts())
    
combined = pd.concat(anomaly_results, axis=1).all(axis=1)
combined.name = "combined_masks"
print("==========================================================\nMasks combined:")
display(combined.value_counts())

True     50453855
False     4785984
Name: more_recent_only, dtype: int64

True     54678326
False      561513
Name: leo_check, dtype: int64

True     55239837
False           2
Name: degrees_range_check, dtype: int64

True    55239839
Name: inclination_range_check, dtype: int64

True     54678335
False      561504
Name: mean_motion_range_check, dtype: int64

True     55167118
False       72721
Name: skip_first_n, dtype: int64

Masks combined:


True     49901955
False     5337884
Name: combined_masks, dtype: int64

In [13]:
# total unique NORAD_IDs
len(train_df.NORAD_CAT_ID.unique())

14628

In [14]:
# def plot_stuff(n,idf):
#     ax = idf.plot(subplots=True,figsize=(20,30));
#     ax[-1].figure.suptitle(n)
    
# for norad_id, df in sample_df.groupby("NORAD_CAT_ID"):
#     plot_stuff(norad_id, df)