# Remove Abnormal Data from Training Set

Load DataFrame from `1_min/train.pkl`

Define multile functions that take a dataframe and return a boolean mask for entries to keep.

Combine masks in the end into a final mask, then it can be applied to `1_full` version as well if that's preferred.

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor

In [2]:
if ('my_home_path' not in os.environ) and ('MY_HOME_PATH' in os.environ):
    os.environ['my_home_path'] = os.environ['MY_HOME_PATH'] # because stupid :(

In [3]:
train_df = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../1_min/train.pkl")

# total unique NORAD_IDs
# len(train_df.NORAD_CAT_ID.unique())

In [4]:
# np.random.choice(train_df.NORAD_CAT_ID.unique(),10)

In [5]:
# just using a small subset for testing
# train_backup_df = train_df.copy()
# # sample_ids = [12223, 26285, 10760, 14345, 34588, 330, 20970]
# sample_ids = [12223, 26285, 10760, 14345, 34588, 330, 20970, 35253, 38899, 36390, 27507, 31539,  8386,  6299, 18428, 17228, 42126]
# sample_ids += list(np.random.choice(train_df.NORAD_CAT_ID.unique(),20))
# # sample_ids = [28974,36024,24403] # this is for arg of pericenter fails
# train_df = train_backup_df[train_backup_df.NORAD_CAT_ID.isin(sample_ids)]

## Remove Erroneous Data
First we start by removing data which we don't want to include in our model.  This includes values which are outside of acceptable ranges or are physically impossible.

In [6]:
# Early TLEs are more prone to errors, cut off should be somewhere in the 80s
# TODO: putting in 1990 for now to be on the safe side

def more_recent_only(df):
    mask = df.EPOCH > "1990"
    return mask

In [7]:
# Space track LEO definition: Mean Motion > 11.25 and Eccentricity < 0.25
# This means that satellite that decay into LEO will not have non-LEO-like entries removed

def leo_check(df):
    mask = (df['MEAN_MOTION'] > 11.25) & (df['ECCENTRICITY'] < 0.25)
    return mask

In [8]:
# valid range for 'RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY' is 0..360

def degrees_range_check(df):
    degree_columns = ['RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY']
    mask = df[degree_columns].apply(lambda x:x.between(0,360), axis=0).all(axis=1)
    return mask

In [9]:
# valid INCLINATION range is 0..180

def inclination_range_check(df):
    mask = df['INCLINATION'].between(0,180)
    return mask

In [10]:
# TODO: Anything beyond 20 should be outliers... I think...
# > 16.5 you do get multiple entries from the same satellite, so those shouldn't be outliers
# train_df[train_df['MEAN_MOTION'] > 16.5].NORAD_CAT_ID.value_counts()    

def mean_motion_range_check(df):
    mask = df['MEAN_MOTION'].between(11.25,20)
    return mask

In [11]:
# Skipping first few entries may be a good idea due to initial readings being less accurate (?)
# TODO: using N=5 for now
# this one takes longer because of the grouping

def skip_first_n(df, n=5):
    mask = df.groupby(by="NORAD_CAT_ID", as_index=False).apply(lambda x:x.EPOCH.rank() > n).reset_index(level=0, drop=True).sort_index()
    return mask

In [12]:
anomaly_functions = [
    more_recent_only,
    leo_check,
    degrees_range_check,
    inclination_range_check,
    mean_motion_range_check,
    skip_first_n,
]

anomaly_results = []
for fn in tqdm(anomaly_functions):
    print(f"Processing function: \"{fn.__name__}\"")
    %time res = fn(train_df)
    print("==========================================================")
    res.name = fn.__name__
    anomaly_results.append(res)

  0%|          | 0/6 [00:00<?, ?it/s]

Processing function: "more_recent_only"
CPU times: user 164 ms, sys: 24.7 ms, total: 188 ms
Wall time: 188 ms
Processing function: "leo_check"
CPU times: user 281 ms, sys: 61.4 ms, total: 342 ms
Wall time: 106 ms
Processing function: "degrees_range_check"
CPU times: user 1.11 s, sys: 600 ms, total: 1.71 s
Wall time: 918 ms
Processing function: "inclination_range_check"
CPU times: user 305 ms, sys: 58.4 ms, total: 364 ms
Wall time: 103 ms
Processing function: "mean_motion_range_check"
CPU times: user 276 ms, sys: 70.6 ms, total: 346 ms
Wall time: 102 ms
Processing function: "skip_first_n"
CPU times: user 49.5 s, sys: 8.76 s, total: 58.3 s
Wall time: 53.5 s


In [13]:
# mask results

for s in anomaly_results:
    display(s.value_counts())
    
combined = pd.concat(anomaly_results, axis=1).all(axis=1)
combined.name = "combined_masks"
print("==========================================================\nMasks combined:")
display(combined.value_counts())

True     50453855
False     4785984
Name: more_recent_only, dtype: int64

True     54678326
False      561513
Name: leo_check, dtype: int64

True     55239837
False           2
Name: degrees_range_check, dtype: int64

True    55239839
Name: inclination_range_check, dtype: int64

True     54678335
False      561504
Name: mean_motion_range_check, dtype: int64

True     55167118
False       72721
Name: skip_first_n, dtype: int64

Masks combined:


True     49901955
False     5337884
Name: combined_masks, dtype: int64

## Generate a new DataFrame for Outliers

Masked version of DataFrame for unsupervised learning outlier detection

In [14]:
masked_df = train_df[combined]
# masked_sample_df = masked_df[masked_df.NORAD_CAT_ID.isin(sample_ids)]

## Use Unsupervised Learning to Remove Outliers

We'll be using `DBSCAN`

In [15]:
# # testing DBSCAN

# from sklearn.cluster import DBSCAN

# def dbscan_removal(df, debug=False):
# #     columns = ["INCLINATION","ECCENTRICITY","MEAN_MOTION"]
#     # mean motion turns out to be not very good, due to the final decay as well as outliers reflected in other fields as well
#     columns = ["INCLINATION","ECCENTRICITY"]

#     def detect_outliers(input_df):
#         name = input_df.name
#         dbscan_min_samples = max(len(input_df)/100, 20)
        
#         sub_df = input_df.set_index('EPOCH', append=True).sort_index(level=1)
#         outlier_labels = []
#         for i,column in enumerate(columns):
#             col_diff = np.minimum(sub_df[column].diff()**2, sub_df[column].diff(-1)**2).fillna(0) + np.minimum(sub_df[column].diff(2)**2, sub_df[column].diff(-2)**2).fillna(0)
#             dbscan_eps = col_diff.std()*3
#             if not dbscan_eps > 0.0: # should never or rarely happen, but has happened before....
#                 dbscan_eps = 1 # arbitary, which should mean no outliers for this satellite
#                 dbscan_eps_zero_neg.append(name) # keep track of it
            
#             db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit(col_diff.to_frame())
#             core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#             core_samples_mask[db.core_sample_indices_] = True
#             labels = db.labels_
#             # Number of clusters in labels, ignoring noise if present.
#             outlier_labels.append(labels)

#         all_normal = (np.array(outlier_labels).T.min(axis=1) != -1)

#         normal_data = sub_df[all_normal]

#         if debug:
#             print(f"=============================\nnorad id: {name}, rows:{len(input_df)}")
#             ax = (sub_df[columns].droplevel(0)).plot(subplots=True,figsize=(20,6));
#             outlier_data = sub_df[~all_normal]
#             num_all_outliers = len(input_df)-np.sum(all_normal)
#             for i,column in enumerate(columns):
#                 labels = outlier_labels[i]
#                 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
#                 n_noise_ = list(labels).count(-1)    
#                 print(f"column: {column}, n_clusters_: {n_clusters_}, n_noise_: {n_noise_}, noise %:{n_noise_/len(input_df):.5f}%")

#                 ax[i].scatter(outlier_data.index.get_level_values(1), outlier_data[column], s=40, color="red", alpha=1, marker="x", zorder=-1)
#     #             ax[i].set_title=f"{column} #clusters: {n_clusters_}, #noise: {n_noise_}, noise %:{n_noise_/len(input_df):.5f}%"
# #                 ax[i].set_title=f"AAAAAA"
#                 ax[i].scatter(outlier_data.index.get_level_values(1), outlier_data[column], s=40, color="black", alpha=0.7, marker="x", zorder=-2)
#             ax[-1].figure.suptitle(f"combined noise: {num_all_outliers}, noise %:{num_all_outliers/len(input_df):.5f}%")
#             print(f"norad id: {name}, rows:{len(input_df)}, combined noise count: {num_all_outliers}, noise %:{num_all_outliers/len(input_df):.5f}%")
#             print(f"last 30 {all_normal[-30:].astype(int)}")
#             plt.show()
        
#         # should just return boolean mask with index from input
#         return pd.Series(all_normal.astype(bool), index=sub_df.index)
    
#     # combine mask from each group then reset, sort, etc.
#     return df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(detect_outliers).droplevel([0,2]).sort_index()

In [16]:
# Multithreaded version of DBSCAN

from sklearn.cluster import DBSCAN
import concurrent.futures


def detect_outliers(input_df):
    columns = ["INCLINATION","ECCENTRICITY","ARG_OF_PERICENTER","RA_OF_ASC_NODE"]
    name = input_df.name
    dbscan_min_samples = max(len(input_df)/100, 20)

    sub_df = input_df.set_index('EPOCH', append=True).sort_index(level=1)
    outlier_labels = []
    for i,column in enumerate(columns):
        if column in ["ARG_OF_PERICENTER","RA_OF_ASC_NODE"]:
            c_diff = sub_df[column].diff(-1)
            c_diff_adj = np.minimum(np.abs(c_diff), 360-np.abs(c_diff))
            c_diff = c_diff_adj * np.sign(c_diff)
            col_diff = np.minimum(c_diff.diff()**2, c_diff.diff(-1)**2).fillna(0) + np.minimum(c_diff.diff(2)**2, c_diff.diff(-2)**2).fillna(0)
            dbscan_eps = col_diff.std()*1
            if not dbscan_eps > 0.0: # should never or rarely happen, but has happened before....
                dbscan_eps = 1 # arbitary, which should mean no outliers for this satellite
                dbscan_eps_zero_neg.append(name) # keep track of it

            db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit(col_diff.to_frame())
        else:
            col_diff = np.minimum(sub_df[column].diff()**2, sub_df[column].diff(-1)**2).fillna(0) + np.minimum(sub_df[column].diff(2)**2, sub_df[column].diff(-2)**2).fillna(0)
            dbscan_eps = col_diff.std()*3
            if not dbscan_eps > 0.0: # should never or rarely happen, but has happened before....
                dbscan_eps = 1 # arbitary, which should mean no outliers for this satellite
                dbscan_eps_zero_neg.append(name) # keep track of it

            db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit(col_diff.to_frame())
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        outlier_labels.append(labels)

    all_normal = (np.array(outlier_labels).T.min(axis=1) != -1)

    normal_data = sub_df[all_normal]

    # should just return boolean mask with index from input
    return pd.Series(all_normal.astype(bool), index=sub_df.index)

def process_dbscan_batch(df, b):
    tqdm.pandas(desc=f'Batch {b}',leave=False)
    return df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(detect_outliers).droplevel([0,2]).sort_index()

def dbscan_removal(df, threaded=False, batch_size=10, num_workers=5):
    if threaded:
        df_out = pd.DataFrame()
        norads = df['NORAD_CAT_ID'].unique()
        batches = [norads[i:i+batch_size] for i in range(0, len(norads), batch_size)]
        pbar = tqdm(total=len(norads), desc='DBSCAN (threaded)')
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            tasks = []
            for b, norad_batch in enumerate(batches):
                t = executor.submit(process_dbscan_batch, df[df.NORAD_CAT_ID.isin(norad_batch)], b)
                t.add_done_callback(lambda p: pbar.update(batch_size))
                tasks.append(t)
            for t in concurrent.futures.as_completed(tasks):
                df_out = pd.concat([df_out, t])
        pbar.close()
        return df_out.droplevel([0,2]).sort_index()
    else:
        tqdm.pandas(desc='DBSCAN (non-threaded)')
        return df.groupby(by="NORAD_CAT_ID", as_index=False).progress_apply(detect_outliers).droplevel([0,2]).sort_index()
    

In [None]:
dbscan_eps_zero_neg = [] # this is to catch cases where std is negative(!!!?) or zero
dbscan_mask = dbscan_removal(masked_df, threaded=True, num_workers=8, batch_size=40)
#dbscan_mask = dbscan_removal(masked_df)

DBSCAN (threaded):   0%|          | 0/12764 [00:00<?, ?it/s]

Batch 0:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 1:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 2:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 3:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 4:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 5:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 6:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 7:   0%|          | 0/40 [00:06<?, ?it/s]

Batch 8:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 9:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 10:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 11:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 12:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 13:   0%|          | 0/40 [00:02<?, ?it/s]

Batch 14:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 15:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 16:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 17:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 18:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 19:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 20:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 21:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 22:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 23:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 24:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 25:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 26:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 27:   0%|          | 0/40 [00:01<?, ?it/s]

Batch 28:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 29:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 30:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 31:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 32:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 33:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 34:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 35:   0%|          | 0/40 [00:05<?, ?it/s]

Batch 36:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 37:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 38:   0%|          | 0/40 [00:00<?, ?it/s]

Batch 39:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# cases where std is negative or zero
# Seems to be satellites with only a single entry, safe to ignore

# dbscan_eps_zero_neg

In [None]:
# final filtering of minimum entry count is needed? (not really because DBSCAN's min_samples)

In [None]:
masked_df[dbscan_mask].groupby("NORAD_CAT_ID")['EPOCH'].count().describe()

# Save DataFrame with anomaly removed

`min` version is saved to `2_min` in the shared data folder.

In [None]:
%%time

# save both masks
combined.to_pickle(f"{os.environ['GP_HIST_PATH']}/../2_min/anomaly_mask.pkl")
#dbscan_mask.to_pickle(f"{os.environ['GP_HIST_PATH']}/../2_min/dbscan_mask.pkl")
dbscan_mask = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../2_min/dbscan_mask.pkl")

# save min version
masked_df[dbscan_mask].to_pickle(f"{os.environ['GP_HIST_PATH']}/../2_min/train.pkl")

# We don't save full version anymore, since we no longer need the extra information
# # load
# train_df_full = pd.read_pickle(f"{os.environ['GP_HIST_PATH']}/../1_full/train.pkl")
# masked_df_full = train_df_full[combined]
# masked_df_full[dbscan_mask].to_pickle(f"{os.environ['GP_HIST_PATH']}/../2_full/train.pkl")

# del train_df_full, masked_df_full