In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import random
from tqdm.notebook import tqdm

In [2]:
# All inputs live under $GP_HIST_PATH/cleaned; read the root once (KeyError if unset).
# pd.read_pickle unpickles arbitrary objects, so only point this at trusted files.
gp_hist_root = os.environ['GP_HIST_PATH']

X_train = pd.read_pickle(f'{gp_hist_root}/cleaned/x_train1.pkl')
y_train = pd.read_pickle(f'{gp_hist_root}/cleaned/y_train1.pkl')

X_test = pd.read_pickle(f'{gp_hist_root}/cleaned/x_test1.pkl')
y_test = pd.read_pickle(f'{gp_hist_root}/cleaned/y_test1.pkl')

In [3]:
# Remove anomalous rows from the test set.
# The checks below come from 2_remove_anomaly/remove_anomaly.ipynb.
# They have already been applied to X_train, but not to X_test.
# The range checks were modified (they now use [0, 1]) because the values have since been normalized.

def more_recent_only(df):
    """Return a boolean mask that is True where ``EPOCH`` compares greater than "1990".

    The mask shares ``df``'s index; True marks rows to keep.
    """
    epoch = df["EPOCH"]
    return epoch > "1990"
def leo_check(df):
    """Return a boolean mask that is True where ``ECCENTRICITY`` lies in [0, 1].

    Despite the function name, the only test performed here is the inclusive
    range check on the ECCENTRICITY column. Missing values yield False.
    """
    eccentricity = df['ECCENTRICITY']
    return (eccentricity >= 0) & (eccentricity <= 1)
def degrees_range_check(df):
    """Return a boolean mask that is True where every angle column lies in [0, 1].

    The columns checked are RA_OF_ASC_NODE, ARG_OF_PERICENTER and MEAN_ANOMALY.
    A row passes only if all three values are inside the inclusive range;
    a missing value in any of them makes the row fail.
    """
    angle_columns = ['RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY']
    angles = df[angle_columns]
    # Element-wise bounds test on the whole sub-frame, then reduce across columns.
    within_bounds = (angles >= 0) & (angles <= 1)
    return within_bounds.all(axis=1)
def inclination_range_check(df):
    """Return a boolean mask that is True where ``INCLINATION`` lies in [0, 1] (inclusive).

    Missing values yield False.
    """
    inclination = df['INCLINATION']
    not_below = inclination >= 0
    not_above = inclination <= 1
    return not_below & not_above
def mean_motion_range_check(df):
    """Return a boolean mask that is True where ``MEAN_MOTION`` lies in [0, 1] (inclusive).

    Missing values yield False.
    """
    mean_motion = df['MEAN_MOTION']
    return (mean_motion >= 0) & (mean_motion <= 1)
def skip_first_n(df, n=5):
    """Return a boolean mask that drops the first ``n`` records of each object.

    Rows are ranked by ``EPOCH`` within each ``NORAD_CAT_ID`` group (default
    average ranking, so tied epochs share a rank); a row is kept (True) only
    when its rank is greater than ``n``.

    Parameters
    ----------
    df : DataFrame with ``NORAD_CAT_ID`` and ``EPOCH`` columns.
    n : number of earliest records per object to mask out.

    Returns
    -------
    Boolean Series aligned with ``df.index`` (same order, same length).
    Rows with a missing EPOCH or a missing NORAD_CAT_ID get no rank and are
    therefore False.
    """
    # GroupBy.rank is a transform: it always returns one value per input row,
    # indexed like `df`. The previous groupby(...).apply(lambda ...) form
    # changed shape depending on the group layout (a single group came back as
    # a wide DataFrame) and was far slower on large frames.
    epoch_rank = df.groupby("NORAD_CAT_ID")["EPOCH"].rank()
    mask = epoch_rank > n
    return mask

# Checks to run against X_test. Each function takes the frame and returns a
# boolean mask in which True means "keep this row".
anomaly_functions = [
    more_recent_only,
    leo_check,
    degrees_range_check,
    inclination_range_check,
    mean_motion_range_check,
    skip_first_n,
]

# Run every check, timing each one, and collect the masks.
anomaly_results = []
for fn in tqdm(anomaly_functions):
    print(f"Processing function: \"{fn.__name__}\"")
    # `%time` is an IPython line magic: it reports the timing of the statement
    # and the assignment to `res` lands in the notebook namespace.
    %time res = fn(X_test)
    print("==========================================================")
    # Name each mask after its check so it becomes the column label when the
    # masks are concatenated below (and shows up in the value_counts output).
    res.name = fn.__name__
    anomaly_results.append(res)
    
# Per-check summary: how many rows pass (True) / fail (False).
for s in anomaly_results:
    display(s.value_counts())
    
# Put the masks side by side (one column per check) and keep a row only if
# every check passed.
combined = pd.concat(anomaly_results, axis=1).all(axis=1)
combined.name = "combined_masks"
print("==========================================================\nMasks combined:")
display(combined.value_counts())

# "_ra" = test data with anomalies removed (presumably "removed anomalies").
# NOTE(review): y_test is filtered with a mask computed from X_test; this
# assumes both share the same index — confirm.
X_test_ra = X_test[combined]
y_test_ra = y_test[combined]

  0%|          | 0/6 [00:00<?, ?it/s]

Processing function: "more_recent_only"
Wall time: 23.9 ms
Processing function: "leo_check"
Wall time: 18 ms
Processing function: "degrees_range_check"
Wall time: 1.34 s
Processing function: "inclination_range_check"
Wall time: 18 ms
Processing function: "mean_motion_range_check"
Wall time: 17 ms
Processing function: "skip_first_n"
Wall time: 4.26 s


True     9619869
False       6640
Name: more_recent_only, dtype: int64

True     9530614
False      95895
Name: leo_check, dtype: int64

True     9626507
False          2
Name: degrees_range_check, dtype: int64

True     9626508
False          1
Name: inclination_range_check, dtype: int64

True     9487897
False     138612
Name: mean_motion_range_check, dtype: int64

True     9613291
False      13218
Name: skip_first_n, dtype: int64

Masks combined:


True     9470808
False     155701
Name: combined_masks, dtype: int64

In [4]:
# Build a reproducible sample of the training data containing every record of
# `number` randomly chosen satellites, and save it next to the cleaned data.
number = 1000
np.random.seed(0)  # fixed seed so the same satellites are drawn on every run

# Sort the distinct IDs so the candidate order does not depend on set/hash
# ordering, and sample WITHOUT replacement so exactly `number` distinct
# satellites are selected (the default replace=True can draw an ID twice,
# silently shrinking the sample). np.random.choice raises ValueError if fewer
# than `number` distinct IDs are available.
train_id_pool = np.sort(X_train.NORAD_CAT_ID.unique())
train_ids = np.random.choice(train_id_pool, number, replace=False)

# Keep all rows of the selected satellites; labels are picked by the same index.
X_train_sample = X_train[X_train.NORAD_CAT_ID.isin(train_ids)]
X_train_sample_idx = X_train_sample.index
y_train_sample = y_train.loc[X_train_sample_idx]
X_train_sample.to_pickle(os.environ['GP_HIST_PATH'] + f'/cleaned/x_train_sample_{number}.pkl')
y_train_sample.to_pickle(os.environ['GP_HIST_PATH'] + f'/cleaned/y_train_sample_{number}.pkl')

In [6]:
# Build a reproducible sample of the anomaly-filtered test data containing
# every record of `number` randomly chosen satellites, and save it.
number = 100
np.random.seed(0)  # fixed seed so the same satellites are drawn on every run

# Sort the distinct IDs so the candidate order does not depend on set/hash
# ordering, and sample WITHOUT replacement so exactly `number` distinct
# satellites are selected (the default replace=True can draw an ID twice,
# silently shrinking the sample). np.random.choice raises ValueError if fewer
# than `number` distinct IDs are available.
test_id_pool = np.sort(X_test_ra.NORAD_CAT_ID.unique())
test_ids = np.random.choice(test_id_pool, number, replace=False)

# Keep all rows of the selected satellites; labels are picked by the same index.
X_test_sample = X_test_ra[X_test_ra.NORAD_CAT_ID.isin(test_ids)]
X_test_sample_idx = X_test_sample.index
y_test_sample = y_test_ra.loc[X_test_sample_idx]
# NOTE(review): this file name starts with a capital "X" while the training
# sample is saved as "x_train_sample_..."; left unchanged because other code
# may already read this exact path.
X_test_sample.to_pickle(os.environ['GP_HIST_PATH'] + f'/cleaned/X_test_sample_{number}.pkl')
y_test_sample.to_pickle(os.environ['GP_HIST_PATH'] + f'/cleaned/y_test_sample_{number}.pkl')