In [1]:
import pandas as pd
import numpy as np
import math
import os

df = pd.read_csv(os.environ['my_home_path'] + '/data/norad_debris_count.csv')

In [2]:
# Parameters
minimum_tles = 1
minimum_tles_in_trainset = 10
split_size = (0.70, 0.15, 0.15)
seed = 42

# this list is from 'find_ILRS_sats.ipynb' - Ideally this would be loaded from a file...oh well
ilrs_sats = [16908, 1328, 36508, 43476, 43477, 41579, 43655, 46469, 43613, 41240, 39227, 38077, 27944, 43215, 39086,
             43189, 43188, 39068, 41335, 43437, 46984, 7646, 22824, 39452, 39451, 39453, 36605, 42829, 31698]

# This list contains NORADs that were classified as both a PAYLOAD and DEBRIS at different times.
# For this reason, these will be ignored.
misclassd = [28225, 28226, 28227, 28416, 28799, 28940, 29292, 36134, 39497, 42813, 43240, 43599, 43870, 44318, 44440,
             47409, 47412, 47411, 47410, 1641, 28233, 29111, 31865, 14483, 43523, 43671, 28232, 28224, 28513, 28975,
             39765, 43718, 42902]

# Validate split_size
if sum(split_size) != 1 or len(split_size) != 3 or all([x>0for x in split_size]) == False:
    raise Exception('Split size must contain 3 greater-than-zero floating points that add to 1.0')

In [3]:
df['type'] = 'debris'

# ignore all norads with TLE count less than minimum_tles
df = df[(df['count'] > minimum_tles) & ~df['norad'].isin(ilrs_sats) & ~df['norad'].isin(misclassd)]

# find the norads suitable for training
suitable_train_df = df[df['count'] >= minimum_tles_in_trainset]

df_suitable_size = len(suitable_train_df)

train_size = math.ceil(len(df) * split_size[0])
validate_size = math.ceil(len(df) * split_size[1])
test_size = len(df) - (train_size + validate_size)

# Make sure we have enough records to make a train set
if train_size > df_suitable_size:
    raise Exception('Not enough NORADs for training.  Reduce minimum_tles_in_trainset.')

sizes = (train_size, validate_size, test_size)
print(f'Train/Validate/Test Sizes: {sizes}')

Train/Validate/Test Sizes: (14628, 3135, 3133)


In [4]:
train_norads = suitable_train_df.sample(n=train_size, random_state=seed)
train_norads

Unnamed: 0,norad,count,type
13170,33012,275,debris
10527,30001,6429,debris
12439,32073,1099,debris
1815,22600,12722,debris
3394,21721,11104,debris
...,...,...,...
14319,34510,2446,debris
10249,29717,195,debris
1689,27577,4338,debris
17862,39196,104,debris


In [5]:
validate_norads = df[~df.isin(train_norads)].dropna().sample(n=validate_size, random_state=seed)
validate_norads

Unnamed: 0,norad,count,type
10467,29904.0,7419.0,debris
4744,1914.0,938.0,debris
20559,45930.0,569.0,debris
18749,41060.0,3561.0,debris
8497,23891.0,293.0,debris
...,...,...,...
16931,37593.0,1875.0,debris
2052,12735.0,14732.0,debris
7205,17132.0,680.0,debris
5048,4146.0,3141.0,debris


In [6]:
test_norads = df[~df.isin(train_norads) & ~df.isin(validate_norads)].dropna().sample(frac=1, random_state=seed)
test_norads

Unnamed: 0,norad,count,type
2592,10573.0,16289.0,debris
14185,34357.0,2631.0,debris
20219,44595.0,654.0,debris
16846,37566.0,2487.0,debris
2181,26731.0,6359.0,debris
...,...,...,...
20851,46969.0,79.0,debris
7458,17595.0,692.0,debris
7693,18303.0,154.0,debris
8750,24262.0,1703.0,debris


In [7]:
ilrs_split = (split_size[1]) / (split_size[1] + split_size[2])
validate_ilrs_size = math.ceil(len(ilrs_sats) * ilrs_split)
test_ilrs_size = len(ilrs_sats) - validate_ilrs_size

np.random.seed(seed)
validate_ilrs_sats = np.random.choice(ilrs_sats, validate_ilrs_size, replace=False).tolist()
test_ilrs_sats = [x for x in ilrs_sats if x not in validate_ilrs_sats]

validate_ilrs_df = pd.DataFrame({'norad': validate_ilrs_sats, 'count': [999]*len(validate_ilrs_sats), 'type': ['ilrs']*len(validate_ilrs_sats)})
test_ilrs_df = pd.DataFrame({'norad': test_ilrs_sats, 'count': [999]*len(test_ilrs_sats), 'type': ['ilrs']*len(test_ilrs_sats)})

display(validate_ilrs_df)
display(test_ilrs_df)

Unnamed: 0,norad,count,type
0,42829,999,ilrs
1,43188,999,ilrs
2,27944,999,ilrs
3,22824,999,ilrs
4,43613,999,ilrs
5,41240,999,ilrs
6,7646,999,ilrs
7,16908,999,ilrs
8,36605,999,ilrs
9,43215,999,ilrs


Unnamed: 0,norad,count,type
0,36508,999,ilrs
1,43476,999,ilrs
2,41579,999,ilrs
3,43655,999,ilrs
4,46469,999,ilrs
5,39227,999,ilrs
6,39086,999,ilrs
7,41335,999,ilrs
8,43437,999,ilrs
9,46984,999,ilrs


In [8]:
validate_norads = pd.concat([validate_norads, validate_ilrs_df]).reset_index(drop=True)
validate_norads

Unnamed: 0,norad,count,type
0,29904.0,7419.0,debris
1,1914.0,938.0,debris
2,45930.0,569.0,debris
3,41060.0,3561.0,debris
4,23891.0,293.0,debris
...,...,...,...
3145,43189.0,999.0,ilrs
3146,38077.0,999.0,ilrs
3147,39068.0,999.0,ilrs
3148,1328.0,999.0,ilrs


In [9]:
test_norads = pd.concat([test_norads, test_ilrs_df]).reset_index(drop=True)
test_norads

Unnamed: 0,norad,count,type
0,10573.0,16289.0,debris
1,34357.0,2631.0,debris
2,44595.0,654.0,debris
3,37566.0,2487.0,debris
4,26731.0,6359.0,debris
...,...,...,...
3142,46984.0,999.0,ilrs
3143,39452.0,999.0,ilrs
3144,39451.0,999.0,ilrs
3145,39453.0,999.0,ilrs


In [10]:
train_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/train_norads.pkl.gz')
validate_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/validate_norads.pkl.gz')
test_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/test_norads.pkl.gz')