In [1]:
import pandas as pd
import numpy as np
import math

# This file is just the "multiproc_output.txt" with a header added 'norad,count'
df = pd.read_csv('norad_debris_count.csv')

In [2]:
# Parameters
minimum_tles = 1
minimum_tles_in_trainset = 10
split_size = (0.70, 0.15, 0.15)
seed = 42
ilrs_sats = [16908, 1328, 36508, 43476, 43477, 41579, 43655, 46469, 43613, 41240, 39227, 38077, 27944, 43215, 39086,
             43189, 43188, 39068, 41335, 43437, 46984, 7646, 22824, 39452, 39451, 39453, 36605, 42829, 31698]

# Validate split_size
if sum(split_size) != 1 or len(split_size) != 3 or all([x>0for x in split_size]) == False:
    raise Exception('Split size must contain 3 greater-than-zero floating points that add to 1.0')

In [3]:
df['type'] = 'non-payload'

# ignore all norads with TLE count less than minimum_tles
df = df[(df['count'] > minimum_tles) & ~df['norad'].isin(ilrs_sats)]

# find the norads suitable for training
suitable_train_df = df[df['count'] >= minimum_tles_in_trainset]

df_suitable_size = len(suitable_train_df)

train_size = math.ceil(len(df) * split_size[0])
validate_size = math.ceil(len(df) * split_size[1])
test_size = len(df) - (train_size + validate_size)

# Make sure we have enough records to make a train set
if train_size > df_suitable_size:
    raise Exception('Not enough NORADs for training.  Reduce minimum_tles_in_trainset.')

sizes = (train_size, validate_size, test_size)
print(f'Train/Validate/Test Sizes: {sizes}')

Train/Validate/Test Sizes: (26785, 5740, 5739)


In [4]:
train_norads = suitable_train_df.sample(n=train_size, random_state=seed)
train_norads

Unnamed: 0,norad,count,type
14857,16325,338,non-payload
16162,18125,23,non-payload
38869,270227,65,non-payload
12417,12287,3882,non-payload
21655,29212,243,non-payload
...,...,...,...
31283,40406,4188,non-payload
28588,37010,32,non-payload
14328,15502,85,non-payload
24224,32066,1683,non-payload


In [5]:
validate_norads = df[~df.isin(train_norads)].dropna().sample(n=validate_size, random_state=seed)
validate_norads

Unnamed: 0,norad,count,type
8265,4434.0,1003.0,non-payload
11531,11010.0,8.0,non-payload
10722,9502.0,2.0,non-payload
39038,47269.0,4.0,non-payload
39049,47280.0,3.0,non-payload
...,...,...,...
37642,46026.0,6.0,non-payload
33220,42772.0,57.0,non-payload
14006,15118.0,11.0,non-payload
17018,20076.0,7.0,non-payload


In [6]:
test_norads = df[~df.isin(train_norads) & ~df.isin(validate_norads)].dropna().sample(frac=1, random_state=seed)
test_norads

Unnamed: 0,norad,count,type
7177,2829.0,4.0,non-payload
20212,28481.0,7651.0,non-payload
33327,42889.0,3.0,non-payload
23649,31440.0,2391.0,non-payload
22326,30079.0,6530.0,non-payload
...,...,...,...
25841,33970.0,4.0,non-payload
37519,21983.0,4.0,non-payload
37699,46066.0,3.0,non-payload
38202,46673.0,3.0,non-payload


In [7]:
ilrs_split = (split_size[1]) / (split_size[1] + split_size[2])
validate_ilrs_size = math.ceil(len(ilrs_sats) * ilrs_split)
test_ilrs_size = len(ilrs_sats) - validate_ilrs_size

np.random.seed(seed)
validate_ilrs_sats = np.random.choice(ilrs_sats, validate_ilrs_size, replace=False).tolist()
test_ilrs_sats = [x for x in ilrs_sats if x not in validate_ilrs_sats]

validate_ilrs_df = pd.DataFrame({'norad': validate_ilrs_sats, 'count': [999]*len(validate_ilrs_sats), 'type': ['ilrs']*len(validate_ilrs_sats)})
test_ilrs_df = pd.DataFrame({'norad': test_ilrs_sats, 'count': [999]*len(test_ilrs_sats), 'type': ['ilrs']*len(test_ilrs_sats)})

display(validate_ilrs_df)
display(test_ilrs_df)

Unnamed: 0,norad,count,type
0,42829,999,ilrs
1,43188,999,ilrs
2,27944,999,ilrs
3,22824,999,ilrs
4,43613,999,ilrs
5,41240,999,ilrs
6,7646,999,ilrs
7,16908,999,ilrs
8,36605,999,ilrs
9,43215,999,ilrs


Unnamed: 0,norad,count,type
0,36508,999,ilrs
1,43476,999,ilrs
2,41579,999,ilrs
3,43655,999,ilrs
4,46469,999,ilrs
5,39227,999,ilrs
6,39086,999,ilrs
7,41335,999,ilrs
8,43437,999,ilrs
9,46984,999,ilrs


In [8]:
validate_norads = pd.concat([validate_norads, validate_ilrs_df]).reset_index(drop=True)
validate_norads

Unnamed: 0,norad,count,type
0,4434.0,1003.0,non-payload
1,11010.0,8.0,non-payload
2,9502.0,2.0,non-payload
3,47269.0,4.0,non-payload
4,47280.0,3.0,non-payload
...,...,...,...
5750,43189.0,999.0,ilrs
5751,38077.0,999.0,ilrs
5752,39068.0,999.0,ilrs
5753,1328.0,999.0,ilrs


In [9]:
test_norads = pd.concat([test_norads, test_ilrs_df]).reset_index(drop=True)
test_norads

Unnamed: 0,norad,count,type
0,2829.0,4.0,non-payload
1,28481.0,7651.0,non-payload
2,42889.0,3.0,non-payload
3,31440.0,2391.0,non-payload
4,30079.0,6530.0,non-payload
...,...,...,...
5748,46984.0,999.0,ilrs
5749,39452.0,999.0,ilrs
5750,39451.0,999.0,ilrs
5751,39453.0,999.0,ilrs


In [10]:
train_norads.to_pickle('train_norads.pkl.gz')
validate_norads.to_pickle('validate_norads.pkl.gz')
test_norads.to_pickle('test_norads.pkl.gz')