In [1]:
import pandas as pd
import numpy as np
import math

df = pd.read_csv('norad_debris_count.csv')

In [2]:
# Parameters
minimum_tles = 1
minimum_tles_in_trainset = 10
split_size = (0.70, 0.15, 0.15)
seed = 42

# Validate split_size
if sum(split_size) != 1 or len(split_size) != 3 or all([x>0for x in split_size]) == False:
    raise Exception('Split size must contain 3 greater-than-zero floating points that add to 1.0')

In [3]:
df['type'] = 'non-payload'

# ignore all norads with TLE count less than minimum_tles
df = df[df['count'] > minimum_tles]

# find the norads suitable for training
suitable_train_df = df[df['count'] >= minimum_tles_in_trainset]

df_suitable_size = len(suitable_train_df)

train_size = math.ceil(len(df) * split_size[0])
validate_size = math.ceil(len(df) * split_size[1])
test_size = len(df) - (train_size + validate_size)

# Make sure we have enough records to make a train set
if train_size > df_suitable_size:
    raise Exception('Not enough NORADs for training.  Reduce minimum_tles_in_trainset.')

sizes = (train_size, validate_size, test_size)
print(f'Train/Validate/Test Sizes: {sizes}')

Train/Validate/Test Sizes: (26805, 5744, 5743)


In [13]:
train_norads = suitable_train_df.sample(n=train_size, random_state=seed)
train_norads

Unnamed: 0,norad,count,type
34182,87405,768,non-payload
7437,3187,49,non-payload
4925,26923,412,non-payload
13443,14116,387,non-payload
5504,28202,3088,non-payload
...,...,...,...
8199,4317,1360,non-payload
31275,40398,3621,non-payload
28586,37008,1767,non-payload
14328,15502,85,non-payload


In [5]:
validate_norads = df[~df.isin(train_norads)].dropna().sample(n=validate_size, random_state=seed)
validate_norads

Unnamed: 0,norad,count,type
13361,13983.0,2.0,non-payload
28810,37295.0,2183.0,non-payload
12494,12434.0,2.0,non-payload
17024,20085.0,5.0,non-payload
10430,8807.0,5.0,non-payload
...,...,...,...
39345,47569.0,3.0,non-payload
13358,13978.0,8.0,non-payload
13384,14018.0,3.0,non-payload
17483,20968.0,3414.0,non-payload


In [6]:
test_norads = df[~df.isin(train_norads) & ~df.isin(validate_norads)].dropna().sample(frac=1, random_state=seed)
test_norads

Unnamed: 0,norad,count,type
7177,2829.0,4.0,non-payload
21559,29109.0,223.0,non-payload
38404,46912.0,220.0,non-payload
39030,47262.0,3.0,non-payload
35870,6269.0,9.0,non-payload
...,...,...,...
25705,33834.0,6460.0,non-payload
37446,45739.0,19.0,non-payload
37683,46051.0,3.0,non-payload
38185,46626.0,183.0,non-payload


In [7]:
ilrs_sats = [16908, 1328, 36508, 43476, 43477, 41579, 43655, 46469, 43613, 41240, 39227, 38077, 27944, 43215, 39086,
             43189, 43188, 39068, 41335, 43437, 46984, 7646, 22824, 39452, 39451, 39453, 36605, 42829, 31698]
ilrs_split = (split_size[1]) / (split_size[1] + split_size[2])
validate_ilrs_size = math.ceil(len(ilrs_sats) * ilrs_split)
test_ilrs_size = len(ilrs_sats) - validate_ilrs_size

np.random.seed(seed)
validate_ilrs_sats = np.random.choice(ilrs_sats, validate_ilrs_size, replace=False).tolist()
test_ilrs_sats = [x for x in ilrs_sats if x not in validate_ilrs_sats]

validate_ilrs_df = pd.DataFrame({'norad': validate_ilrs_sats, 'count': [999]*len(validate_ilrs_sats), 'type': ['ilrs']*len(validate_ilrs_sats)})
test_ilrs_df = pd.DataFrame({'norad': test_ilrs_sats, 'count': [999]*len(test_ilrs_sats), 'type': ['ilrs']*len(test_ilrs_sats)})

display(validate_ilrs_df)
display(test_ilrs_df)

Unnamed: 0,norad,count,type
0,42829,999,ilrs
1,43188,999,ilrs
2,27944,999,ilrs
3,22824,999,ilrs
4,43613,999,ilrs
5,41240,999,ilrs
6,7646,999,ilrs
7,16908,999,ilrs
8,36605,999,ilrs
9,43215,999,ilrs


Unnamed: 0,norad,count,type
0,36508,999,ilrs
1,43476,999,ilrs
2,41579,999,ilrs
3,43655,999,ilrs
4,46469,999,ilrs
5,39227,999,ilrs
6,39086,999,ilrs
7,41335,999,ilrs
8,43437,999,ilrs
9,46984,999,ilrs


In [8]:
validate_norads = pd.concat([validate_norads, validate_ilrs_df]).reset_index(drop=True)
validate_norads

Unnamed: 0,norad,count,type
0,13983.0,2.0,non-payload
1,37295.0,2183.0,non-payload
2,12434.0,2.0,non-payload
3,20085.0,5.0,non-payload
4,8807.0,5.0,non-payload
...,...,...,...
5754,43189.0,999.0,ilrs
5755,38077.0,999.0,ilrs
5756,39068.0,999.0,ilrs
5757,1328.0,999.0,ilrs


In [9]:
test_norads = pd.concat([test_norads, test_ilrs_df]).reset_index(drop=True)
test_norads

Unnamed: 0,norad,count,type
0,2829.0,4.0,non-payload
1,29109.0,223.0,non-payload
2,46912.0,220.0,non-payload
3,47262.0,3.0,non-payload
4,6269.0,9.0,non-payload
...,...,...,...
5752,46984.0,999.0,ilrs
5753,39452.0,999.0,ilrs
5754,39451.0,999.0,ilrs
5755,39453.0,999.0,ilrs


In [10]:
train_norads.to_pickle('train_norads.pkl.gz')