In [1]:
import pandas as pd
import numpy as np
import math
import os

df = pd.read_csv(os.environ['my_home_path'] + '/data/norad_debris_count.csv')

In [2]:
# Parameters
minimum_tles = 1
minimum_tles_in_trainset = 10
split_size = (0.70, 0.15, 0.15)
seed = 42

# this list is from 'find_ILRS_sats.ipynb' - Ideally this would be loaded from a file...oh well
ilrs_sats = [16908, 1328, 36508, 43476, 43477, 41579, 43655, 46469, 43613, 41240, 39227, 38077, 27944, 43215, 39086,
             43189, 43188, 39068, 41335, 43437, 46984, 7646, 22824, 39452, 39451, 39453, 36605, 42829, 31698]

# Validate split_size
if sum(split_size) != 1 or len(split_size) != 3 or all([x>0for x in split_size]) == False:
    raise Exception('Split size must contain 3 greater-than-zero floating points that add to 1.0')

In [3]:
df['type'] = 'non-payload'

# ignore all norads with TLE count less than minimum_tles
df = df[(df['count'] > minimum_tles) & ~df['norad'].isin(ilrs_sats)]

# find the norads suitable for training
suitable_train_df = df[df['count'] >= minimum_tles_in_trainset]

df_suitable_size = len(suitable_train_df)

train_size = math.ceil(len(df) * split_size[0])
validate_size = math.ceil(len(df) * split_size[1])
test_size = len(df) - (train_size + validate_size)

# Make sure we have enough records to make a train set
if train_size > df_suitable_size:
    raise Exception('Not enough NORADs for training.  Reduce minimum_tles_in_trainset.')

sizes = (train_size, validate_size, test_size)
print(f'Train/Validate/Test Sizes: {sizes}')

Train/Validate/Test Sizes: (16406, 3516, 3515)


In [4]:
train_norads = suitable_train_df.sample(n=train_size, random_state=seed)
train_norads

Unnamed: 0,norad,count,type
23062,45825,540,non-payload
4049,20283,8050,non-payload
8495,17321,811,non-payload
7063,10427,175,non-payload
17517,35821,1502,non-payload
...,...,...,...
21900,42457,308,non-payload
3407,23318,13927,non-payload
18176,35309,137,non-payload
16623,34834,983,non-payload


In [5]:
validate_norads = df[~df.isin(train_norads)].dropna().sample(n=validate_size, random_state=seed)
validate_norads

Unnamed: 0,norad,count,type
8511,17783.0,528.0,non-payload
22793,44976.0,126.0,non-payload
20665,24938.0,794.0,non-payload
10369,23916.0,5.0,non-payload
13716,31375.0,1427.0,non-payload
...,...,...,...
9494,21859.0,149.0,non-payload
15780,33874.0,6588.0,non-payload
14823,32657.0,462.0,non-payload
3453,26838.0,4066.0,non-payload


In [6]:
test_norads = df[~df.isin(train_norads) & ~df.isin(validate_norads)].dropna().sample(frac=1, random_state=seed)
test_norads

Unnamed: 0,norad,count,type
16698,31866.0,594.0,non-payload
557,27115.0,9349.0,non-payload
2831,12531.0,13549.0,non-payload
2568,26152.0,8755.0,non-payload
14938,32762.0,2.0,non-payload
...,...,...,...
7958,15310.0,9.0,non-payload
9078,18914.0,193.0,non-payload
6349,5875.0,504.0,non-payload
23618,47326.0,136.0,non-payload


In [7]:
ilrs_split = (split_size[1]) / (split_size[1] + split_size[2])
validate_ilrs_size = math.ceil(len(ilrs_sats) * ilrs_split)
test_ilrs_size = len(ilrs_sats) - validate_ilrs_size

np.random.seed(seed)
validate_ilrs_sats = np.random.choice(ilrs_sats, validate_ilrs_size, replace=False).tolist()
test_ilrs_sats = [x for x in ilrs_sats if x not in validate_ilrs_sats]

validate_ilrs_df = pd.DataFrame({'norad': validate_ilrs_sats, 'count': [999]*len(validate_ilrs_sats), 'type': ['ilrs']*len(validate_ilrs_sats)})
test_ilrs_df = pd.DataFrame({'norad': test_ilrs_sats, 'count': [999]*len(test_ilrs_sats), 'type': ['ilrs']*len(test_ilrs_sats)})

display(validate_ilrs_df)
display(test_ilrs_df)

Unnamed: 0,norad,count,type
0,42829,999,ilrs
1,43188,999,ilrs
2,27944,999,ilrs
3,22824,999,ilrs
4,43613,999,ilrs
5,41240,999,ilrs
6,7646,999,ilrs
7,16908,999,ilrs
8,36605,999,ilrs
9,43215,999,ilrs


Unnamed: 0,norad,count,type
0,36508,999,ilrs
1,43476,999,ilrs
2,41579,999,ilrs
3,43655,999,ilrs
4,46469,999,ilrs
5,39227,999,ilrs
6,39086,999,ilrs
7,41335,999,ilrs
8,43437,999,ilrs
9,46984,999,ilrs


In [8]:
validate_norads = pd.concat([validate_norads, validate_ilrs_df]).reset_index(drop=True)
validate_norads

Unnamed: 0,norad,count,type
0,17783.0,528.0,non-payload
1,44976.0,126.0,non-payload
2,24938.0,794.0,non-payload
3,23916.0,5.0,non-payload
4,31375.0,1427.0,non-payload
...,...,...,...
3526,43189.0,999.0,ilrs
3527,38077.0,999.0,ilrs
3528,39068.0,999.0,ilrs
3529,1328.0,999.0,ilrs


In [9]:
test_norads = pd.concat([test_norads, test_ilrs_df]).reset_index(drop=True)
test_norads

Unnamed: 0,norad,count,type
0,31866.0,594.0,non-payload
1,27115.0,9349.0,non-payload
2,12531.0,13549.0,non-payload
3,26152.0,8755.0,non-payload
4,32762.0,2.0,non-payload
...,...,...,...
3524,46984.0,999.0,ilrs
3525,39452.0,999.0,ilrs
3526,39451.0,999.0,ilrs
3527,39453.0,999.0,ilrs


In [10]:
train_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/train_norads.pkl.gz')
validate_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/validate_norads.pkl.gz')
test_norads.to_pickle(os.environ['my_home_path'] + '/data/split_by_norad/test_norads.pkl.gz')