## Load Raw Data into Train/Val/Test sets and Save
This also has the ability to write the raw data files into train/validate/test files.  These files only contain the raw TLE data.  Extra pre-processing is still required as well as the assembly of input/label data.

In [1]:
from load_data import *

In [2]:
norad_lists = load_norads(['train','validate','secret_test'])
df_dict = load_data(norad_lists, use_all_data=True, debug=True, multiproc=True)  # Takes about 4min

Loading files from path: C:\Datasets\gp_history


100%|██████████████████████████████████████████████████████████████████████████████| 1472/1472 [03:34<00:00,  6.85it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Finished loading.


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.99s/it]

Finished assembling.





In [3]:
write_data(df_dict, use_all_data=True, debug=True, threaded=True)

Saving files to path: C:\Datasets\gp_history/raw_compiled
Writing raw data for to: C:\Datasets\gp_history/raw_compiled/train.pkl
Writing raw data for to: C:\Datasets\gp_history/raw_compiled/test.pklWriting raw data for to: C:\Datasets\gp_history/raw_compiled/secret_test.pkl

Finished saving C:\Datasets\gp_history/raw_compiled/secret_test.pkl
Finished saving C:\Datasets\gp_history/raw_compiled/test.pkl
Finished saving C:\Datasets\gp_history/raw_compiled/train.pkl


## Load Raw Train set and create a model

In [1]:
import pandas as pd
import os

files = ['train']

%time df = pd.read_pickle(os.environ['GP_HIST_PATH'] + '/raw_compiled/train.pkl' )  # Takes about 20s

Wall time: 18.4 s


In [2]:
import clean_data

%time df = clean_data.add_epoch_data(df)  # Takes about 4min

Wall time: 4min 2s


In [None]:
import numpy as np

def create_index_map(df):
    '''
    This will create a map between an input record (for X_train) and
    a label record (for y_train) that will be used by the pytorch
    dataset class to dynamically build a dataset without taking up
    more space than is necessary.
    '''
    
    # ML Structure
    # Input:
    #  - Reference TLE Data (+ EPOCH)
    #  - Target EPOCH
    # Output:
    #  - Target TLE Data
    
    def groups(lst):
        arr = lst.copy()
        np.random.shuffle(arr)
        i=1
        if len(lst)<=1:
            return
        while True:
            if i==len(lst):
                yield tuple((arr[i-1],arr[0]))
                break
            else:
                yield tuple((arr[i-1],arr[i]))
                i+=1
    
    # For each unique NORAD, find all TLE indexes and generate
    # a list of combinations
    idx_pairs = []
    for norad in df['NORAD_CAT_ID'].unique():
        norad_idxs = df[df['NORAD_CAT_ID']==norad].index.values
        if len(norad_idxs > 1):
            idx_pairs.extend(groups(norad_idxs))
    idx_pairs = np.array(idx_pairs)
    
#     # Build our X/Y datasets
#     X_all = df.loc[idx_pairs[:,0]].reset_index()
#     Y_all = df.loc[idx_pairs[:,1]].reset_index()
    
#     # This will be the column that links x and y
#     key_columns = ['epoch_jd', 'epoch_fr']
#     target_columns = ['target_epoch_jd', 'target_epoch_fr']
#     X_all[target_columns] = Y_all[key_columns]
    
    return idx_pairs

%time idx_pairs = create_index_map(df)  # 16min - look at ways to improve this through parallelism/concurrency

In [68]:
# Test our dataset structure
import torch

model_cols = ['MEAN_MOTION_DOT', 'MEAN_MOTION_DDOT', 'BSTAR', 'INCLINATION', 'RA_OF_ASC_NODE',
              'ECCENTRICITY', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY', 'MEAN_MOTION', 'epoch_jd', 'epoch_fr']

t1 = torch.from_numpy(df.to_numpy())  # data
p = torch.tensor([[0,1]])

index = 0
X_pre = t1[p[index][0]]
X = torch.cat((X_pre, t1[p[index][1]][-2:]), 0)
y = t1[p[index][1]][:-2]
display(X)
display(y)

tensor([1.8010e-05, 0.0000e+00, 2.5919e-03, 6.2242e+01, 1.8016e+02, 7.0489e-02,
        2.6568e+02, 8.6277e+01, 1.2853e+01, 2.4531e+06, 5.9639e-01, 2.4531e+06,
        6.6644e-01], dtype=torch.float64)

tensor([-2.0000e-08,  0.0000e+00,  1.0000e-04,  7.3360e+01,  3.4569e+02,
         8.8152e-03,  2.7040e+02,  8.8691e+01,  1.2642e+01],
       dtype=torch.float64)

In [62]:
import torch
import torch.nn as nn
torch.manual_seed(0)

hiddenSize = 300
batchSize = 200
learningRate = 0.01
numEpochs = 10

model_cols = ['MEAN_MOTION_DOT', 'MEAN_MOTION_DDOT', 'BSTAR', 'INCLINATION', 'RA_OF_ASC_NODE',
              'ECCENTRICITY', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY', 'MEAN_MOTION', 'epoch_jd', 'epoch_fr']

#device = torch.device('cpu')
device = torch.device('cuda')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data, idx_pairs, device='cpu'):
        'Initialization'
        self.data = to_device(torch.from_numpy(data.to_numpy()).float(), device)
        self.idx_pairs = to_device(torch.from_numpy(idx_pairs).float(), device)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.idx_pairs)

    def __getitem__(self, index):
        'Generates one sample of data'
        p = self.idx_pairs[index]
        
        # This will use the idx_pairs (x,y) to build the inputs(X) and labels (y)
        # output.  It adds the last 2 columns of y to X and removes them from y.
        X = torch.cat((self.data[p[index][0]], self.data[p[index][1]][-2:]), 0)
        y = self.data[p[index][1]][:-2]

        return X, y

In [64]:
class NNModel(nn.Module):
    def __init__(self, inputSize, outputSize, hiddenSize, activate=None):
        super().__init__()
        self.activate = nn.Sigmoid() if activate == "Sigmoid" else nn.Tanh() if activate == "Tanh" else nn.ReLU()
        self.layer1 = nn.Linear(inputSize, hiddenSize)
        self.layer2 = nn.Linear(hiddenSize, outputSize)

    def forward(self, X):
        hidden = self.activate(self.layer1(X))
        return self.layer2(hidden)
        
        
net = NNModel(len(model_cols) + 2, len(model_cols) - 2, hiddenSize)
to_device(net, device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(net.parameters(), lr=learningRate)

trainDataset = Dataset(df[model_cols], idx_pairs, device)
trainLoader = torch.utils.data.DataLoader(dataset=trainDataset,
                                          batch_size=batchSize,
                                          shuffle=True,
                                         )

In [69]:
df[model_cols]

Unnamed: 0,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,epoch_jd,epoch_fr
0,1.801000e-05,0.000000,0.002592,62.2415,180.1561,0.070489,265.6761,86.2771,12.852684,2453122.5,0.596391
1,-2.000000e-08,0.000000,0.000100,73.3600,345.6887,0.008815,270.3999,88.6911,12.642166,2453122.5,0.666444
2,1.280000e-05,0.000000,0.001076,83.0239,250.9465,0.008493,184.3222,175.7249,13.856401,2453122.5,0.823075
3,1.320000e-06,0.000000,0.000166,70.9841,207.4830,0.020756,161.3777,199.5075,13.715209,2453122.5,0.654993
4,2.280000e-06,0.000000,0.000739,90.1460,192.1834,0.002746,300.4617,59.3655,12.992417,2453122.5,0.154908
...,...,...,...,...,...,...,...,...,...,...,...
54899682,1.871000e-05,0.000000,0.000263,51.4710,233.3741,0.152907,349.1034,7.9896,12.139866,2459296.5,0.821280
54899683,2.870000e-05,0.000000,0.000430,62.8880,161.0401,0.046410,125.6077,238.9153,14.233898,2459297.5,0.189309
54899684,6.427000e-05,0.000000,0.003201,51.3447,252.9299,0.083999,62.7382,305.6523,13.083883,2459296.5,0.659999
54899685,7.216000e-05,0.000000,0.000945,97.9051,133.8333,0.000740,263.2779,96.7592,14.806388,2459297.5,0.102227


In [65]:
%%time
print('>>> Beginning training!')
for epoch in range(numEpochs):
    for i, (inputs, labels) in enumerate(trainLoader):
        optimizer.zero_grad()
        # Forward propagation
        outputs = net(inputs)
        # Backpropagation
        loss = criterion(outputs, labels)
        loss.backward()
        # Gradient descent
        optimizer.step()
        # Logging
        if (i+1) % 1000 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {}'.format(epoch+1, numEpochs, i+1,
                                                                 len(trainDataset)//batchSize, loss))

>>> Beginning training!


IndexError: index 50238054 is out of bounds for dimension 0 with size 2