In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from dataset import Dataset, to_device
from model import NN3FeaturesModel
from model import tims_add_combine_xyz_loss
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import normalize_data
import random
from time import time
from tqdm.notebook import tqdm

# Allow up to 999 columns to be shown when a dataframe is displayed.
pd.set_option('display.max_columns', 999)
# Show floating point values with 10 digits of precision in displayed output.
pd.set_option('display.precision', 10)

In [2]:
# In-memory cache of raw dataframes, keyed by strings such as 'sample_train'.
# It is handed to load_data(), which fills it on first use of each key.
raw_data = {}


def dataset_key(dataset='', validation=False):
    '''
    Build the cache / file key for a dataset split.

    Parameters
    ----------
    dataset : str
        dataset prefix, e.g. '' or 'sample_'

    validation : bool
        True selects the 'test' split, False selects the 'train' split

    Returns
    -------
    str
        the prefix immediately followed by 'test' or 'train'
    '''
    split = 'test' if validation else 'train'
    return dataset + split


def load_data(raw, dataset='', validation=False):
    '''
    Fetch the dataframe for a dataset split, reading it from disk only on a cache miss.

    Parameters
    ----------
    raw : dict
        cache of already loaded dataframes; a newly loaded frame is stored in it

    dataset : str
        dataset prefix, including any trailing underscore (e.g. '', 'sample_', 'secret_')

    validation : bool
        True appends 'test' to the prefix, False appends 'train'

    Returns
    -------
    the object cached under the resulting key

    Notes
    -----
    On a cache miss the pickle is read from
    $GP_HIST_PATH/../t7_data/<key>.pkl, so the GP_HIST_PATH environment
    variable must be set (a KeyError is raised otherwise).
    '''
    cache_key = dataset + ('test' if validation else 'train')

    # Cache hit: nothing to read.
    if cache_key in raw:
        return raw[cache_key]

    print(f"Loading data to cache for: {cache_key}")
    raw[cache_key] = pd.read_pickle(f'{os.environ["GP_HIST_PATH"]}/../t7_data/{cache_key}.pkl')
    return raw[cache_key]

In [3]:
def load_sub_model_with_config(train_config, model_configs, sub_model_key, X_count=0, force_recreate=False):
    '''
    Load the checkpoint of one sub model, or create and save a fresh one.

    Parameters
    ----------
    train_config : dict
        must contain 'model_path' and 'model_prefix'; the optional 'device'
        entry (default 'cpu') is the device the checkpoint is restored onto

    model_configs : dict
        maps sub model keys to their definition; each entry needs
        'feature_idxs', 'shared_model_definition' and 'model_definition'
        when a new model has to be created

    sub_model_key : str
        key into model_configs, also part of the checkpoint file name

    X_count : int
        number of input features, only needed when a new model is created

    force_recreate : bool
        ignore an existing checkpoint and start from a new model

    Returns
    -------
    tuple
        (net, loss_func, optimizer, mean_losses, next_epoch)

    Raises
    ------
    KeyError
        if sub_model_key is not in model_configs
    ValueError
        if a new model must be created but X_count is 0
    '''
    # NOTE (original author): a bit hacky, but in the training phase the minmax
    # scalers are never loaded or used; it would probably be better to save the
    # scalers separately.

    path = train_config['model_path']
    prefix = train_config['model_prefix']
    model_config = model_configs[sub_model_key]
    f = f"{path}/{prefix}{sub_model_key}.pth"
    if os.path.exists(f) and not force_recreate:
        print("Loading existing model")
        # The checkpoint holds whole pickled objects, so only load files you trust.
        # map_location restores every tensor onto the configured device, which
        # lets a checkpoint written on another device be loaded here.
        target_device = torch.device(train_config.get('device', 'cpu'))
        checkpoint = torch.load(f, map_location=target_device)
        net = checkpoint['net']
        loss_func = checkpoint['loss_func']
        optimizer = checkpoint['optimizer']
        mean_losses = checkpoint['mean_losses']
        next_epoch = checkpoint['next_epoch']
    else:
        if X_count == 0:
            raise ValueError('Cannot create model without X_count (number of input features)')
        print("New model created")
        net = NN3FeaturesModel(X_count, model_config['feature_idxs'], model_config['shared_model_definition'], model_config['model_definition'])
        # Placeholder loss / optimizer; callers are expected to override them later.
        loss_func = torch.nn.MSELoss()  # mean squared error, for regression
        optimizer = torch.optim.Adam(net.parameters())
        mean_losses = []
        next_epoch = 0
        # Persist immediately so the new model replaces any previous checkpoint.
        save_model_with_config(train_config, sub_model_key, net=net, loss_func=loss_func,
                               optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)
    return net, loss_func, optimizer, mean_losses, next_epoch

def save_model_with_config(train_config, sub_model_key, **kwargs):
    '''
    Write a checkpoint for one sub model.

    Parameters
    ----------
    train_config : dict
        must contain 'model_path' (target directory) and 'model_prefix'

    sub_model_key : str
        sub model name, used in the file name <model_prefix><sub_model_key>.pth

    **kwargs
        everything to store; the keyword dictionary is saved as-is with torch.save
    '''
    path = train_config['model_path']
    prefix = train_config['model_prefix']
    # Create the target directory on first use instead of failing in torch.save.
    if path:
        os.makedirs(path, exist_ok=True)
    f = f"{path}/{prefix}{sub_model_key}.pth"
    torch.save(kwargs, f)

In [4]:
def train_model(X_train, y_train, X_test, y_test, train_config, model_configs, sub_model_key, force_optimizer=None):
    '''
    Train one sub model, resuming from its checkpoint, until max_epochs is reached.

    After every epoch the network alone is written to
    <model_path>/model_history/<model_prefix><sub_model_key>_<epoch>.pth and the
    full checkpoint (net, loss_func, optimizer, mean_losses, next_epoch) is
    saved through save_model_with_config.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        features / targets handed to the project's Dataset class

    train_config : dict
        reads 'random_seed' (default 0), 'device' (default 'cpu'),
        'train_params' and 'test_params' (DataLoader keyword arguments),
        'max_epochs', 'do_validate', 'model_path' and 'model_prefix'

    model_configs : dict
        sub model definitions, forwarded to load_sub_model_with_config

    sub_model_key : str
        which sub model to train

    force_optimizer : optimizer or None
        when given, used instead of the optimizer stored in the checkpoint

    Returns
    -------
    tuple
        (net, mean_losses): net is left in eval mode, mean_losses is a list of
        (mean training loss, mean validation loss) per epoch; the validation
        entry is 0 when 'do_validate' is false
    '''
    torch.manual_seed(train_config.get('random_seed', 0))
    device = train_config.get('device', 'cpu')
    pyt_device = torch.device(device)
    training_set = Dataset(X_train, y_train)
    training_generator = torch.utils.data.DataLoader(training_set, **train_config['train_params'])
    testing_set = Dataset(X_test, y_test)
    testing_generator = torch.utils.data.DataLoader(testing_set, **train_config['test_params'])

    # X_count is the number of input features (the other callers pass
    # len(X_train.columns)), not the Dataset object. Fall back to 0 when the
    # width cannot be determined; that only matters if a model must be created.
    x_shape = getattr(X_train, 'shape', ())
    n_features = x_shape[1] if len(x_shape) > 1 else 0
    net, loss_func, optimizer, mean_losses, next_epoch = load_sub_model_with_config(
        train_config, model_configs, sub_model_key, n_features)

    if force_optimizer is not None:
        optimizer = force_optimizer
    to_device(net, pyt_device)
    net.train()
    if next_epoch >= train_config['max_epochs']:
        print("Model finished training. To train further raise max_epochs, or reset the model with force_recreate = True ")
        net.eval()
        return net, mean_losses

    path = train_config['model_path']
    prefix = train_config['model_prefix']
    # Per-epoch snapshots go into a sub directory; make sure it exists.
    os.makedirs(f"{path}/model_history", exist_ok=True)

    epbar = tqdm(range(next_epoch, train_config['max_epochs']))
    for epoch in epbar:
        epbar.set_description(f"Epoch {epoch}")

        running_eloss = 0
        running_vloss = 0

        ipbar = tqdm(training_generator, leave=False)
        ipbar.set_description("Training")

        for x, y in ipbar:
            x = to_device(x, pyt_device)
            y = to_device(y, pyt_device)

            optimizer.zero_grad()
            prediction = net(x)                 # forward pass
            loss = loss_func(prediction, y)     # argument order: (network output, target)
            loss.backward()                     # backpropagation, compute gradients
            optimizer.step()                    # apply gradients
            running_eloss += loss.item()

        # Switch to eval mode for validation and for the snapshot below.
        net.eval()
        mean_vlosses = 0
        if train_config['do_validate']:
            with torch.no_grad():
                vpbar = tqdm(testing_generator, leave=False)
                vpbar.set_description("Validating")
                for x, y in vpbar:
                    x = to_device(x, pyt_device)
                    y = to_device(y, pyt_device)
                    prediction = net(x)
                    loss = loss_func(prediction, y)
                    running_vloss += loss.item()
            mean_vlosses = running_vloss / len(testing_generator)

        # Snapshot of the network for this epoch; next_epoch still holds the
        # index this epoch will get in mean_losses.
        f = f"{path}/model_history/{prefix}{sub_model_key}_{next_epoch:04}.pth"
        torch.save(net, f)

        mean_elosses = running_eloss / len(training_generator)
        mean_losses.append((mean_elosses, mean_vlosses))
        next_epoch = len(mean_losses)

        save_model_with_config(train_config, sub_model_key, net=net, loss_func=loss_func,
                               optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)
        net.train()

        epbar.set_postfix({'train_loss': f"{mean_elosses:.12f}", 'val_loss': f"{mean_vlosses:.12f}"})
    net.eval()
    return net, mean_losses

In [5]:
def get_ref_X_y(df):
    '''
    Split a dataframe into three column groups selected by name prefix.

    Returns
    -------
    tuple
        (columns starting with '__', columns starting with 'X_',
        columns starting with 'y_'), each as a dataframe in original column order
    '''
    def _select(prefix):
        # Keep only the columns whose name begins with the prefix.
        return df[[name for name in df.columns if name.startswith(prefix)]]

    return (_select('__'), _select('X_'), _select('y_'))

In [6]:
# Settings shared by the load / save / train helpers in this notebook.
# Building this dict reads the GP_HIST_PATH environment variable and raises
# KeyError if it is not set.
train_config = {
    # Dataset prefix handed to load_data().
    'dataset' : 'sample_500_', # eg: '', 'sample_', 'sample2_', 'secret_', 'sample_big_'
    # Directory and file-name prefix of the checkpoints (<model_path>/<model_prefix><sub model>.pth).
    'model_path' : f"{os.environ['GP_HIST_PATH']}/../t7_models",
    'model_prefix' : "T7_2_", 
    # Device string passed to torch.device() in train_model().
    'device' : 'cpu',
    # Seed passed to torch.manual_seed() at the start of train_model().
    'random_seed' : 0,
    # train_model() loops from the checkpoint's next_epoch up to this value.
    'max_epochs' : 500,
    # When True, train_model() runs a validation pass over the test set after every epoch.
    'do_validate' : True,
    # Keyword arguments unpacked into the training DataLoader.
    'train_params' : {
        'batch_size': 100000,
        'shuffle': True,
        'num_workers': 5,
        'pin_memory': True,
    },
    # Keyword arguments unpacked into the validation DataLoader.
    'test_params' : {
        'batch_size': 200000,
        'num_workers': 4,
        'pin_memory': True,
    },
}

In [7]:
%%time

# Load the train and test frames for the configured dataset (cached in raw_data),
# normalize a copy of each so the cached raw frame is not the object passed to
# normalize_all_columns, then drop every row that still contains a missing value.
# NOTE(review): what normalize_all_columns does to the columns is defined in the
# project's normalize_data module - confirm there.
train_df = normalize_data.normalize_all_columns(load_data(raw_data,dataset=train_config['dataset'],validation=False).copy()).dropna()
test_df = normalize_data.normalize_all_columns(load_data(raw_data,dataset=train_config['dataset'],validation=True).copy()).dropna()
# Split each frame into reference ('__'), feature ('X_') and target ('y_') columns.
ref_train, X_train, y_train = get_ref_X_y(train_df)
ref_test, X_test, y_test = get_ref_X_y(test_df)
# Keep only these three target columns for training and validation.
y_cols = ['y_SAT_RX', 'y_SAT_RY', 'y_SAT_RZ']
y_train = y_train[y_cols]
y_test = y_test[y_cols]

Loading data to cache for: sample_500_train
Loading data to cache for: sample_500_test
CPU times: user 36.6 s, sys: 36.4 s, total: 1min 12s
Wall time: 1min 1s


In [8]:
# Definition of every sub model, keyed by sub model name. Each entry is passed
# piecewise to NN3FeaturesModel by load_sub_model_with_config:
#   'feature_idxs'            - positions of three named columns within X_train
#   'shared_model_definition' - list of (kind, value) tuples
#   'model_definition'        - list of (kind, value) tuples
# NOTE(review): how NN3FeaturesModel interprets the ('layer' | 'relu' | 'drop', value)
# tuples is defined in the project's model module - confirm there.
model_configs = {
    'position': {
        'feature_idxs': [
            X_train.columns.get_loc(col)
            for col in ('X_SGP4_SAT_RX', 'X_SGP4_SAT_RY', 'X_SGP4_SAT_RZ')
        ],
        'shared_model_definition': [('layer', 1000), ('relu', None), ('drop', 0.5)],
        # Eight 100-wide groups followed by five 60-wide groups.
        'model_definition': (
            [('layer', 100), ('relu', None), ('drop', 0.5)] * 8
            + [('layer', 60), ('relu', None), ('drop', 0.5)] * 5
        ),
    },
    'velocity': {
        'feature_idxs': [
            X_train.columns.get_loc(col)
            for col in ('X_SGP4_SAT_VX', 'X_SGP4_SAT_VY', 'X_SGP4_SAT_VZ')
        ],
        'shared_model_definition': (
            [('layer', 400), ('relu', None), ('drop', 0.5)]
            + [('layer', 200), ('relu', None), ('drop', 0.5)]
            + [('layer', 100), ('relu', None), ('drop', 0.5)]
        ),
        # Three 50-wide groups.
        'model_definition': [('layer', 50), ('relu', None), ('drop', 0.5)] * 3,
    },
}

In [9]:
# Load the checkpoint of every configured sub model (creating and saving a new
# model when no checkpoint file exists) and print a short status line for each.
# all_models maps sub model key -> (model, loss_func, optimizer, mean_losses, next_epoch).
all_models = {}
for sub_key in model_configs.keys():
    # When new models are created, a dummy optimizer is used
    model, loss_func, optimizer, mean_losses, next_epoch = load_sub_model_with_config(train_config, model_configs, sub_key, len(X_train.columns), force_recreate=False)
    # mean_losses[-1:] is a slice, so an empty loss history prints [] instead of raising.
    print(f'Model for "{sub_key}" __ Last loss: {mean_losses[-1:]} __ Epoch: {next_epoch}')
    print(optimizer, loss_func)
    all_models[sub_key] = (model, loss_func, optimizer, mean_losses, next_epoch)

Loading existing model
Model for "position" __ Last loss: [(0.0006314650924879887, 0.0013020476384813914)] __ Epoch: 54
SGD (
Parameter Group 0
    dampening: 0
    lr: 1e-06
    momentum: 0.95
    nesterov: False
    weight_decay: 0
) MSELoss()
Loading existing model
Model for "velocity" __ Last loss: [] __ Epoch: 0
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
) MSELoss()


In [28]:
# Choose which sub model to work on and which loss / optimizer it should use.
# Nothing in this cell is persisted; the following cell writes the result to the
# checkpoint, and train_model() reloads that checkpoint before training.

# possible submodel keys
# 'position' or 'velocity'

# set a specific target to train
submodel_to_train = "position"

# set force_recreate to True if you want to reset the model from scratch
force_recreate = False

# load / recreate the model
net, loss_func, optimizer, mean_losses, next_epoch = load_sub_model_with_config(
    train_config, model_configs, submodel_to_train, len(X_train.columns), force_recreate=force_recreate)


# Optional: uncomment to roll back to an earlier epoch snapshot from model_history
# and truncate the loss history to match.
# # revert to the best validation loss epoch
# min_val_loss_idx = np.argmin(np.array(mean_losses)[:,1])
# min_val_loss_idx = 42 # set a specific epoch by hand
# print(f'Reverting to epoch: {min_val_loss_idx}')
# f = f"{train_config['model_path']}/model_history/{train_config['model_prefix']}{submodel_to_train}_{min_val_loss_idx:04}.pth"
# net = torch.load(f)
# mean_losses = mean_losses[:min_val_loss_idx+1]
# next_epoch = len(mean_losses)


# # Only allow loss function to change during new model creation
# (for an existing model the loss function loaded from the checkpoint is kept)
if force_recreate == True:
    loss_func = torch.nn.MSELoss()
#     loss_func = torch.nn.SmoothL1Loss()
#     loss_func = torch.nn.L1Loss()
#     loss_func = tims_add_combine_xyz_loss
    pass

# # Override optimizer here
# This always replaces the optimizer returned by the load above with a new one
# bound to the parameters of net.
# optimizer = torch.optim.AdamW(net.parameters(), lr=1e-6, weight_decay=0.1, eps=1e-8, amsgrad=True)
optimizer = torch.optim.SGD(net.parameters(), lr=1e-5, momentum=0.99)

Loading existing model


In [29]:
# Saving is a separate cell in case something went wrong above.
# This writes net, loss_func, optimizer, mean_losses and next_epoch to the
# checkpoint file that train_model() loads for the selected sub model.
save_model_with_config(train_config, submodel_to_train, net=net, loss_func=loss_func,
                       optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)

In [30]:
# Train the selected sub model, continuing from its saved checkpoint.
# y_train / y_test were already restricted to y_cols when they were built, so
# selecting y_cols again here leaves the same columns.
train_model(X_train, y_train[y_cols], X_test, y_test[y_cols], train_config, model_configs, submodel_to_train)

Loading existing model


  0%|          | 0/447 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f9b95e9dca0>
Traceback (most recent call last):
  File "/data1/home/ttcchen/anaconda3/envs/siads-orbital/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/data1/home/ttcchen/anaconda3/envs/siads-orbital/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/data1/home/ttcchen/anaconda3/envs/siads-orbital/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/data1/home/ttcchen/anaconda3/envs/siads-orbital/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/data1/home/ttcchen/anaconda3/envs/siads-orbital/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/data1/home/ttcchen/a

KeyboardInterrupt: 