In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from dataset import Dataset, to_device
from model import NNModelXYZ
from model import tims_mse_loss, tims_mae_loss, tim95_mse_loss
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import normalize_data
import random
from time import time
from tqdm.notebook import tqdm

# Notebook-wide pandas display settings (they affect rendering only, not the data).
# Allow up to 999 columns to be shown before pandas truncates a wide frame.
pd.set_option('display.max_columns', 999)
# Show floating point values with 10 decimal places.
pd.set_option('display.precision', 10)

In [2]:
raw_data = {}  # notebook-level cache: dataset key -> dataframe, filled lazily by load_data


def dataset_key(dataset='', validation=False):
    '''
    Build the cache key (and pickle file stem) for one split of a dataset.

    Parameters
    ----------
    dataset : str
        dataset prefix, e.g. empty str for the full set, 'sample_' or 'secret_'

    validation : bool
        True selects the 'test' split, False the 'train' split

    Returns
    -------
    str
        the prefix followed by 'test' or 'train', e.g. 'sample_test'
    '''
    return dataset + ('test' if validation else 'train')


def load_data(raw, dataset='', validation=False):
    '''
    Return dataframe matching data set and validation. Dictionary input will be updated.

    Parameters
    ----------
    raw : dict
        dictionary which caches the dataframes and will be updated accordingly

    dataset : str
        which dataset to use? valid input includes: empty str for full set, sample_, and secret_

    validation : bool
        load validation set? if true the file '<dataset>test.pkl' is used, otherwise
        '<dataset>train.pkl'.  Note secret_ doesn't have a train file

    Returns
    -------
    object
        whatever is cached under the key; on a cache miss this is the object unpickled
        from $GP_HIST_PATH/../t6_data/<key>.pkl

    Raises
    ------
    KeyError
        on a cache miss when the GP_HIST_PATH environment variable is not set
    '''
    # Use the shared helper so the cache key and the file name can never drift apart.
    key = dataset_key(dataset, validation)
    if key not in raw:
        print(f"Loading data to cache for: {key}")
        # NOTE: read_pickle executes arbitrary pickled code; only load trusted files.
        raw[key] = pd.read_pickle(f'{os.environ["GP_HIST_PATH"]}/../t6_data/{key}.pkl')
    return raw[key]

In [3]:
def load_model_with_config(train_config, model_config, X_count=0, force_recreate=False):
    '''
    Load the training checkpoint described by train_config, or create and save a new one.

    Parameters
    ----------
    train_config : dict
        'model_path' and 'model_prefix' locate the checkpoint at
        '<model_path>/<model_prefix>model.pth'; optional 'device' (default 'cpu') is the
        device the checkpoint tensors are mapped onto when loading

    model_config : dict
        not read by this function; accepted so callers can pass it uniformly

    X_count : int
        number of input features handed to NNModelXYZ; only needed when a model is created

    force_recreate : bool
        if true, ignore any existing checkpoint and overwrite it with a new model

    Returns
    -------
    tuple
        (net, loss_func, optimizer, mean_losses, next_epoch)

    Raises
    ------
    ValueError
        a new model has to be created but X_count is 0
    '''
    path = train_config['model_path']
    prefix = train_config['model_prefix']
    f = f"{path}/{prefix}model.pth"
    if os.path.exists(f) and not force_recreate:
        print("Loading existing model")
        # Map onto the configured device so a checkpoint written on a GPU machine can
        # still be opened on a CPU-only one.
        map_location = torch.device(train_config.get('device', 'cpu'))
        try:
            # The checkpoint holds whole pickled objects (module, optimizer, loss), which
            # newer PyTorch refuses to unpickle unless weights_only=False is explicit.
            # NOTE: this runs arbitrary pickled code; only load checkpoints you trust.
            checkpoint = torch.load(f, map_location=map_location, weights_only=False)
        except TypeError:
            # PyTorch versions that predate the weights_only argument.
            checkpoint = torch.load(f, map_location=map_location)
        net = checkpoint['net']
        loss_func = checkpoint['loss_func']
        optimizer = checkpoint['optimizer']
        mean_losses = checkpoint['mean_losses']
        next_epoch = checkpoint['next_epoch']
    else:
        if X_count == 0:
            raise ValueError('Cannot create model without X_count (number of input features)')
        print("New model created")
        net = NNModelXYZ(X_count, 8)
        loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss
        # Placeholder optimizer; callers are expected to override it before training.
        optimizer = torch.optim.Adam(net.parameters())
        mean_losses = []
        next_epoch = 0
        # Persist immediately so a later load finds the freshly created state.
        save_model_with_config(train_config, net=net, loss_func=loss_func,
                               optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)
    return net, loss_func, optimizer, mean_losses, next_epoch

def save_model_with_config(train_config, **kwargs):
    '''
    Write the given keyword arguments as one checkpoint file via torch.save.

    Parameters
    ----------
    train_config : dict
        'model_path' and 'model_prefix' give the target '<model_path>/<model_prefix>model.pth'

    **kwargs
        the entries to store; the dict of keyword arguments is saved as-is

    Notes
    -----
    An existing checkpoint at the target path is overwritten.
    '''
    path = train_config['model_path']
    prefix = train_config['model_prefix']
    if path:
        # A first save into a new model directory should not fail on the missing folder.
        os.makedirs(path, exist_ok=True)
    torch.save(kwargs, f"{path}/{prefix}model.pth")

In [4]:
def train_model(X_train, y_train, X_test, y_test, train_config, model_config, force_optimizer=None):
    '''
    Train the checkpointed model up to train_config['max_epochs'], saving after every epoch.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        inputs/targets wrapped in Dataset for the training and validation loaders.
        X_train must expose .shape[1] as its feature count (presumably a 2-D dataframe
        or array -- TODO confirm against Dataset)

    train_config : dict
        reads 'random_seed' (default 0), 'device' (default 'cpu'), 'train_params' and
        'test_params' (DataLoader keyword arguments), 'max_epochs', 'do_validate',
        'model_path' and 'model_prefix'

    model_config : dict
        forwarded to load_model_with_config

    force_optimizer : optimizer or None
        if given, used instead of the optimizer stored in the checkpoint

    Returns
    -------
    tuple
        (net, mean_losses) where net is left in eval mode and mean_losses is a list of
        (mean training loss, mean validation loss) per epoch; the validation entry is 0
        when 'do_validate' is false
    '''
    torch.manual_seed(train_config.get('random_seed', 0))
    pyt_device = torch.device(train_config.get('device', 'cpu'))

    training_set = Dataset(X_train, y_train)
    training_generator = torch.utils.data.DataLoader(training_set, **train_config['train_params'])
    testing_set = Dataset(X_test, y_test)
    testing_generator = torch.utils.data.DataLoader(testing_set, **train_config['test_params'])

    # X_count is the number of input features. It is only used if no checkpoint exists
    # yet, which is why passing the Dataset object here previously went unnoticed.
    net, loss_func, optimizer, mean_losses, next_epoch = load_model_with_config(
        train_config, model_config, X_train.shape[1])

    if force_optimizer is not None:
        optimizer = force_optimizer
    to_device(net, pyt_device)
    net.train()

    if next_epoch >= train_config['max_epochs']:
        print("Model finished training. To train further raise max_epochs, "
              "or recreate the model with force_recreate=True.")
        net.eval()
        return net, mean_losses

    path = train_config['model_path']
    prefix = train_config['model_prefix']
    history_dir = f"{path}/model_history"
    # Per-epoch snapshots go into a sub-folder that may not exist on a first run.
    os.makedirs(history_dir, exist_ok=True)

    epbar = tqdm(range(next_epoch, train_config['max_epochs']))
    for epoch in epbar:
        epbar.set_description(f"Epoch {epoch}")

        running_eloss = 0
        running_vloss = 0

        ipbar = tqdm(training_generator, leave=False)
        ipbar.set_description("Training")

        for x, y in ipbar:
            x = to_device(x, pyt_device)
            y = to_device(y, pyt_device)

            optimizer.zero_grad()
            prediction = net(x)     # input x and predict based on x
            loss = loss_func(prediction, y)     # must be (1. nn output, 2. target)
            loss.backward()         # backpropagation, compute gradients
            optimizer.step()        # apply gradients
            running_eloss += loss.item()

        # Switch to eval mode for validation and for the snapshots saved below.
        net.eval()
        mean_vlosses = 0
        if train_config['do_validate']:
            with torch.no_grad():
                vpbar = tqdm(testing_generator, leave=False)
                vpbar.set_description("Validating")
                for x, y in vpbar:
                    x = to_device(x, pyt_device)
                    y = to_device(y, pyt_device)
                    prediction = net(x)
                    loss = loss_func(prediction, y)
                    running_vloss += loss.item()
            mean_vlosses = running_vloss / len(testing_generator)

        # Snapshot of the whole module for this epoch, so any epoch can be restored later.
        f = f"{history_dir}/{prefix}model_{next_epoch:04}.pth"
        torch.save(net, f)

        mean_elosses = running_eloss / len(training_generator)
        mean_losses.append((mean_elosses, mean_vlosses))
        next_epoch = len(mean_losses)

        # Resume point: a restart continues from next_epoch with the same optimizer.
        save_model_with_config(train_config, net=net, loss_func=loss_func,
                               optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)
        net.train()

        epbar.set_postfix({'train_loss': f"{mean_elosses:.12f}", 'val_loss': f"{mean_vlosses:.12f}"})
    net.eval()
    return net, mean_losses

In [5]:
def get_ref_X_y(df):
    '''
    Split a dataframe into three column groups selected by column-name prefix.

    Parameters
    ----------
    df : DataFrame
        frame whose column names are strings

    Returns
    -------
    tuple
        (columns starting with '__', columns starting with 'X_', columns starting with 'y_'),
        each a sub-frame of df with the original column order kept
    '''
    def pick(prefix):
        # Columns matching no prefix simply end up in none of the groups.
        return df[[name for name in df.columns if name.startswith(prefix)]]

    return pick('__'), pick('X_'), pick('y_')

In [6]:
# Settings shared by the data-loading cell, load_model_with_config, save_model_with_config
# and train_model. Requires the GP_HIST_PATH environment variable to be set.
train_config = {
    'dataset' : 'sample2_', # eg: '', 'sample_', 'sample2_', 'secret_', 'sample_big_'
    # directory holding '<model_prefix>model.pth' and the 'model_history' snapshots
    'model_path' : f"{os.environ['GP_HIST_PATH']}/../t6_models",
    'model_prefix' : "TRY_2_",  # file name prefix, distinguishes experiments in model_path
    'device' : 'cpu',  # passed to torch.device in train_model
    'random_seed' : 0,  # passed to torch.manual_seed in train_model
    'max_epochs' : 500,  # training stops once the checkpoint's next_epoch reaches this
    'do_validate' : True,  # run the validation pass after each epoch
    # keyword arguments for the training DataLoader
    'train_params' : {
        'batch_size': 50000,
        'shuffle': True,
        'num_workers': 4,
        'pin_memory': True,
    },
    # keyword arguments for the validation DataLoader
    'test_params' : {
        'batch_size': 100000,
        'num_workers': 4,
        'pin_memory': True,
    },
}

In [7]:
%%time

# Training split: load (cached in raw_data), normalise every column, drop rows with NaNs,
# then separate reference / feature / target columns.
train_df = normalize_data.normalize_all_columns(
    load_data(raw_data, dataset=train_config['dataset'], validation=False)
).dropna()
ref_train, X_train, y_train = get_ref_X_y(train_df)

# Validation split: same pipeline applied to the matching test file.
test_df = normalize_data.normalize_all_columns(
    load_data(raw_data, dataset=train_config['dataset'], validation=True)
).dropna()
ref_test, X_test, y_test = get_ref_X_y(test_df)

Loading data to cache for: sample2_train
Loading data to cache for: sample2_test
CPU times: user 1min 8s, sys: 1min 8s, total: 2min 16s
Wall time: 1min 55s


In [8]:
# Twenty identical stages, each a ('layer', 300), ('relu', None), ('drop', 0.5) triple,
# flattened into one list of 60 (kind, argument) entries.
# NOTE(review): load_model_with_config in this notebook accepts model_config but does not
# read it -- confirm where 'model_definition' is actually consumed.
model_config = {
    'model_definition': [('layer', 300), ('relu', None), ('drop', 0.5)] * 20,
}

In [12]:
# Flip to True to discard any saved checkpoint and start again from a fresh model.
force_recreate = False

# Fetch the checkpointed state, or build a new model sized to the feature columns.
net, loss_func, optimizer, mean_losses, next_epoch = load_model_with_config(
    train_config,
    model_config,
    len(X_train.columns),
    force_recreate=force_recreate,
)


# Optional recipe (disabled): roll back to the epoch with the best validation loss.
# The snapshot name follows what train_model writes into model_history.
# min_val_loss_idx = np.argmin(np.array(mean_losses)[:,1])
# min_val_loss_idx = 130 # set a specific epoch by hand
# print(f'Reverting to epoch: {min_val_loss_idx}')
# f = f"{train_config['model_path']}/model_history/{train_config['model_prefix']}model_{min_val_loss_idx:04}.pth"
# net = torch.load(f)
# mean_losses = mean_losses[:min_val_loss_idx+1]
# next_epoch = len(mean_losses)


# The loss function may only be replaced while a new model is being created.
if force_recreate:
    loss_func = torch.nn.MSELoss()
    # Alternatives that have been tried:
    # loss_func = torch.nn.SmoothL1Loss()
    # loss_func = torch.nn.L1Loss()
    # loss_func = tims_mse_loss
    # loss_func = tims_mae_loss
    # loss_func = tim95_mse_loss

# The optimizer is always replaced here, whether the model was loaded or newly created.
optimizer = torch.optim.AdamW(net.parameters(), lr=1e-6, weight_decay=0.1, eps=1e-8, amsgrad=True)
# optimizer = torch.optim.SGD(net.parameters(), lr=1e-7, momentum=0.97)

Loading existing model


In [13]:
# Write the state assembled in the previous cell (including the overridden optimizer) to
# the checkpoint file. Saving lives in its own cell so that, in case something went wrong
# above, the checkpoint on disk is not overwritten by accident.
save_model_with_config(train_config, net=net, loss_func=loss_func,
                       optimizer=optimizer, mean_losses=mean_losses, next_epoch=next_epoch)

In [14]:
# Continue training from the checkpoint on disk: train_model reloads the saved state itself,
# so the optimizer written by the save cell is the one used. The returned
# (net, mean_losses) tuple is not assigned to any variable here.
train_model(X_train, y_train, X_test, y_test, train_config, model_config)

Loading existing model


  0%|          | 0/467 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1124 [00:00<?, ?it/s]

KeyboardInterrupt: 