In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import normalize_data
import random
import train

# Let pandas render up to 999 columns so wide frames are not elided when displayed.
pd.set_option("display.max_columns", 999)

In [2]:
# For this model, data preprocessing is already complete except for scaling,
# so we only need to scale the data here.

In [3]:
def get_ref_X_y(df):
    '''
    Split a dataframe into three frames according to column-name prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        frame whose column names are strings; columns starting with '__',
        'X_' and 'y_' are picked out, every other column is ignored

    Returns
    -------
    tuple
        (reference frame, X frame, y frame), each holding the matching columns
        in their original order
    '''
    def _columns_with(prefix):
        # Select by a list of names so the result is always a DataFrame,
        # even when zero or one column matches.
        return df[[name for name in df.columns if name.startswith(prefix)]]

    return tuple(_columns_with(prefix) for prefix in ('__', 'X_', 'y_'))

In [4]:
# Cache of loaded dataframes, keyed by dataset key (dataset prefix + 'train'/'test'); filled in by load_data.
raw_data = dict()

def dataset_key(dataset='', validation=False):
    '''
    Build the cache key (also used as the pickle file stem) for a dataset split.

    Parameters
    ----------
    dataset : str
        dataset prefix: empty str for the full set, 'sample_', or 'secret_'

    validation : bool
        if true the key ends in 'test', otherwise it ends in 'train'

    Returns
    -------
    str
        the prefix followed by 'test' or 'train', e.g. 'sample_test'
    '''
    return dataset + ('test' if validation else 'train')


def load_data(raw, dataset='', validation=False):
    '''
    Return dataframe matching data set and validation. Dictionary input will be updated.

    Parameters
    ----------
    raw : dict
        dictionary which caches the dataframes and will be updated accordingly

    dataset : str
        which dataset to use? valid input includes: empty str for full set, sample_, and secret_

    validation : bool
        load validation set? if true then use the 'test' split, otherwise use 'train'.
        Note secret_ doesn't have a train split

    Returns
    -------
    object
        whatever is cached under the key; on a cache miss this is the object
        unpickled from $GP_HIST_PATH/../t2_data/<key>.pkl

    Raises
    ------
    KeyError
        on a cache miss when the GP_HIST_PATH environment variable is not set
    '''
    # Single source of truth for the key so the cache and callers of dataset_key agree.
    key = dataset_key(dataset, validation)
    if key not in raw:
        print(f"Loading data to cache for: {key}")
        # NOTE: unpickling can execute arbitrary code -- only point GP_HIST_PATH at trusted data.
        raw[key] = pd.read_pickle(f'{os.environ["GP_HIST_PATH"]}/../t2_data/{key}.pkl')
    return raw[key]

In [None]:
# Experiment settings shared by the cells below and handed to the train module.
configurations = dict(
    dataset='',  # one of '', 'sample_', 'secret_'
    model_identifier='bi_full_1',
    model_path=os.environ['GP_HIST_PATH'] + '/../t2_models',
    device='cpu',
    random_seed=0,
    lr=3e-3,
    momentum=0.99,  # original note: SGD -- presumably only read when the SGD optimizer is used; confirm in train
    weight_decay=1e-5,  # original note: Adam -- presumably only read when the Adam optimizer is used; confirm in train
    max_epochs=500,
    do_validate=True,
    # Settings for the training split (batching, shuffling, workers).
    train_params=dict(
        batch_size=100_000,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    ),
    # Settings for the test/validation split; no 'shuffle' entry here.
    test_params=dict(
        batch_size=200_000,
        num_workers=2,
        pin_memory=True,
    ),
)

In [6]:
%%time

train_df = normalize_data.normalize_all_columns(load_data(raw_data,dataset=configurations['dataset'],validation=False))
test_df = normalize_data.normalize_all_columns(load_data(raw_data,dataset=configurations['dataset'],validation=True))

ref_train, X_train, y_train = get_ref_X_y(train_df)
ref_test, X_test, y_test = get_ref_X_y(test_df)

Loading data to cache for: train
Loading data to cache for: test
CPU times: user 1min 5s, sys: 1min 20s, total: 2min 25s
Wall time: 1min 57s


In [7]:
# Keep only the y_REV_MA_REG target; selecting with a one-item list keeps each result a DataFrame.
y_train = y_train.loc[:, ['y_REV_MA_REG']]
y_test = y_test.loc[:, ['y_REV_MA_REG']]

In [None]:
# Fit the model; the test split is passed as the second (X, y) pair.
# NOTE(review): force_train=False presumably allows reusing a previously saved model -- confirm in train.train_model
model, mean_losses = train.train_model(
    X_train,
    y_train,
    X_test,
    y_test,
    configurations,
    force_train=False,
)

New model created
NNBranchModel(
  (linear1): Linear(in_features=24, out_features=500, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=500, out_features=500, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.5, inplace=False)
  (lineara2): Linear(in_features=500, out_features=500, bias=True)
  (relu2a): ReLU()
  (dropout2a): Dropout(p=0.5, inplace=False)
  (linear2b): Linear(in_features=500, out_features=500, bias=True)
  (relu2b): ReLU()
  (dropout2b): Dropout(p=0.5, inplace=False)
  (linear2c): Linear(in_features=500, out_features=500, bias=True)
  (relu2c): ReLU()
  (dropout2c): Dropout(p=0.5, inplace=False)
  (linear3): Linear(in_features=500, out_features=250, bias=True)
  (relu3): ReLU()
  (dropout3): Dropout(p=0.5, inplace=False)
  (linear4): Linear(in_features=250, out_features=150, bias=True)
  (relu4): ReLU()
  (dropout4): Dropout(p=0.5, inplace=False)
  (linear5): Linear(in_features=150, out_features=100, bias=True)
  (relu5): ReLU()
  (dropout5): Dropout(p

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/1326 [00:00<?, ?it/s]

In [None]:
# Reload the model and its recorded losses for the current configuration;
# only the first and fourth returned values are used here.
trained_model, _, _, mean_losses, _ = train.load_model_with_config(configurations)

# mean_losses is unpacked as a sequence of (training, validation) pairs.
tl, vl = zip(*mean_losses)

fig,ax = plt.subplots()
ax.plot(tl, label="Training Loss")
ax.plot(vl, label="Validation Loss")
# Label the figure so it can be read on its own.
# NOTE(review): the x label assumes one (training, validation) entry per epoch -- confirm in train
ax.set(xlabel="Epoch", ylabel="Mean loss", title="Training vs. validation loss")

fig.legend()
plt.show()

In [None]:
# Predict on both splits using the device named in the configuration rather than a hard-coded one.
# The prediction frames reuse the ground-truth index so that label-based lookups (.loc) and
# frame arithmetic against y_train / y_test line up row for row.
# NOTE(review): assumes train.predict returns an array-like with one row per input row, in input order -- confirm
y_train_pred = train.predict(trained_model, X_train, y_train, device=configurations['device']) # get predictions for the train split
y_train_pred_df = pd.DataFrame(y_train_pred, index=y_train.index, columns=y_train.columns)  # put results into a dataframe
y_test_pred = train.predict(trained_model, X_test, y_test, device=configurations['device']) # get predictions for the test split
y_test_pred_df = pd.DataFrame(y_test_pred, index=y_test.index, columns=y_test.columns)  # put results into a dataframe

print(f'    Test set MAE (L1) loss: {mean_absolute_error(y_test, y_test_pred_df)}')
print(f'    Test set MSE (L2) loss: {mean_squared_error(y_test, y_test_pred_df)}')

# Fixed seed so the same rows are inspected on every run; cap the sample size so
# splits with fewer than 100 rows do not make random.sample raise.
random.seed(0)
sample = random.sample(list(y_train_pred_df.index), min(100, len(y_train_pred_df)))
# sample = [0,1]

print("Train - Ground Truth (normalized):")
display(y_train.loc[sample])
# print("Train - Ground Truth (non-normalized):")
# display(normalize_data.normalize_all_columns(y_train.loc[sample].copy(), reverse=True))  # see ground truths
print("Train - Prediction (normalized):")
display(y_train_pred_df.loc[sample])
# print("Train - Prediction (non-normalized):")
# display(normalize_data.normalize_all_columns(y_train_pred_df.loc[sample].copy(), reverse=True))  # See predictions

In [None]:
# Per-row difference between ground truth and prediction on the training split, then rescaled.
# NOTE(review): 360 * 70 is an unexplained scale factor -- presumably converts the normalized
# target back to its original units; confirm against normalize_data.
errors = (y_train - y_train_pred_df) * 360 * 70

In [None]:
# Show the scaled error table, then its distribution in 300 bins.
display(errors)
errors.hist(bins=300)
# Render the figure explicitly (as the loss-plot cell does) so the cell does not
# end on a bare expression that echoes the Axes array repr.
plt.show()