This notebook loads data and trains a neural network.  

It makes heavy use of the modified vizML code, and that code can also be run independently using the agg.py script: from within the `neural_network` directory, use `python agg.py` followed by `LOAD` then `TRAIN` then `EVAL`.

This notebook requires specific versions of several packages: create a virtual environment, then run `pip install -r requirements.txt`.

It also requires the extracted features from the Plotly community feed. If training a general model, you can download the prepared features from vizML using the `retrieve_vizML_data.sh` script. If training on custom data, first run the data cleaning and feature extraction scripts.

In [9]:
# Imports
import pandas as pd
import scipy as sc
import numpy as np
import time
import gc
import os
import sys
import torch

from imblearn.over_sampling import RandomOverSampler

sys.path.insert(0, '..')
#import neural_network.evaluate as evaluate
import neural_network.util as util
import neural_network.nets as nets
import neural_network.train as train
from helpers.processing import *
from helpers.analysis import *

# Seed handed to RandomOverSampler in load_features() so oversampling is repeatable.
RANDOM_STATE = 42

# Directory holding the feature/outcome CSV files read by load_features().
features_directory = '../features/processed'
# Directory passed to util.save_matrices_to_disk() / util.load_matrices_from_disk().
saves_directory = '../neural_network/saves'
# Number of feature rows to keep (features_df[:num_datapoints]); also forwarded
# to the util save/load helpers.
num_datapoints = None  # None if you want all


In [2]:
# this script tests aggregate features over line, scatter, and bar outcomes
def load_features():
    """Load aggregate features and chart-type labels, then balance the classes.

    Reads ``features_aggregate_single.csv`` and ``chart_outcomes.csv`` from
    ``features_directory``, keeps the first ``num_datapoints`` feature rows
    (all rows when it is ``None``), and inner-joins them on ``fid`` with the
    outcome rows whose ``all_one_trace_type`` is 'line', 'scatter' or 'bar'.
    The classes are then balanced with ``RandomOverSampler``.

    Returns:
        Whatever ``util.unison_shuffle(X, y)`` returns: the oversampled
        feature matrix and the integer class labels, shuffled together.
    """
    features_df_file_name = 'features_aggregate_single.csv'
    outcomes_df_file_name = 'chart_outcomes.csv'

    features_df = pd.read_csv(
        os.path.join(features_directory, features_df_file_name))
    outcomes_df = pd.read_csv(
        os.path.join(features_directory, outcomes_df_file_name))

    features_df = features_df[:num_datapoints]
    outcome_variable_name = 'all_one_trace_type'
    # photon: add circos here?
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    final_df = pd.merge(features_df, outcomes_df_subset, on='fid', how='inner')
    final_df = final_df.drop(['fid'], axis=1, errors='ignore')
    # No shuffle is needed at this point: DataFrame.sample() returns a new
    # frame (a bare call has no effect), and X/y are shuffled together by
    # util.unison_shuffle() before returning.

    # Every column to the left of the outcome column is a feature.
    last_index = final_df.columns.get_loc(outcome_variable_name)
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]
    # Encode labels as integers; indices follow the column order produced by
    # pd.get_dummies (sorted label names).
    y = pd.get_dummies(y).values.argmax(1)

    sampler = RandomOverSampler(random_state=RANDOM_STATE)
    # imbalanced-learn renamed fit_sample() to fit_resample() and later
    # removed the old name; prefer the new one but keep working with the
    # older releases that only provide fit_sample().
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    X, y = resample(X, y)
    # shuffle X and y in unison, and then return
    return util.unison_shuffle(X, y)

In [3]:
# Training configuration: every tunable setting for this notebook lives in
# the `parameters` dict below.
#
#   batch_size     presumably the mini-batch size used by the dataloaders
#                  (TODO confirm in train.py)
#   num_epochs     upper bound on the number of training epochs
#   hidden_sizes   list with the number of neurons in each hidden layer
#   learning_rate  starting learning rate
#   output_dim     number of output classes: 3 for 'line', 'scatter', 'bar'
#                  (adding 'circos' and 'table' would make it 5)
#   weight_decay   NOTE(review): earlier notes call this "how much to decay LR";
#                  in PyTorch optimizers weight_decay is an L2 penalty on the
#                  weights -- confirm how train.py uses it. Left at 0 because
#                  the LR is already reduced through ReduceLROnPlateau().
#   dropout        dropout applied in each layer
#   patience       epochs without improvement (as judged by `threshold`)
#                  before the learning rate is divided by 10
#   threshold      tolerance used together with `patience`
#   model_prefix   prefix for every model file that is loaded or saved
#   save_model     save each epoch's model into the models/ folder
#   print_test     write test accuracies to test.txt
#   test_best      evaluate on the test set the model with the best
#                  validation accuracy
#
# Training stops automatically once the learning rate drops below
# 0.01 * the starting learning rate.

parameters = dict(
    batch_size=200,
    num_epochs=100,
    hidden_sizes=[800, 800, 800],
    learning_rate=5e-4,
    output_dim=3,
    weight_decay=0,
    dropout=0.00,
    patience=20,
    threshold=1e-3,
    model_prefix='agg',
    save_model=False,
    print_test=True,
    test_best=False,
)

Load features

In [4]:
# Build the balanced, shuffled feature matrix and label vector.
X, y = load_features()
# Hold out 10% of the examples for validation and 10% for test (the rest is
# training data) and write the splits under saves_directory, using the model
# prefix from `parameters`.
util.save_matrices_to_disk(
    X, y, [0.1, 0.1], saves_directory, parameters['model_prefix'], num_datapoints)

number of total examples is  210
indexes for splitting between train/val/test are  [168, 189]


In [5]:
# Read the train/val/test splits back from saves_directory, using the same
# model prefix and num_datapoints that were passed when saving them.
X_train, y_train, X_val, y_val, X_test, y_test = util.load_matrices_from_disk(
        saves_directory, parameters['model_prefix'], num_datapoints)

Train model

In [6]:
# Sanity check: util.get_time() has caused problems before. It should return
# a date-and-time string.
util.get_time()

'2020-06-17 11:03:50'

In [13]:
# Select the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): `device` is not passed to train.load_datasets() or
# train.train() anywhere in this notebook, so choosing it here does not by
# itself move training to the CPU; the recorded training run failed with
# "Torch not compiled with CUDA enabled".
# TODO: maybe add this device selection to the train.py script?
print(device)

cpu


In [7]:
# Wrap the train/val/test matrices in dataloaders configured from `parameters`.
train_dataloader, val_dataloader, test_dataloader = train.load_datasets(
    X_train, y_train, X_val, y_val, parameters, X_test=X_test, y_test=y_test)
# Run training with the settings in `parameters`.
# NOTE(review): on a machine without a CUDA-enabled torch build, the recorded
# run of this cell raised "AssertionError: Torch not compiled with CUDA
# enabled" -- presumably train.py requests CUDA unconditionally; verify there.
train.train(
    train_dataloader,
    val_dataloader,
    test_dataloader,
    parameters)

output_dim is 3
Starting training at 2020-06-17 11:03:50
batch_size=200, dropout=0.0, hidden_sizes=[800, 800, 800], input_dim=657, learning_rate=0.0005, model_prefix='agg', num_epochs=100, output_dim=3, patience=20, print_test=True, save_model=False, test_best=False, threshold=0.001, weight_decay=0
starting training
epoch: 1, lr: 5.0e-04    2020-06-17 11:03:50


AssertionError: Torch not compiled with CUDA enabled