This notebook loads data and trains a neural network using data from the Plotly community feed.  

It makes heavy use of the modified vizML code, and that code can also be run independently using the agg.py script: from within the `neural_network` directory, use `python agg.py` followed by `LOAD` then `TRAIN` then `EVAL`.

This notebook requires specific versions of several packages-- create a virtual environment, then `pip install -r requirements.txt`

It also requires the extracted features from the plotly community feed.  If training a general model, you can download the prepaired features from vizML using the retrieve_vizML_data.sh script. If training on custom data, first run data cleaning and feature extraction scripts. 

In [152]:
# Imports
import pandas as pd
import scipy as sc
import numpy as np
import time
import gc
import os
import sys
import time
import datetime
import copy
import torch
from torch import nn, optim

from imblearn.over_sampling import RandomOverSampler

sys.path.insert(0, '..')
#import neural_network.evaluate as evaluate
import neural_network.util as util
import neural_network.nets as nets
import neural_network.train as train
from helpers.processing import *
from helpers.analysis import *

RANDOM_STATE = 42

features_directory = '../features/processed'
saves_directory = '../neural_network/saves'
num_datapoints = None  # None if you want all


In [153]:
# this script tests aggregate features over line, scatter, and bar outcomes
def load_features():
    features_df_file_name = 'features_aggregate_single.csv'
    outcomes_df_file_name = 'chart_outcomes.csv'

    features_df = pd.read_csv(
        os.path.join(
            features_directory,
            features_df_file_name))
    outcomes_df = pd.read_csv(
        os.path.join(
            features_directory,
            outcomes_df_file_name))

    features_df = features_df[:num_datapoints]
    outcome_variable_name = 'all_one_trace_type'
    # photon: add circos here? 
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    final_df = pd.merge(features_df, outcomes_df_subset, on='fid', how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    final_df.sample(frac=1.0)

    last_index = final_df.columns.get_loc(outcome_variable_name)
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]
    y = pd.get_dummies(y).values.argmax(1)

    res = RandomOverSampler(random_state=RANDOM_STATE)
    X, y = res.fit_sample(X, y)
    # shuffle X and y in unison, and then return
    return util.unison_shuffle(X, y)

In [154]:
# all the parameters for the script can be adjusted here

# num_epochs: the max number of epochs we will train the NN for
# hidden_sizes: the number of neurons in each hidden layer, enter it as a list
# output_dim: the dimension of the output. Since outputs are 'line', 'scatter', 'bar', it's 3
#                                                            + 'circos' ; +  'table' will be 5
# weight_decay: how much to decay LR in the NN. This can be set to 0 since we decrease LR already through
#   the ReduceLROnPlateau() function
# dropout: the dropout in each layer
# patience: how many epochs we go through (with a near constant learning rate, this threshold is adjusted using
#   threshold) before dropping learning rate by a factor of 10
# model_prefix: all models will be loaded/saved with the prefix of the file in the beginning
# save_model: save each epoch's model onto models/ folder.
# print_test: print test accuracies into test.txt
# test_best: test the test accuracy of the best model we've found (best
# model determined using val accuracy)

# note: training is automatically stopped when learning rate < 0.01 *
# starting learning rate

parameters = {
    'batch_size': 200,
    'num_epochs': 100,
    'hidden_sizes': [800, 800, 800],
    'learning_rate': 5e-4,
    'output_dim': 3,
    'weight_decay': 0,
    'dropout': 0.00,
    'patience': 20,
    'threshold': 1e-3,
    'model_prefix': 'agg',
    'save_model': False,
    'print_test': True,
    'test_best': False
}

Load features

In [155]:
X, y = load_features()
# split 10% of examples into val, and 10% into test
util.save_matrices_to_disk(
    X, y, [0.1, 0.1], saves_directory, parameters['model_prefix'], num_datapoints)

number of total examples is  210
indexes for splitting between train/val/test are  [168, 189]


In [156]:
X_train, y_train, X_val, y_val, X_test, y_test = util.load_matrices_from_disk(
        saves_directory, parameters['model_prefix'], num_datapoints)

Train model

In [157]:
# set "device" to regular cpu if no GPU is available 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# to-do: maybe add this to the train.py script? 
print(device)

cuda:0


In [158]:
train_dataloader, val_dataloader, test_dataloader = train.load_datasets(
    X_train.astype(np.float64), y_train.astype(np.float64), X_val.astype(np.float64), y_val.astype(np.float64), parameters, X_test=X_test.astype(np.float64), y_test=y_test.astype(np.float64))

output_dim is 3


In [177]:
# batch_size is determined in the dataloader, so the variable is irrelevant here
batch_size = parameters.get('batch_size', 200)
num_epochs = parameters.get('num_epochs', 100)
hidden_sizes = parameters.get('hidden_sizes', [200])
learning_rate = parameters.get('learning_rate', 0.0005)
weight_decay = parameters.get('weight_decay', 0)
dropout = parameters.get('dropout', 0.0)
patience = parameters.get('patience', 10)
threshold = parameters.get('threshold', 1e-3)
input_dim = parameters['input_dim']
output_dim = parameters['output_dim']
# output_period: output training loss every x batches
output_period = parameters.get('output_period', 0)
model_prefix = parameters.get('model_prefix', None)
only_train = parameters.get('only_train', False)
save_model = parameters.get('save_model', False)
test_best = parameters.get('test_best', False)
print_test = parameters.get(
    'print_test', False) and (
    test_dataloader is not None)
# set device to cuda if available, cpu if not
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is ", device)	

device is  cuda:0


In [178]:
input_dim

657

In [160]:
# print out test accuracies to a separate file
suffix=''
test_file = None
if print_test:
    test_file = open('test{}.txt'.format(suffix), 'a')
    test_file.write('\n\n')
    test_file.write('Starting at ' + util.get_time() + '\n')
    test_file.write(', '.join(['{}={!r}'.format(k, v) for k, v in sorted(parameters.items())]) + '\n\n')

In [169]:
# define a model that can be passed the above parameters
class AdvancedNet(nn.Module):
    '''
    I'm borrowing AdvancedNet from vizML. 
    It uses nn.ModuleList to construct a list of modules/layers according to the above paramiters. 
    hidden_sizes is a list where each intiger becomes the number of neurons in a hidden layer. 
    By default it is [800, 800, 800] so makes a network with 3 hidden layers.
    '''
    def __init__(self, input_size, hidden_sizes, num_classes, dropout=0.0):
        super(AdvancedNet, self).__init__()
        self.nn_list = nn.ModuleList()
        self.nn_list.append(nn.Linear(input_size, hidden_sizes[0]))
        self.nn_list.append(nn.ReLU())
        if dropout:
            self.nn_list.append(nn.Dropout(p=dropout))

        for i in range(1, len(hidden_sizes)):
            self.nn_list.append(
                nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
            self.nn_list.append(nn.ReLU())
            if dropout:
                self.nn_list.append(nn.Dropout(p=dropout))
        self.nn_list.append(nn.Linear(hidden_sizes[-1], num_classes))

    def forward(self, x):
        for module in self.nn_list:
            x = module(x)
        return x

In [175]:
class Feedforward(nn.Module):
        def __init__(self, input_size, hidden_size, num_classes, dropout=0.0):
            super(Feedforward, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.fc2 = nn.Linear(hidden_size, hidden_size)
            self.fc3 = nn.Linear(hidden_size, hidden_size)
            self.fc4 = nn.Linear(hidden_size, hidden_size)
            self.fc5 = nn.Linear(hidden_size, num_classes)
            
        def forward(self, x):
            x = nn.functional.relu(self.fc1(x))
            x = nn.functional.relu(self.fc2(x))
            x = nn.functional.relu(self.fc3(x))
            x = nn.functional.relu(self.fc4(x))
            x = self.fc5(x)
            return F.log_softmax(x)

In [173]:
class Feedforward(nn.Module):
    '''
    This is my own model, more explicitly written
    Less flexible than AdvancedNet becayse you can't change the 
    architecture with a change to hidden_sizes -- but identical otherwise.
    Useful for debugging the cuda stuff.
    Change models below 
    '''
        def __init__(self, input_size, hidden_size, num_classes, dropout=0.0):
            super(Feedforward, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = nn.Linear(self.input_size, self.hidden_size)
            self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
            self.fc3 = nn.Linear(self.hidden_size, self.hidden_size)
            self.fc4 = nn.Linear(self.hidden_size, self.hidden_size)
            self.fc5 = nn.Linear(self.hidden_size, self.num_classes)
            
        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.relu(self.fc3(x))
            x = F.relu(self.fc4(x))
            x = self.fc5(x)
            return F.log_softmax(x)

In [179]:
# set key paramiters
input_dim = parameters['input_dim']
output_dim = parameters['output_dim']\dropout = parameters.get('dropout', 0.0)
learning_rate = parameters.get('learning_rate', 0.0005)
weight_decay = parameters.get('weight_decay', 0)
# hidden_size (a value) is different than hidden_sizes (a list)
# Feedforward model applies hidden_size to all hidden layers
# while AdvancedNet applies the list hidden_sizes itteratively to create layesr
hidden_size = 800
# nets and optimizers
criterion = nn.CrossEntropyLoss().to(device)
model = Feedforward(
    input_dim,
    hidden_size,
    output_dim,
    dropout=dropout).to(device)
optimizer = optim.Adam(
    net.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay)

# ReduceLROnPlateau reduces learning rate by factor of 10 once val loss has plateaued
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, patience=patience, threshold=threshold)

In [180]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()                                                  # Set model parameters to training mode
    running_loss = 0.0                                             # Keep track of running loss so average loss can be calculated at the end of the epoch
    all_true = []                                                  # Keep track of all of the true and predicted labels
    all_pred = []                                                  #   so AUC score can be calculated at the end of the epoch
    
    for batch_num, (inputs, labels) in enumerate(dataloader):      # Loop through all batches in the dataloader
        
        all_true += labels.tolist()                                # Save true labels
        inputs = inputs.to(device)                                 # Send images and labels to computing device. If the model is on 'cuda',
        labels = labels.to(device)                                 #   the images and labels must also be on cuda
        
        optimizer.zero_grad()                                      # Zero the gradients so the optimizer can keep track of a new pass of data through the network
        pred = model(inputs)                                         # Pass a batch of images through the model
        loss = criterion(pred, labels)                             # Calculate the loss between the ground truth labels and the model predictions
        loss.backward()                                            # Perform backpropagation on the loss to train the network
        optimizer.step()                                           # Step the optimizer forward
        
        all_pred += torch.sigmoid(pred).tolist()                   # Keep track of model predictions to calculate AUC later.
                                                                   #   When we want to make predictions with our model without calculating BCEWithLogitsLoss,
                                                                   #   we must manually apply the sigmoid function to the model output to get predicted values
                                                                   #   between 0 and 1. tolist() simply converts the result to a normal Python list.
        
        running_loss += loss.item()
    return running_loss / (i+1), roc_auc_score(all_true, all_pred, average='weighted')    # Return the average loss over the epoch and the AUC score

In [181]:
def test(model, dataloader, criterion, device):
    model.eval()                                                   # Set model parameters to evaluation mode
    running_loss = 0.0
    all_true = []
    all_pred = []
    for i, (inputs, labels) in enumerate(dataloader):
        all_true += labels.tolist()
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():                                      # Don't track gradients through the model
            pred = model(inputs)
            loss = criterion(pred, labels)
            all_pred += torch.sigmoid(pred).tolist()
        
        running_loss += loss.item()
    return running_loss / (i+1), roc_auc_score(all_true, all_pred, average='weighted')

In [182]:
def fit(model, train_dataloader, val_dataloader, optimizer, criterion, device, num_epochs):
    # Log start time
    start_time = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
    msg = f'Start model training {start_time}'
    with open('log.txt', 'a') as f: f.write(msg+'\n')
    print(msg)
    
    # Initialize best model weights, AUC score, and keep track of train/val loss and AUC
    best_model_wts = copy.deepcopy(model.state_dict())
    best_model_name = ''
    best_score = -1
    track_values = {'train_loss': [],
                    'val_loss': [],
                    'train_auc': [],
                    'val_auc': []}

    # Calculate initial loss and score on train and validation sets
    start = time.time()
    train_loss, train_score = test(model, train_dataloader, criterion, device)
    val_loss, val_score = test(model, val_dataloader, criterion, device)
    
    # Store initial losses and AUC scores
    track_values['train_loss'].append(train_loss)
    track_values['val_loss'].append(val_loss)
    track_values['train_auc'].append(train_score)
    track_values['val_auc'].append(val_score)
    
    # Print training status and write to a log file
    msg = f'Epoch 0/{num_epochs} Train Loss: {train_loss:.4f}, Train AUC: {train_score:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_score:.4f} Time: {time.time()-start:.2f}s'
    with open('log.txt', 'a') as f: f.write(msg+'\n')
    print(msg)

    # Loop over the specified number of epochs
    for epoch in range(num_epochs):
        start = time.time() # Start timer to keep track of how long an epoch takes

        # Run train and test functions on train and val sets
        train_loss, train_score = train(model, train_dataloader, optimizer, criterion, device)
        val_loss, val_score = test(model, val_dataloader, criterion, device)
        
        # Store losses and AUC scores
        track_values['train_loss'].append(train_loss)
        track_values['val_loss'].append(val_loss)
        track_values['train_auc'].append(train_score)
        track_values['val_auc'].append(val_score)

        # Save the model weights if the AUC score on the validation set is higher than the previous best model
        if val_score > best_score:
            best_score = val_score
            best_model_wts = copy.deepcopy(model.state_dict())
            if os.path.exists(best_model_name): os.remove(best_model_name)
            best_model_name = f'./best_model_weights_epoch_{epoch+1}_auc_{val_score:.4f}.pt'
            torch.save(model.state_dict(), best_model_name)

        # Print training status and write to a log file
        msg = f'Epoch {epoch+1}/{num_epochs} Train Loss: {train_loss:.4f}, Train AUC: {train_score:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_score:.4f} Time: {time.time()-start:.2f}s'
        with open('log.txt', 'a') as f: f.write(msg+'\n')
        print(msg)
    
    return track_values

In [184]:
# start training! 
track_values = fit(model, train_dataloader, val_dataloader, optimizer, criterion, device, num_epochs)

Start model training 2020-06-19-13-44-22


RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #2 'mat1' in call to _th_addmm

In [130]:
# start trainging 
num_train_batches = len(train_dataloader)
epoch = 1
best_epoch, best_acc = 0, 0
train_acc = [0]

print('Starting training at ' + util.get_time())
print(', '.join(['{}={!r}'.format(k, v)
                for k, v in sorted(parameters.items())]))

# print out test accuracies to a separate file
suffix=''
test_file = None
if print_test:
    test_file = open('test{}.txt'.format(suffix), 'a')
    test_file.write('\n\n')
    test_file.write('Starting at ' + util.get_time() + '\n')
    test_file.write(', '.join(['{}={!r}'.format(k, v) for k, v in sorted(parameters.items())]) + '\n\n')

print('starting training')
while epoch <= num_epochs:
    running_loss = 0.0
    epoch_acc = 0.0

    net.train()
    print(
        'epoch: %d, lr: %.1e' %
        (epoch,
        optimizer.param_groups[0]['lr']) +
        '    ' +
        util.get_time())
    for batch_num, (inputs, labels) in enumerate(train_dataloader, 1):
        optimizer.zero_grad()
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs) ### here's where it's hanging up 
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        # output is 2D array (logsoftmax output), so we flatten it to a 1D to get the max index for each example
        # and then calculate accuracy off that
        max_index = outputs.max(dim=1)[1]
        epoch_acc += np.sum(max_index.data.cpu().numpy()
                           == labels.data.cpu().numpy()) / inputs.size()[0]

        # output every output_period batches
        if output_period:
            if batch_num % output_period == 0:
                print('[%d:%.2f] loss: %.3f' % (
                    epoch, batch_num * 1.0 / num_train_batches,
                    running_loss / output_period))
                running_loss = 0.0
                gc.collect()

    # save model after every epoch in models/ folder
    if save_model:
        torch.save(
            net.state_dict(),
            models_directory +
            '/' +
            model_prefix +
            ".%d" %
            epoch)

    # print training/val accuracy
    epoch_acc = epoch_acc / num_train_batches
    train_acc.append(epoch_acc)
    print('train acc: %.4f' % (epoch_acc))
    if only_train:
        scheduler.step(loss)
    else:
        val_accuracy, total_loss = evaluate.eval_error(
            net, val_dataloader, criterion)
        print('val acc: %.4f, loss: %.4f' % (val_accuracy, total_loss))
        # remember: feed val loss into scheduler
        scheduler.step(total_loss)
        if val_accuracy > best_acc:
            best_epoch, best_acc = epoch, val_accuracy
        print()

        # write test accuracy
        if print_test:
            test_accuracy, total_loss = evaluate.eval_error(
                net, test_dataloader, criterion)
            test_file.write(
                'epoch: %d' %
                (epoch) +
                '    ' +
                util.get_time() +
                '\n')
            test_file.write('train acc: %.4f' % (epoch_acc) + '\n')
            test_file.write('val acc: %.4f' % (val_accuracy) + '\n')
            test_file.write('test acc: %.4f' % (test_accuracy) + '\n')
            test_file.write('loss: %.4f' % (test_accuracy) + '\n')

        gc.collect()
            
    # perform early stopping here if our learning rate is below a threshold
    # because small lr means little change in accuracy anyways
    if optimizer.param_groups[0]['lr'] < (0.9 * 0.01 * learning_rate):
        print('Low LR reached, finishing training early')
        break
    epoch += 1

print('best epoch: %d' % best_epoch)
print('best val accuracy: %.4f' % best_acc)
print('train accuracy at that epoch: %.4f' % train_acc[best_epoch])
print('ending at', time.ctime())

if test_best:
    net.load_state_dict(
        torch.load(
            models_directory +
            '/' +
            model_prefix +
            '.' +
            str(best_epoch)))
    best_test_accuracy, total_loss = evaluate.eval_error(
        net, test_dataloader, criterion)
    test_file.write('*****\n')
    test_file.write(
        'best test acc: %.4f, loss: %.4f' %
        (best_test_accuracy, total_loss) + '\n')
    test_file.write('*****\n')
    print('best test acc: %.4f, loss: %.4f' %
        (best_test_accuracy, total_loss))

if print_test:
    test_file.write('\n')
    test_file.close()

print('\n\n\n')



Starting training at 2020-06-19 06:49:45
batch_size=200, dropout=0.0, hidden_sizes=[800, 800, 800], input_dim=657, learning_rate=0.0005, model_prefix='agg', num_epochs=100, output_dim=3, patience=20, print_test=True, save_model=False, test_best=False, threshold=0.001, weight_decay=0
starting training
epoch: 1, lr: 5.0e-04    2020-06-19 06:49:45


AttributeError: 'Tensor' object has no attribute 'astype'