# RNNs on GPU
After 2 full days of Linux hell I got our Nvidia Tesla K40c to run with keras, and this will speed up computations significantly. Furthermore, I met with Jake Varley who works at Google Brain and knows his shit and he gave me a few tips. I will implement those here, and they are:  

1) one-hot encoding over categorical variables  
2) augment data by "mirroring" it, e.g. double the dataset by switching East and West labels since task is symmetrical  
3) create 2 artificial datasets: one shuffled version where models should not be able to learn anything, and one "hard-coded" one where they should achieve perfect decoding score. This will be a sanity check   
4) give more data to validation and test sets  
5) only include last trial -- I'm less sure I understand this point. From my understanding, including a whole sequence is what gives the memory of the RNN the ability to learn chunks, but we'll see. I'm going to try both 'last trial' and a sequence of length n.  
6) switch dropout to 50%  
7) increase batch size to 512  

1-5 are about how we prepare the sequences. The last 2 are about the networks themselves. 


In [1]:
#BOILERPLATE _______________________
#MODULES ______________________
# Local root directory. Later cells build dataset and sequence paths by plain
# string concatenation onto it, so the trailing slash is required.
ROOT = '/Users/pablomartin/python/'
import os
import time
import pickle
import numpy as np
import pandas as pd
import itertools
import operator
import pysftp
import matplotlib.pyplot as plt
from keras.models import load_model

from RNNmodule.SequenceClass import Sequences
from behavioral_performance.utils import fileNames, fileNameLabels
from Visualize.decoding import *
# Shorthand used with .loc to slice MultiIndex-ed frames.
idx = pd.IndexSlice
datatype = ['Full', 'Last', 'Med']
RANDOM_STATE = 6
print('modules loaded...\n')

# Artificial (sanity-check) datasets that must be listed next to the real ones.
artificial_datasets = ['PSR_TbyT_Saline_Rigged.p',
                       'DSR_TbyT_Saline_Shuffled.p',
                       'PSR_TbyT_Saline_Shuffled.p']
for ad in artificial_datasets:
    # Append only when missing so re-running the cell does not duplicate entries.
    if ad not in fileNames:
        fileNames.append(ad)

# Drop the naive-saline dataset (first occurrence) when it is listed.
if 'DSR_TbyT_Naive_Saline.p' in fileNames:
    fileNames.remove('DSR_TbyT_Naive_Saline.p')

for fileName in fileNames:
    print(fileName)


Using TensorFlow backend.


modules loaded...

DSR_TbyT_FirstTraining.p
DSR_TbyT_Naive_mPFC.p
DSR_TbyT_Naive_OFC.p
DSR_TbyT_MidTraining.p
DSR_TbyT_Saline.p
DSR_TbyT_MPFC.p
DSR_TbyT_OFC.p
DSR_TbyT_Ipsi.p
DSR_TbyT_Contra.p
PSR_TbyT_FirstTraining.p
PSR_TbyT_MidTraining.p
PSR_TbyT_Saline.p
PSR_TbyT_MPFC.p
PSR_TbyT_OFC.p
PSR_TbyT_Ipsi.p
PSR_TbyT_Contra.p
PSR_TbyT_Saline_Rigged.p
DSR_TbyT_Saline_Shuffled.p
PSR_TbyT_Saline_Shuffled.p


## Creating Artificial Datasets
First, let's create a shuffled dataset that no network could learn, so we have a baseline. We will make one for DSR_Saline and PSR_Saline, which are the 2 datasets with most data. We will keep all the trial information the same, but shuffle the labels.  
Also, let's create a 'rigged' dataset that the network should be able to learn to perfection. We're gonna pick an XOR gate, where input A = last choice, input B = reward from penultimate trial. The prediction of this XOR model is highly counterintuitive and there is no chance that that is what rats do. Nonetheless, if we are implementing these models correctly, the network should disregard all the other data, find this pattern, and achieve perfect decoding. Otherwise, we are implementing everything wrong. 

In [3]:
# Build the shuffled control datasets: every column of the saline frames is
# kept as-is except ('choice', 0) -- the eventual label -- whose values are
# permuted across all rows.

# Dedicated, seeded generator: the shuffled files are reproducible from run to
# run and the global numpy random state is left untouched.
shuffle_rng = np.random.RandomState(RANDOM_STATE)

#load saline datasets
with open(ROOT + 'DATA_structures/TbyT/DSR_TbyT_Saline.p', 'rb') as handle:
    DSR_Saline = pickle.load(handle)
with open(ROOT + 'DATA_structures/TbyT/PSR_TbyT_Saline.p', 'rb') as handle:
    PSR_Saline = pickle.load(handle)

#shuffle current choice - these are the eventual labels
# Assign the permuted values back explicitly. Shuffling `.values` in place only
# works when pandas happens to hand out a view of the underlying block; when it
# returns a copy the shuffle is silently lost.
DSR_Saline[('choice', 0)] = shuffle_rng.permutation(DSR_Saline[('choice', 0)].values)
PSR_Saline[('choice', 0)] = shuffle_rng.permutation(PSR_Saline[('choice', 0)].values)

#save result
with open(ROOT + 'DATA_structures/TbyT/DSR_TbyT_Saline_Shuffled.p', 'wb') as handle:
    pickle.dump(DSR_Saline, handle)
with open(ROOT + 'DATA_structures/TbyT/PSR_TbyT_Saline_Shuffled.p', 'wb') as handle:
    pickle.dump(PSR_Saline, handle)
print('created shuffled datasets')


created shuffled datasets
created rigged dataset


In [None]:
#rigged dataset - base will be PSR_Saline
# Overwrites ('reward', 0) and ('choice', 0) of every session with a
# deterministic sequence and saves the result as PSR_TbyT_Saline_Rigged.p.
# `fresh_copy` is an untouched second load, used only to count how many values
# were changed.
with open(ROOT + 'DATA_structures/TbyT/PSR_TbyT_Saline.p', 'rb') as handle:
    PSR_Saline = pickle.load(handle)
with open(ROOT + 'DATA_structures/TbyT/PSR_TbyT_Saline.p', 'rb') as handle:
    fresh_copy = pickle.load(handle)

choice_ch = np.sum(PSR_Saline['choice',0] == fresh_copy['choice', 0])
reward_ch = np.sum(PSR_Saline['reward',0] == fresh_copy['reward', 0])
print('matching values choice: %i/%i' % (choice_ch, len(PSR_Saline)))
print('matching values reward: %i/%i' % (reward_ch, len(PSR_Saline)))

for label, session in PSR_Saline.groupby(level = 'session'):
    # Work on an explicit copy: assigning columns on the sub-frame handed out
    # by groupby is ambiguous (view vs. copy) and triggers pandas warnings.
    session = session.copy()

    # Combined state per trial: C = reward + 2 * choice (values 0..3).
    # The arithmetic builds a new Series, so no explicit deep copy is needed.
    C = session['reward', 0] + 2 * session['choice', 0]
    # The first two trials keep their original state; every later trial is
    # fully determined by the two trials before it.
    # NOTE(review): the markdown above describes an XOR rule, but the rule
    # implemented here is (C[t-1] + C[t-2]) % 4 -- confirm which is intended.
    for trial in range(2, len(session)):
        C.iloc[trial] = (C.iloc[trial - 1] + C.iloc[trial - 2]) % 4
    # Decode the combined state back into its two components.
    A = C % 2      # rigged reward
    B = C > 1      # rigged choice (boolean)
    print('before assigning rigged list to original')
    print('A: %i/%i' % (np.sum(A == session['reward', 0]), len(session)))
    print('B: %i/%i' % (np.sum(B == session['choice', 0]), len(session)))

    session['reward',0] = A
    session['choice',0] = B
    print('after assigning rigged list to original')
    print('A: %i/%i' % (np.sum(A == session['reward', 0]), len(session)))
    print('B: %i/%i' % (np.sum(B == session['choice', 0]), len(session)))

    # Write the rigged columns back into the full frame (aligned on the index).
    # The idx[label, :, :] selector assumes a three-level row index.
    PSR_Saline.loc[idx[label,:,:],idx['choice',0]] = session['choice',0]
    PSR_Saline.loc[idx[label,:,:],idx['reward',0]] = session['reward',0]

choice_ch = np.sum(PSR_Saline['choice',0] == fresh_copy['choice', 0])
reward_ch = np.sum(PSR_Saline['reward',0] == fresh_copy['reward', 0])

print('matching values choice: %i/%i' % (choice_ch, len(PSR_Saline)))
print('matching values reward: %i/%i' % (reward_ch, len(PSR_Saline)))

with open(ROOT + 'DATA_structures/TbyT/PSR_TbyT_Saline_Rigged.p', 'wb') as handle:
    pickle.dump(PSR_Saline, handle)
print('created rigged dataset')


## Creating Sequences
Most of the fixes are in the preprocessing stage: preparing sequences. The data augmentation should be done *only* on the training set. 

In [4]:
# Build and pickle one Sequences object per (dataset, sequence length, encoding).
seq_lengths = [1, 30, 200]
seq_length_labels = ['Last', 'Med', 'Full']   # folder name for each entry of seq_lengths
seq_types = ['OneHotBinaryMinimal']

dataset_filenames = [w for w in os.listdir(ROOT + 'DATA_structures/TbyT/')\
                     if not w.startswith('.') and w!='DSR_TbyT_Naive_Saline.p']
# The directory listing above is overridden on purpose: only the artificial
# datasets are (re)built by this run. Remove the next line to process everything.
dataset_filenames = ['DSR_TbyT_Saline_Shuffled.p', 'PSR_TbyT_Saline_Shuffled.p', 'PSR_TbyT_Saline_Rigged.p']

dataset_paths = [ROOT + 'DATA_structures/TbyT/' + w for w in dataset_filenames]

mirrorFlag = True
for dataset_file, dataset_path in zip(dataset_filenames, dataset_paths):
    # Decide mirroring per dataset. Previously the flag was set once before the
    # loop and only ever switched off, so every dataset processed after the
    # first Rigged/Shuffled file silently lost its mirror augmentation.
    mirrorFlag = not ('Rigged' in dataset_file or 'Shuffled' in dataset_file)

    print('dataset %s' % dataset_file)
    with open(dataset_path, 'rb') as handle:
        df = pickle.load(handle)
    for seq_length, seq_type in itertools.product(seq_lengths, seq_types):
        seqObject = Sequences(seq_length, seq_type, RANDOM_STATE = RANDOM_STATE)
        # Half of the data is held out: 25% validation and 25% test.
        seqObject.create_sequences(df,
                                   timesteps = seq_length,
                                   feature_dim = seq_type,
                                   validate_size = 0.25,
                                   test_size = 0.25,
                                   mirrorFlag = mirrorFlag)
        seq_length_label = seq_length_labels[seq_lengths.index(seq_length)]
        target_path = ROOT + 'DATA_structures/RNN_sequences/' + \
                      seq_type + '/' + \
                      seq_length_label + '/' + \
                      dataset_file
        with open(target_path, 'wb') as handle:
            pickle.dump(seqObject, handle)
        print('finished: %s - %s - %s' % (dataset_file, seq_length, seq_type))
    

dataset DSR_TbyT_Saline_Shuffled.p
finished: DSR_TbyT_Saline_Shuffled.p - 1 - OneHotBinaryMinimal
finished: DSR_TbyT_Saline_Shuffled.p - 30 - OneHotBinaryMinimal
finished: DSR_TbyT_Saline_Shuffled.p - 200 - OneHotBinaryMinimal
dataset PSR_TbyT_Saline_Shuffled.p
finished: PSR_TbyT_Saline_Shuffled.p - 1 - OneHotBinaryMinimal
finished: PSR_TbyT_Saline_Shuffled.p - 30 - OneHotBinaryMinimal
finished: PSR_TbyT_Saline_Shuffled.p - 200 - OneHotBinaryMinimal
dataset PSR_TbyT_Saline_Rigged.p
finished: PSR_TbyT_Saline_Rigged.p - 1 - OneHotBinaryMinimal
finished: PSR_TbyT_Saline_Rigged.p - 30 - OneHotBinaryMinimal
finished: PSR_TbyT_Saline_Rigged.p - 200 - OneHotBinaryMinimal


## Some Useful Tools
Let's define some tools that would be nice to have. Let's use pysftp systematically for all this stuff.

1) function that finds highest validation accuracy given a folder  
2) plot training vs. validation accuracy and loss

In [7]:
def retrieve_val_acc(connection, model_dir):
    """Find the checkpoint with the highest validation score in a model folder.

    Args:
        connection: object exposing ``isdir(path)`` and ``listdir(path)``
            (a pysftp connection in this notebook).
        model_dir: remote directory holding one trained network.

    Returns:
        ``(val_score, best_model)`` -- the best score as a float and the file
        name of the checkpoint it came from -- or ``None`` when the directory
        does not exist, training has not finished (no ``loss_acc_history.p``),
        or the directory contains no checkpoint files.
    """
    if not connection.isdir(model_dir):
        print('model does not exist... exiting')
        return None

    model_files = connection.listdir(model_dir)
    # The history file is used as the marker that training ran to completion.
    if 'loss_acc_history.p' not in model_files:
        print('training did not finish or has not begun...')
        return None

    # Checkpoints are the files starting with 'w'. The score is the text between
    # the first '-' and the 5-character extension (presumably '.hdf5'),
    # e.g. 'weights.07-0.75.hdf5' -> 0.75.
    scored = [(float(w[w.find('-') + 1: -5]), w)
              for w in model_files if w.startswith('w')]
    if not scored:
        # Previously max() on an empty collection raised ValueError here.
        print('no weight checkpoints found...')
        return None

    # max() keeps the first maximum, i.e. the earliest-listed file on ties.
    val_score, best_model = max(scored, key=lambda pair: pair[0])
    return val_score, best_model
        
        
def retrieve_history(connection, model_dir, path_to_save_to = '.'):
    """Download ``loss_acc_history.p`` from a finished model folder.

    Args:
        connection: object exposing ``isdir``, ``listdir`` and
            ``get(remote, local)`` (a pysftp connection in this notebook).
        model_dir: remote directory of the model, with or without a trailing
            slash.
        path_to_save_to: local target. May be a file path or an existing
            directory; for a directory the file keeps its remote name.

    Returns:
        None. Nothing is downloaded (a message is printed instead) when the
        directory is missing or holds no history file.
    """
    history_name = 'loss_acc_history.p'
    if not connection.isdir(model_dir):
        print('model does not exist... exiting')
        return
    if history_name not in connection.listdir(model_dir):
        print('training did not finish or has not begun...')
        return

    # Always join with exactly one separator; plain concatenation produced
    # '<dir>loss_acc_history.p' whenever model_dir lacked a trailing slash.
    remote_path = model_dir.rstrip('/') + '/' + history_name
    # A directory (such as the default '.') cannot be opened as the target
    # file, so place the download inside it instead.
    if os.path.isdir(path_to_save_to):
        path_to_save_to = os.path.join(path_to_save_to, history_name)
    connection.get(remote_path, path_to_save_to)
    return
    
    
#this will plot standard training/validation progress thru epochs
def plot_training_history(hist, title):
    """Plot every series of a training-history mapping on a single axis.

    Args:
        hist: mapping from a series name to a sequence of values; each entry
            is drawn as one line labelled with its key (presumably one value
            per epoch, as the x-axis label suggests).
        title: text shown as the figure title.
    """
    # Sorted keys give a stable legend/colour order regardless of dict ordering.
    for field in sorted(hist):
        plt.plot(hist[field], label=field)
    plt.xlabel('epochs')
    plt.ylabel('Accuracy / Loss')
    # Lower-case keyword: mixed-case property names such as 'FontSize' are
    # rejected by current matplotlib releases.
    plt.title(title, fontsize = 20)
    plt.legend()
    plt.show()

    
#downloads file, plots it, and then deletes file
def plot_training_hist(connection, model_dir):
    """Fetch a model's training history over SFTP and plot it.

    The history file is downloaded into a private temporary directory that is
    always removed afterwards, including when the download or unpickling fails.

    Args:
        connection: connection object accepted by ``retrieve_history``.
        model_dir: remote model directory; also used as the plot title.

    Returns:
        None. Nothing is plotted when ``retrieve_history`` did not download a
        file (it prints the reason itself).
    """
    import shutil
    import tempfile

    scratch_dir = tempfile.mkdtemp()
    local_copy = os.path.join(scratch_dir, 'loss_acc_history.p')
    try:
        retrieve_history(connection, model_dir, path_to_save_to = local_copy)
        # retrieve_history returns silently when the model is missing or
        # unfinished; bail out instead of failing on a non-existent file.
        if not os.path.isfile(local_copy):
            return
        with open(local_copy, 'rb') as handle:
            hist = pickle.load(handle)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)
    plot_training_history(hist, model_dir)




## Determine Best Networks
The following code uses a secure file transfer protocol (SFTP) connection into epsilon, finds out which models are done training, and evaluates which are the top ten performing models on the validation set. It saves this information. 

In [10]:
#define parameters to iterate over
# Root of the project tree on the remote machine.
epsilon_ROOT = '/home/pablo/python/'

#CREATING NETWORK DIMENSIONS__________
# HDS holds one row per network: up to three hidden-layer widths, padded with
# zeros for layers that are absent. Rows are ordered 1-layer, 2-layer, 3-layer.
hidden_dimensions = [5, 20, 50, 100]
hidden_dimensions_red = [5, 50]   # reduced grid, used for the 3-layer networks

one_layer = [(hd1, 0, 0) for hd1 in hidden_dimensions]
two_layer = [(hd1, hd2, 0)
             for hd1, hd2 in itertools.product(hidden_dimensions, repeat=2)]
three_layer = list(itertools.product(hidden_dimensions_red, repeat=3))

no_models = len(one_layer) + len(two_layer) + len(three_layer)
# Index of the first 3-layer row.
counter = len(one_layer) + len(two_layer)
HDS = np.array(one_layer + two_layer + three_layer, dtype = int)


# Remote sub-folder (relative to epsilon_ROOT) for each recurrent cell type.
cellType_folders = {'RNN' : 'Models/RNN/OneHotBinaryMinimal/',
                    'LSTM' : 'Models/LSTM/Pablo/OneHotBinaryMinimal/'}

# Results table: one row per (sequence length, dataset) and, for each of the
# ranks 1-10, one column per piece of network information. Starts as zeros.
dataset_labels = [w[:-2] for w in fileNames]   # file names without the '.p' suffix
info_fields = ['val_score', 'cell', 'network', 'dir_path', 'file_path']
rows = pd.MultiIndex.from_product([datatype, dataset_labels],
                                  names = ['Seq_Length', 'Dataset'])
cols = pd.MultiIndex.from_product([info_fields, range(1, 11)],
                                  names = ['Network Info', 'Rank'])
MODEL_RESULTS = pd.DataFrame(np.zeros((len(rows), len(cols))),
                             index = rows, columns = cols)


In [11]:
# Rank every trained network per dataset by its best validation score and
# store the top ten (score, cell type, layout, remote paths) in MODEL_RESULTS.
verbose = 0
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from an untracked config file or environment variables.
# NOTE(review): this first address (…156) differs from the 'epsilon1' entry of
# the `desktops` dict in the evaluation cell (…153) -- confirm which is right.
# The connections are left open on purpose so they can be reused interactively.
epsilon_connection = pysftp.Connection('10.81.104.156', username='pablo', password='pablo2014')
epsilon2_connection = pysftp.Connection('10.81.104.143', username='pablo', password='pablo2015')
remote_desktops = [epsilon2_connection]
model_iterator = itertools.product(remote_desktops, fileNames, ['Med'])

print('working on %i datasets' % (len(fileNames)))
# Trailing comma after print(...): Python 2 idiom that suppresses the newline.
print('finished: '),
for iterator_index, (connection, fileName, dataPrep) in enumerate(model_iterator):

    dataset_label = fileName[:-2]   # file name without the '.p' suffix
    # One entry per finished network: (val_score, cell_type, network_name,
    # best_model). A list is used instead of a dict keyed by checkpoint file
    # name, because two networks whose best checkpoint shares the same epoch
    # and score have identical file names and used to overwrite each other.
    candidates = []
    print('%i, ' % (iterator_index)),
    for cell_type, hd in itertools.product(['RNN', 'LSTM'], HDS):
        # Zero entries of hd mark absent layers and are left out of the name.
        network_name = '_D_'.join([dataset_label] + [str(w) for w in hd if w > 0])
        model_dir = epsilon_ROOT + cellType_folders[cell_type] + dataPrep + '/' \
                  + network_name
        start = time.time()
        if connection.isdir(model_dir):
            tmp_val = retrieve_val_acc(connection, model_dir)
        else:
            tmp_val = None
        if verbose > 1: print('retrieval time: %.3f sec' % (time.time() - start))
        if tmp_val:
            val_score, best_model = tmp_val
            candidates.append((val_score, cell_type, network_name, best_model))

    # Ten highest scores, best first.
    top_ten = sorted(candidates, key=lambda entry: entry[0])[-10:][::-1]
    row_key = idx[dataPrep, dataset_label]
    for index, (val_score, cell_type, network_name, best_model) in enumerate(top_ten):
        rank = 1 + index
        dir_path = epsilon_ROOT \
                    + cellType_folders[cell_type] \
                    + dataPrep + '/' \
                    + network_name + '/'
        file_path = dir_path + best_model
        if verbose: print('model directory exists:%s' % connection.isdir(dir_path))
        if verbose: print('model exists:%s' % connection.isfile(file_path))
        MODEL_RESULTS.loc[row_key, idx['val_score', rank]] = val_score
        MODEL_RESULTS.loc[row_key, idx['cell', rank]] = cell_type
        # Store only the layer layout: strip the '<dataset>_' prefix.
        MODEL_RESULTS.loc[row_key, idx['network', rank]] = \
                                            network_name[len(dataset_label) + 1:]
        MODEL_RESULTS.loc[row_key, idx['dir_path', rank]] = dir_path
        MODEL_RESULTS.loc[row_key, idx['file_path', rank]] = file_path


MODEL_RESULTS.sort_index(axis = 0, inplace = True)
MODEL_RESULTS.sort_index(axis = 1, inplace = True)
print('done')
with open('/Users/pablomartin/python/tmp/MODEL_RESULTS.p', 'wb') as handle:
    pickle.dump(MODEL_RESULTS, handle)
print(MODEL_RESULTS.loc[idx['Med',:], idx['val_score',1:5]])

working on 19 datasets
finished:  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  done
Network Info                        val_score                        
Rank                                        1     2     3     4     5
Seq_Length Dataset                                                   
Med        DSR_TbyT_Contra               0.73  0.72  0.72  0.72  0.72
           DSR_TbyT_FirstTraining        0.70  0.70  0.70  0.69  0.69
           DSR_TbyT_Ipsi                 0.75  0.74  0.74  0.74  0.74
           DSR_TbyT_MPFC                 0.77  0.76  0.75  0.75  0.75
           DSR_TbyT_MidTraining          0.74  0.71  0.70  0.70  0.70
           DSR_TbyT_Naive_OFC            0.77  0.77  0.77  0.75  0.75
           DSR_TbyT_Naive_mPFC           0.80  0.77  0.77  0.77  0.77
           DSR_TbyT_OFC                  0.76  0.76  0.76  0.76  0.75
           DSR_TbyT_Saline               0.74  0.74  0.74  0.74  0.74
           DSR_TbyT_Saline_Shuffled 

In [12]:
# Rank-1 network of every dataset for the 'Med' sequence length.
top_rank_fields = ['cell', 'val_score', 'network', 'file_path']
print(MODEL_RESULTS.loc[idx['Med', :], idx[top_rank_fields, 1]])

Network Info                         cell  \
Rank                                    1   
Seq_Length Dataset                          
Med        DSR_TbyT_Contra            RNN   
           DSR_TbyT_FirstTraining     RNN   
           DSR_TbyT_Ipsi             LSTM   
           DSR_TbyT_MPFC             LSTM   
           DSR_TbyT_MidTraining       RNN   
           DSR_TbyT_Naive_OFC         RNN   
           DSR_TbyT_Naive_mPFC        RNN   
           DSR_TbyT_OFC               RNN   
           DSR_TbyT_Saline            RNN   
           DSR_TbyT_Saline_Shuffled  LSTM   
           PSR_TbyT_Contra            RNN   
           PSR_TbyT_FirstTraining    LSTM   
           PSR_TbyT_Ipsi              RNN   
           PSR_TbyT_MPFC             LSTM   
           PSR_TbyT_MidTraining       RNN   
           PSR_TbyT_OFC               RNN   
           PSR_TbyT_Saline           LSTM   
           PSR_TbyT_Saline_Rigged     RNN   
           PSR_TbyT_Saline_Shuffled   RNN   

Network I

## Best Network Performance on Train, Validate, and Test
Let's see how they generalize !

In [13]:
# Evaluate the rank-1 network of every dataset on its train / validate / test
# split and collect the accuracies in `scores`.
with open('/Users/pablomartin/python/tmp/MODEL_RESULTS.p', 'rb') as handle:
    MODEL_RESULTS = pickle.load(handle)
dataPrep = 'Med'
remote = 'epsilon2'

# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from an untracked config file or environment variables.
desktops = {'epsilon1': {'ip': '10.81.104.153', 'password' : 'pablo2014'},
            'epsilon2': {'ip': '10.81.104.143', 'password' : 'pablo2015'}}

scores = pd.DataFrame(np.zeros([len(fileNames),7]),
                      index = fileNames,
                      columns = ['train_choice','validate_choice','test_choice',
                                 'train','validate','test', 'model'])
splits = ['train', 'validate', 'test']


for index, fileName in enumerate(fileNames):

    sequence_path = '/Users/pablomartin/python/' + \
                    'DATA_structures/RNN_sequences/OneHotBinaryMinimal/' + \
                    dataPrep + '/' + fileName
    with open(sequence_path, 'rb') as handle:
        seqs = pickle.load(handle)

    file_path = MODEL_RESULTS.loc[idx[dataPrep, fileName[:-2]], idx['file_path',1]]
    model_target = '/Users/pablomartin/python/Models/Winners/' + fileName[:-2] + '.hdf5'
    scores.loc[fileName, 'model'] = file_path
    #downloading model if not available in local machine
    # Connect only when a download is actually needed, so cached models can be
    # evaluated without network access.
    if not os.path.isfile(model_target):
        # Trailing comma after print(...): Python 2 idiom suppressing the newline.
        print('downloading %s (%i/%i) ...' % (fileName, 1 + index, len(fileNames))),
        with pysftp.Connection(desktops[remote]['ip'],
                               username='pablo',
                               password=desktops[remote]['password']) as connection:
            connection.get(file_path, model_target)
        print(' done')
    print('loading model ...'),
    model = load_model(model_target)
    print(' done')
    print('evaluating model ...'),

    for split in splits:
        X = getattr(seqs, 'X_' + split)
        y = getattr(seqs, 'y_' + split)
        # Assumes the model was compiled with exactly one metric (accuracy), so
        # evaluate() returns [loss, acc] -- TODO confirm against training code.
        loss, acc = model.evaluate(x = X, y = y, verbose = 0)
        scores.loc[fileName, split] = acc
        #choice accuracy
        # Class index > 1 is used as the binary choice. Presumably the labels
        # are one-hot over reward + 2 * choice -- verify against Sequences.
        pred_choice = np.argmax(model.predict(x = X), axis = 1) > 1
        true_choice = np.argmax(y, axis = 1) > 1
        scores.loc[fileName, split + '_choice'] = \
            float(np.sum(pred_choice == true_choice)) / len(X)
        # Sanity check: a correct class implies a correct choice, so choice
        # accuracy can never be lower. The small tolerance absorbs float
        # rounding differences between the two ways of computing accuracy.
        assert scores.loc[fileName, split + '_choice'] >= scores.loc[fileName, split] - 1e-6

    print(' done')
print(scores.loc[:, ['train_choice','validate_choice','test_choice']])

downloading DSR_TbyT_FirstTraining.p (1/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_Naive_mPFC.p (2/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_Naive_OFC.p (3/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_MidTraining.p (4/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_Saline.p (5/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_MPFC.p (6/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_OFC.p (7/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_Ipsi.p (8/19) ...  done
loading model ...  done
evaluating model ...  done
downloading DSR_TbyT_Contra.p (9/19) ...  done
loading model ...  done
evaluating model ...  done
downloading PSR_TbyT_FirstTraining.p (10/19) ...  done
loading model ...  done
evaluating model ...  done
