In [21]:
# set up packages
import numpy as np
import matplotlib.pyplot as plt
import pickle
import time
import sys
import torch
import os
import random
import math

from load_data_funs import load_data, gen_batch_data_fixations_choice, gen_batch_data_fixations_only, gen_batch_data_choice_only
from neural_nets import SimpleLSTM, SimpleMLP

In [2]:
# Run configuration.
# `train_setting` indexes both the list of batch generators and the list of input
# sizes defined in later cells; `job_idx` is passed as the seed to the data loader
# and to torch.
train_setting, job_idx = 0, 0

# Switch between the cluster file system and a local copy of the data.
on_cluster = True

sim_data_path, human_data_path = (
    (
        '/scratch/gpfs/erussek/RNN_project/optimal_fixation_sims',
        '/scratch/gpfs/erussek/RNN_project/human_trials.json',
    )
    if on_cluster
    else (
        '/Users/erussek/Dropbox/Griffiths_Lab_Stuff/Data/RNNs/optimal_fixation_sims',
        '/Users/erussek/Dropbox/Griffiths_Lab_Stuff/Data/RNNs/human_trials.json',
    )
)

In [3]:
# Batch generators, ordered so that `train_setting` picks the one to train with.
train_data_funcs = [
    gen_batch_data_fixations_choice,
    gen_batch_data_fixations_only,
    gen_batch_data_choice_only,
]
this_data_func = train_data_funcs[train_setting]

In [4]:

# function to test model...
def test(model, test_sim_data, criterion, device, batch_size, n_total_seq, gen_batch_data,use_human_data = False):
    # Set the model to evaluation mode. This will turn off layers that would
    # otherwise behave differently during training, such as dropout.
    model.eval()
    
    n_total_seq = 1000

    n_batches = int(np.round(n_total_seq / batch_size));

    loss_res = np.zeros((n_batches, 1), dtype=float)

    # A context manager is used to disable gradient calculations during inference
    # to reduce memory usage, as we typically don't need the gradients at this point.
    with torch.no_grad():
        for batch_idx in range(n_batches):
            data, target = gen_batch_data(batch_size, batch_idx, test_sim_data, use_human_data=use_human_data)
            data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)

            output = model(data)
            
            to_keep = target != 0
            target = target[to_keep]
            output = output[to_keep]
            
            # target = target.argmax(dim=1)
            loss = criterion(output, target)  # is this just for the last batch?

            # store the loss
            loss_res[batch_idx] = loss.item()

    return np.mean(loss_res)

In [5]:

def train_with_intermediate_tests(model, train_sim_data, test_sim_data, criterion, optimizer, device, batch_size, n_total_seq, gen_batch_data, use_human_data = False, model_name = "", n_epochs = 1):
    """Train `model` and record its held-out loss at regular intervals.

    Each epoch runs ``round(n_total_seq / batch_size)`` batches. Batches are
    requested from ``gen_batch_data`` by their within-epoch index, so every
    epoch asks for the same batch indices. Every 50 global batches (skipping
    the first batch of an epoch) the module-level ``test`` function is called
    and its result is stored.

    Args:
        model: torch module, called as ``model(data)``.
        train_sim_data: data container forwarded to ``gen_batch_data`` for training.
        test_sim_data: data container forwarded to ``test`` for evaluation.
        criterion: loss callable ``criterion(output, target)``.
        optimizer: torch optimizer already bound to the model parameters.
        device: torch device the batch tensors are moved to.
        batch_size: number of sequences per batch.
        n_total_seq: number of training sequences per epoch.
        gen_batch_data: callable
            ``(batch_size, batch_idx, data, use_human_data=...)`` returning a
            ``(data, target)`` pair of numpy arrays.
        use_human_data: forwarded to ``gen_batch_data`` and ``test``.
        model_name: currently unused; kept for interface compatibility.
        n_epochs: number of passes over the batch indices.

    Returns:
        Tuple ``(model, test_losses, train_num)`` where ``test_losses`` is an
        array of held-out losses and ``train_num`` the matching number of
        training sequences processed (``batch_size * batches_done``).
    """
    # Training mode turns on layers that behave differently during evaluation,
    # such as dropout.
    model.train()

    # Evaluate on held-out data every this many (global) batches.
    test_every = 50

    n_batches = int(np.round(n_total_seq/batch_size))

    loss_res = []
    train_num = []

    print('n_epochs: '+str(n_epochs))

    for epoch_idx in range(n_epochs):
        print(epoch_idx)
        for batch_idx in range(n_batches):

            # Batch counter across epochs; used for the test schedule and bookkeeping.
            this_batch_idx = n_batches*epoch_idx + batch_idx

            # Request a batch, convert it to tensors and move it to the device.
            data, target = gen_batch_data(batch_size, batch_idx, train_sim_data, use_human_data=use_human_data)
            data = torch.from_numpy(data).float().to(device)
            target = torch.from_numpy(target).long().to(device)

            # Forward pass.
            output = model(data)

            # The target is cast to float32 so it matches the dtype of the model output.
            target = target.to(torch.float32)

            # Padding is encoded as 0 in the target; train only on real entries.
            to_keep = target != 0
            loss = criterion(output[to_keep], target[to_keep])

            # Clear stale gradients, backpropagate, and update the parameters.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if this_batch_idx % test_every == 0 and batch_idx > 0:
                test_loss = test(model, test_sim_data, criterion, device, batch_size, n_total_seq, gen_batch_data, use_human_data=use_human_data)
                # Evaluation puts the model in eval mode; switch back so dropout
                # (and similar layers) stay active for the remaining batches.
                model.train()

                loss_res.append(test_loss)
                # Number of training sequences processed so far (was a hard-coded
                # 200 per batch, which did not follow `batch_size`).
                train_num.append(batch_size*(this_batch_idx+1))

                print('batch num' + str(batch_idx) + ' loss: ' + str(test_loss), flush=True)

    return model, np.array(loss_res), np.array(train_num)

In [6]:
# Load the data sets via the project helper, seeding it with the job index.
# NOTE(review): presumably returns simulated train/test sets plus the human trials — confirm
# against load_data_funs.load_data.
train_data_sim, test_data_sim, human_data = load_data(sim_data_path, human_data_path,this_seed=job_idx)
# Select the batch generator for the current training setting (repeats the assignment made
# in the earlier cell, so this cell can be re-run on its own).
this_data_func = train_data_funcs[train_setting]

In [7]:
# Hyperparameters for the learning-curve run.
# NOTE(review): an earlier comment here mentioned 1.5 million examples; with the values
# below a single epoch covers n_total_seq = 50,000 sequences — confirm the intended size.
batch_size = 32
n_total_seq = 5e4
n_batches = int(np.round(n_total_seq / batch_size))
n_tests = int(np.ceil(n_batches / 200)) - 1

# One input width per training setting; `train_setting` selects the active one.
input_sizes = [6, 3, 3]

# Seed torch's global random number generator with the job index.
torch.manual_seed(job_idx)

input_size = input_sizes[train_setting]  # width of each input vector
hidden_size = 50                         # number of hidden units
output_size = 3                          # width of the model output

In [41]:
#### now create the model
import torch
import torch.nn as nn
import torch.nn.functional as F

# set up neural networks
class SimpleGRU(nn.Module):
    """Single GRU layer followed by a linear readout applied at every time step."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the readout.

        Args:
            input_size: number of features per time step.
            hidden_size: number of GRU hidden units.
            output_size: number of outputs produced per time step.
        """
        super().__init__()
        # The GRU is built with batch_first=True, i.e. inputs are (batch, time, features).
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the GRU over `x` and map each step's hidden state through the readout."""
        per_step_hidden, _last_hidden = self.gru(x)
        return self.linear(per_step_hidden)
    

In [36]:
#### now create the transfoermer model # we want a position encoding -> transformer encoder -> linear readout -- for now, just a single layer
class PositionalEncoding(nn.Module):
    r"""Inject information about the position of each token in the sequence.

    The positional encodings have the same dimension as the embeddings, so that
    the two can be summed. Sine and cosine functions of different frequencies
    are used:

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})

        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})

    where ``pos`` is the position in the sequence and ``i`` indexes pairs of
    embedding dimensions.

    Args:
        d_model: the embed dim (required). Both even and odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine columns;
        # trimming div_term keeps the shapes compatible (no-op for even d_model).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (max_len, 1, d_model) so the table broadcasts over the batch dimension.
        pe = pe.unsqueeze(1)
        # Registered as a buffer: follows the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Add the positional encoding to `x` and apply dropout.

        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

    
# now make a transformer - https://github.com/pytorch/examples/blob/main/word_language_model/model.py
class SimpleTransformer(nn.Module):
    """Token embedding -> positional encoding -> transformer encoder -> linear readout.

    Adapted from the PyTorch word-language-model example.

    Args:
        ntoken: size of the embedding vocabulary; also the width of the readout.
        input_size: embedding dimension used throughout the encoder.
        hidden_size: width of the feed-forward sublayer of each encoder layer.
        output_size: accepted but currently unused; the readout maps to ``ntoken``.
        nlayers: number of stacked encoder layers.
        nhead: number of attention heads.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self, ntoken, input_size, hidden_size, output_size, nlayers = 1, nhead = 1, dropout=.1): # do you want dropout? this was .5 before...
        super().__init__()

        # Only a genuine import failure is reported as a missing feature; other
        # errors (including KeyboardInterrupt) propagate unchanged.
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError as err:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') from err

        # Cached attention mask; rebuilt in forward() when the input length changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(input_size, dropout)
        encoder_layers = TransformerEncoderLayer(input_size, nhead, hidden_size, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, input_size)
        self.ninp = input_size
        # NOTE(review): the readout width is ntoken, not output_size — confirm intent.
        self.decoder = nn.Linear(input_size, ntoken)

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, src, has_mask=True):
        """Encode `src` and return the raw readout (no softmax applied).

        Args:
            src: tensor of indices passed to the embedding layer. The mask is
                sized by ``len(src)``, i.e. the first dimension of `src`.
            has_mask: when True, apply the square subsequent mask so a position
                cannot attend to later positions; when False, no mask is used.

        Returns:
            Output of the linear readout applied to the encoder output.
        """
        if has_mask:
            device = src.device
            # Reuse the cached mask unless the leading dimension has changed.
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        # Embeddings are scaled by sqrt(embedding dim) before adding positions.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output#F.log_softmax(output, dim=-1)





In [42]:
# ?torch.nn.GRU
# Learning rate handed to the optimizer below.
this_lr = .001

# Build the GRU model together with its loss and optimizer.
model = SimpleGRU(input_size,hidden_size,output_size)
criterion   = torch.nn.MSELoss()
optimizer   = torch.optim.RMSprop(model.parameters(), lr= this_lr) # switch to adam?
# Training is pinned to the CPU; the commented-out expression is the CUDA-aware alternative.
device = torch.device("cpu")#torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Last expression of the cell: moves the model and displays its structure.
model.to(device)


SimpleGRU(
  (gru): GRU(6, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=3, bias=True)
)

In [38]:
# Quick interactive check of the configured output width.
output_size

3

In [39]:
# Vocabulary size passed to the transformer (embedding table size and readout width).
ntoken=3
this_lr = .001

# NOTE(review): this cell rebinds `model`, `criterion` and `optimizer`, replacing the GRU
# objects from the cell above when the notebook is run top to bottom, while the training
# cell below still passes model_name='GRU' — confirm which model is meant to be trained.
# NOTE(review): SimpleTransformer feeds its input through nn.Embedding, whereas the training
# loop converts batches with .float(); this looks incompatible — confirm before training.
model = SimpleTransformer(ntoken, input_size, hidden_size, output_size, nlayers = 1, nhead = 1, dropout=.1)
criterion   = torch.nn.MSELoss()
optimizer   = torch.optim.RMSprop(model.parameters(), lr= this_lr) # switch to adam?
# Training is pinned to the CPU; the commented-out expression is the CUDA-aware alternative.
device = torch.device("cpu")#torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Last expression of the cell: moves the model and displays its structure.
model.to(device)

SimpleTransformer(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=6, out_features=6, bias=True)
        )
        (linear1): Linear(in_features=6, out_features=50, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=50, out_features=6, bias=True)
        (norm1): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (encoder): Embedding(3, 6)
  (decoder): Linear(in_features=6, out_features=3, bias=True)
)

In [43]:
# Train the current `model` for the default single epoch, collecting held-out losses
# and the matching training-progress counts along the way.
trained_model, loss_res, train_num = train_with_intermediate_tests(
    model=model,
    train_sim_data=train_data_sim,
    test_sim_data=test_data_sim,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    batch_size=batch_size,
    n_total_seq=n_total_seq,
    gen_batch_data=this_data_func,
    model_name='GRU',
)


n_epochs: 1
0
batch num50 loss: 5.325952345325101
batch num100 loss: 5.304128177704349
batch num150 loss: 5.3417938909223


KeyboardInterrupt: 

In [19]:
# Check whether torch's MPS backend is available on this machine.
print(torch.backends.mps.is_available())


True


In [None]:
# Scratch cell: replays the setup of the training loop and a single forward pass by hand,
# outside train_with_intermediate_tests, so intermediate tensors can be inspected.
# NOTE(review): this rebinds loss_res / train_num (also assigned by the training cell above)
# and n_batches, so running it discards those earlier results.
model.train()

# What metric to store?
# num_correct = 0

# Number of batches needed to cover n_total_seq sequences at the current batch size.

# how many batches
n_batches = int(np.round(n_total_seq/batch_size));
n_epochs=1

loss_res = []
train_loss_res = []
train_num = []

print('n_epochs: '+str(n_epochs))
# Fixed indices standing in for the loop variables of the training function.
epoch_idx = 1
batch_idx = 1


# Global batch counter, computed the same way as inside the training loop.
this_batch_idx = n_batches*epoch_idx + batch_idx
#print(this_batch_idx)

gen_batch_data = this_data_func

# Request one batch of sequences and targets, convert them into tensors
# of the correct type, and then send them to the appropriate device.
data, target = gen_batch_data(batch_size, batch_idx, train_data_sim, use_human_data=False)
data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)
# Forward pass only; no loss, backward pass or optimizer step is performed here.
output = model(data)  # Step

