<a href="https://colab.research.google.com/github/philqc/IFT6135H19_TP_2/blob/master/tp2_ift_6135.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-1.0.0-{platform}-linux_x86_64.whl torchvision

import torch 
import torch.nn
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import argparse
import collections
import os
import sys
import numpy
import numpy as np
import math, copy, time
import matplotlib.pyplot as plt

[K    100% |████████████████████████████████| 753.6MB 50.3MB/s 
[31mfastai 1.0.46 has requirement numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.[0m
[?25h

Set your directory path here

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# directory path to store results/plots/models
dir_path = 'drive/My Drive/dev_2_6135/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


Problem 1 and 2: GRU and RNN

In [0]:
def clones(module, N):
    """
    A helper function for producing N identical layers (each with their own parameters).
    
    inputs: 
        module: a pytorch nn.module
        N (int): the number of copies of that module to return

    returns:
        a ModuleList with the copies of the module (the ModuleList is itself also a module)
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

  
class RNN_cell(nn.Module):

    def __init__(self, in_size, out_size, dp_keep_prob):
        super().__init__()
        self.Wup = nn.Linear(in_size, out_size, bias=False)
        self.Wside = nn.Linear(out_size, out_size)
        self.dropout = nn.Dropout(1 - dp_keep_prob)

    def forward(self, x, hidden_last):
        x = self.Wup(self.dropout(x))
        h = self.Wside(hidden_last)
        return torch.tanh(x + h)


class GRU_cell(nn.Module):
    def __init__(self, in_size, hidden_size, dp_keep_prob):
        super().__init__()

        self.Uh = nn.Linear(hidden_size, hidden_size)
        self.Ur = nn.Linear(hidden_size, hidden_size)
        self.Uz = nn.Linear(hidden_size, hidden_size)
        self.Wh = nn.Linear(in_size, hidden_size, bias=False)
        self.Wr = nn.Linear(in_size, hidden_size, bias=False)
        self.Wz = nn.Linear(in_size, hidden_size, bias=False)
        self.dropout = nn.Dropout(1 - dp_keep_prob)

    def forward(self, x, hidden_last):
        x = self.dropout(x)

        r = torch.sigmoid(self.Wr(x) + self.Ur(hidden_last))
        z = torch.sigmoid(self.Wz(x) + self.Uz(hidden_last))
        h = torch.tanh(self.Wh(x) + self.Uh(torch.mul(r, hidden_last)))
        hidden = torch.mul(torch.ones_like(z) - z, hidden_last) + torch.mul(z, h)
        return hidden


class RNN_base(nn.Module):
    def __init__(self, mode, emb_size, hidden_size, seq_len, batch_size,
                 vocab_size, num_layers, dp_keep_prob):
        super().__init__()
        self.seq_len = seq_len
        self.num_layers = num_layers
        self.dp_keep_prob = dp_keep_prob
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.rnn = []
        for i in range(num_layers):
            in_size = emb_size if i == 0 else hidden_size
            if mode == 'GRU':
                self.rnn.append(GRU_cell(in_size, hidden_size, dp_keep_prob))
            else:
                self.rnn.append(RNN_cell(in_size, hidden_size, dp_keep_prob))

        self.rnn = nn.ModuleList(self.rnn)

        self.Why = nn.Linear(hidden_size, vocab_size)

        self.embed = nn.Embedding(vocab_size, emb_size)
        self.dropout = nn.Dropout(1 - dp_keep_prob)

        # Initialize the weights
        self.init_weights_uniform()

    def init_weights_uniform(self):
        self.Why.bias.data.fill_(0)
        torch.nn.init.uniform_(self.Why.weight, -0.1, 0.1)
        torch.nn.init.uniform_(self.embed.weight, -0.1, 0.1)

    def init_hidden(self):
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)
        return hidden  # a parameter tensor of shape (self.num_layers, self.batch_size, self.hidden_size)

    def forward(self, inputs, hidden):
        logits = []
        for word in inputs:
            for j in range(self.num_layers):
                if j == 0:
                    x = self.embed(word)
                else:
                    x = hidden[j - 1].clone()
                hidden[j] = self.rnn[j](x, hidden[j].clone())

            logit = self.Why(self.dropout(hidden[-1].clone()))
            logits.append(logit)

        logits = torch.stack(logits)

        return logits.view(self.seq_len, self.batch_size, self.vocab_size), hidden

    def generate(self, inputs, hidden, generated_seq_len, temperature=1):
        """
        Arguments:
            - inputs: A mini-batch of input tokens (NOT sequences!)
                            shape: (batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                            shape: (num_layers, batch_size, hidden_size)
            - generated_seq_len: The length of the sequence to generate.
                           Note that this can be different than the length used
                           for training (self.seq_len)
            - temperature: float
                
        Returns:
            - Sampled sequences of tokens
                        shape: (generated_seq_len, batch_size)
        """
        samples = []
        for i in range(generated_seq_len):
            for j in range(self.num_layers):
                if j == 0:
                    x = self.embed(inputs)
                else:
                    x = hidden[j - 1].clone()
                hidden[j] = self.rnn[j](x, hidden[j].clone())
            
            out = self.Why(hidden[-1].clone())
                           
            # temperature adjustment
            out /= temperature
            
            probs = F.softmax(out, dim=-1)
                    
            # Sample from Categorical distribution
            dist = torch.distributions.categorical.Categorical(probs=probs)
            # inputs is the current output
            inputs = dist.sample()
                           
            samples.append(inputs)
      
        samples = torch.stack(samples)
        return samples
      
      
# Problem 1
class RNN(RNN_base):  # Implement a stacked vanilla RNN with Tanh nonlinearities.
    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        super(RNN, self).__init__('RNN', emb_size, hidden_size, seq_len, batch_size,
                                  vocab_size, num_layers, dp_keep_prob)


# Problem 2
class GRU(RNN_base):  # Implement a stacked GRU RNN
    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        super(GRU, self).__init__('GRU', emb_size, hidden_size, seq_len, batch_size,
                                  vocab_size, num_layers, dp_keep_prob)

Problem 3: Transformer  (Attention is all you need)

In [0]:
import pdb

class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, n_units, dropout=0.1):
        """
        n_heads: the number of attention heads
        n_units: the number of output units
        dropout: probability of DROPPING units
        """
        super(MultiHeadedAttention, self).__init__()
        # This sets the size of the keys, values, and queries (self.d_k) to all
        # be equal to the number of output units divided by the number of heads.
        self.d_k = n_units // n_heads
        # This requires the number of n_heads to evenly divide n_units.
        assert n_units % n_heads == 0
        self.n_units = n_units
        self.n_heads = n_heads

        self.dropout = nn.Dropout(p=dropout)
        self.W_q = nn.Linear(n_units, n_units)
        self.W_k = nn.Linear(n_units, n_units)
        self.W_v = nn.Linear(n_units, n_units)
        self.W_out = nn.Linear(n_units, n_units)

        self.init_weights()

    def init_weights(self):
        k = np.sqrt(1 / self.n_units)
        for W in [self.W_q, self.W_k, self.W_v, self.W_out]:
            torch.nn.init.uniform_(W.weight, -k, k)
            torch.nn.init.uniform_(W.bias, -k, k)

    def forward(self, query, key, value, mask=None):
      
        batch_size, seq_len = query.shape[0], query.shape[1]

        query, key, value = self.W_q(query), self.W_k(key), self.W_v(value)
        # we now make the switch from n_units to heads by keys
        # and we also want a tensor of shape batch_size x heads x seq_len x d_k
        query = query.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        key = key.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        value = value.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        # we want atn of shape batch_size x n_heads x seq_len x seq_len
        atn = torch.matmul(query, key.transpose(3, 2)) / np.sqrt(self.d_k)

        # Apply mask, for each i, if mask[i] is true, keep value
        # else replace it by -10e9 instead of 0 for num. stability
        if mask is not None:
            # have to add an arbitrary dimension so that the mask can be computed
            # on dim 2 and 3 of the attention variable
            mask = mask.view(batch_size, 1, seq_len, seq_len)
            atn = atn.masked_fill(mask == 0, -10e9)

        # apply softmax layer on last dimension
        atn = F.softmax(atn, dim=3)
        # apply dropout to the attention values
        atn = self.dropout(atn)
        
        heads = torch.matmul(atn, value)
        # concatenate all heads together
        heads = heads.transpose(2, 1).contiguous()
        heads = heads.view(batch_size, seq_len, self.n_units)
        out = self.W_out(heads)
        return out  # size: (batch_size, seq_len, self.n_units)
        

# ----------------------------------------------------------------------------------
# The encodings of elements of the input sequence

class WordEmbedding(nn.Module):
    def __init__(self, n_units, vocab):
        super(WordEmbedding, self).__init__()
        self.lut = nn.Embedding(vocab, n_units)
        self.n_units = n_units

    def forward(self, x):
        # print (x)
        return self.lut(x) * math.sqrt(self.n_units)


class PositionalEncoding(nn.Module):
    def __init__(self, n_units, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, n_units)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_units, 2).float() *
                             -(math.log(10000.0) / n_units))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)],
                         requires_grad=False)
        return self.dropout(x)


# ----------------------------------------------------------------------------------
# The TransformerBlock and the full Transformer


class TransformerBlock(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(TransformerBlock, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(ResidualSkipConnectionWithLayerNorm(size, dropout), 2)

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))  # apply the self-attention
        return self.sublayer[1](x, self.feed_forward)  # apply the position-wise MLP


class TransformerStack(nn.Module):
    """
    This will be called on the TransformerBlock (above) to create a stack.
    """

    def __init__(self, layer, n_blocks):  # layer will be TransformerBlock (below)
        super(TransformerStack, self).__init__()
        self.layers = clones(layer, n_blocks)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class FullTransformer(nn.Module):
    def __init__(self, transformer_stack, embedding, n_units, vocab_size):
        super(FullTransformer, self).__init__()
        self.transformer_stack = transformer_stack
        self.embedding = embedding
        self.output_layer = nn.Linear(n_units, vocab_size)

    def forward(self, input_sequence, mask):
        embeddings = self.embedding(input_sequence)
        return F.log_softmax(self.output_layer(self.transformer_stack(embeddings, mask)), dim=-1)


def TRANSFORMER(vocab_size, n_blocks=6,
               n_units=512, n_heads=16, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(n_heads, n_units)
    ff = MLP(n_units, dropout)
    position = PositionalEncoding(n_units, dropout)
    model = FullTransformer(
        transformer_stack=TransformerStack(TransformerBlock(n_units, c(attn), c(ff), dropout), n_blocks),
        embedding=nn.Sequential(WordEmbedding(n_units, vocab_size), c(position)),
        n_units=n_units,
        vocab_size=vocab_size
    )

    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


# ----------------------------------------------------------------------------------
# Data processing

def subsequent_mask(size):
    """ helper function for creating the masks. """
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


class Batch:
    "Object for holding a batch of data with mask during training."

    def __init__(self, x, pad=0):
        self.data = x
        self.mask = self.make_mask(self.data, pad)

    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide future words."
        mask = (data != pad).unsqueeze(-2)
        mask = mask & Variable(
            subsequent_mask(data.size(-1)).type_as(mask.data))
        return mask


# ----------------------------------------------------------------------------------
# Some standard modules

class LayerNorm(nn.Module):
    "layer normalization, as in: https://arxiv.org/abs/1607.06450"

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


class ResidualSkipConnectionWithLayerNorm(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(ResidualSkipConnectionWithLayerNorm, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))


class MLP(nn.Module):
    """
    This is just an MLP with 1 hidden layer
    """

    def __init__(self, n_units, dropout=0.1):
        super(MLP, self).__init__()
        self.w_1 = nn.Linear(n_units, 2048)
        self.w_2 = nn.Linear(2048, n_units)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))


ptb-lm file from TAs.

In [10]:
#!/bin/python
# coding: utf-8

# Code outline/scaffold for 
# ASSIGNMENT 2: RNNs, Attention, and Optimization
# By Tegan Maharaj, David Krueger, and Chin-Wei Huang
# IFT6135 at University of Montreal
# Winter 2019
#
# based on code from:
#    https://github.com/deeplearningathome/pytorch-language-model/blob/master/reader.py
#    https://github.com/ceshine/examples/blob/master/word_language_model/main.py
#    https://github.com/teganmaharaj/zoneout/blob/master/zoneout_word_ptb.py
#    https://github.com/harvardnlp/annotated-transformer

# GENERAL INSTRUCTIONS: 
#    - ! IMPORTANT! 
#      Unless we're otherwise notified we will run exactly this code, importing 
#      your models from models.py to test them. If you find it necessary to 
#      modify or replace this script (e.g. if you are using TensorFlow), you 
#      must justify this decision in your report, and contact the TAs as soon as 
#      possible to let them know. You are free to modify/add to this script for 
#      your own purposes (e.g. monitoring, plotting, further hyperparameter 
#      tuning than what is required), but remember that unless we're otherwise 
#      notified we will run this code as it is given to you, NOT with your 
#      modifications.
#    - We encourage you to read and understand this code; there are some notes 
#      and comments to help you.
#    - Typically, all of your code to submit should be written in models.py; 
#      see further instructions at the top of that file / in TODOs.
#          - RNN recurrent unit 
#          - GRU recurrent unit
#          - Multi-head attention for the Transformer
#    - Other than this file and models.py, you will probably also write two 
#      scripts. Include these and any other code you write in your git repo for 
#      submission:
#          - Plotting (learning curves, loss w.r.t. time, gradients w.r.t. hiddens)
#          - Loading and running a saved model (computing gradients w.r.t. hiddens, 
#            and for sampling from the model)

# PROBLEM-SPECIFIC INSTRUCTIONS:   
#    - For Problems 1-3, paste the code for the RNN, GRU, and Multi-Head attention 
#      respectively in your report, in a monospace font.
#    - For Problem 4.1 (model comparison), the hyperparameter settings you should run are as follows:
#          --model=RNN --optimizer=ADAM --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
#          --model=GRU --optimizer=SGD_LR_SCHEDULE --initial_lr=10 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
#          --model=TRANSFORMER --optimizer=SGD_LR_SCHEDULE --initial_lr=20 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=6 --dp_keep_prob=0.9 --save_best
#    - In those experiments, you should expect to see approximately the following
#      perplexities:
#                  RNN: train:  120  val: 157
#                  GRU: train:   65  val: 104
#          TRANSFORMER:  train:  67  val: 146
#    - For Problem 4.2 (exploration of optimizers), you will make use of the 
#      experiments from 4.1, and should additionally run the following experiments:
#          --model=RNN --optimizer=SGD --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 
#          --model=GRU --optimizer=SGD --initial_lr=10 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35
#          --model=TRANSFORMER --optimizer=SGD --initial_lr=20 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=6 --dp_keep_prob=.9
#          --model=RNN --optimizer=SGD_LR_SCHEDULE --initial_lr=1 --batch_size=20 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.35
#          --model=GRU --optimizer=ADAM --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35
#          --model=TRANSFORMER --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=.9
#    - For Problem 4.3 (exloration of hyperparameters), do your best to get 
#      better validation perplexities than the settings given for 4.1. You may 
#      try any combination of the hyperparameters included as arguments in this 
#      script's ArgumentParser, but do not implement any additional 
#      regularizers/features. You may (and will probably want to) run a lot of 
#      different things for just 1-5 epochs when you are trying things out, but 
#      you must report at least 3 experiments on each architecture that have run
#      for at least 40 epochs.
#    - For Problem 5, perform all computations / plots based on saved models 
#      from Problem 4.1. NOTE this means you don't have to save the models for 
#      your exploration, which can make things go faster. (Of course
#      you can still save them if you like; just add the flag --save_best). 
#    - For Problem 5.1, you can modify the loss computation in this script 
#      (search for "LOSS COMPUTATION" to find the appropriate line. Remember to 
#      submit your code.
#    - For Problem 5.3, you must implement the generate method of the RNN and 
#      GRU.  Implementing this method is not considered part of problems 1/2 
#      respectively, and will be graded as part of Problem 5.3




# NOTE ==============================================
# This is where your models are imported
#from models import RNN #, GRU
#from models import make_model as TRANSFORMER


##############################################################################
#
# ARG PARSING AND EXPERIMENT SETUP
#
##############################################################################

parser = argparse.ArgumentParser(description='PyTorch Penn Treebank Language Modeling')

# Arguments you may need to set to run different experiments in 4.1 & 4.2.
parser.add_argument('--data', type=str, default=dir_path + '/data',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='GRU',
                    help='type of recurrent net (RNN, GRU, TRANSFORMER)')
parser.add_argument('--optimizer', type=str, default='SGD_LR_SCHEDULE',
                    help='optimization algo to use; SGD, SGD_LR_SCHEDULE, ADAM')
parser.add_argument('--seq_len', type=int, default=35,
                    help='number of timesteps over which BPTT is performed')
parser.add_argument('--batch_size', type=int, default=20,
                    help='size of one minibatch')
parser.add_argument('--initial_lr', type=float, default=10,
                    help='initial learning rate')
parser.add_argument('--hidden_size', type=int, default=1500,
                    help='size of hidden layers. IMPORTANT: for the transformer\
                    this must be a multiple of 16.')
parser.add_argument('--save_best', type=int, default=True,
                    help='save the model for the best validation performance')
parser.add_argument('--num_layers', type=int, default=2,
                    help='number of LSTM layers')

#model=RNN --optimizer=ADAM --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
#-model=TRANSFORMER --optimizer=SGD_LR_SCHEDULE --initial_lr=20 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=6 --dp_keep_prob=0.9 --save_best
#model=GRU --optimizer=SGD_LR_SCHEDULE --initial_lr=10 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
# Other hyperparameters you may want to tune in your exploration
parser.add_argument('--emb_size', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--num_epochs', type=int, default=40,
                    help='number of epochs to stop after')
parser.add_argument('--dp_keep_prob', type=float, default=0.35,
                    help='dropout *keep* probability (dp_keep_prob=0 means no dropout')

# Arguments that you may want to make use of / implement more code for
parser.add_argument('--debug', action='store_true') 
parser.add_argument('--save_dir', type=str, default=dir_path,
                    help='path to save the experimental config, logs, model \
                    This is automatically generated based on the command line \
                    arguments you pass and only needs to be set if you want a \
                    custom dir name')
parser.add_argument('--evaluate', action='store_true',
                    help="use this flag to run on the test set. Only do this \
                    ONCE for each model setting, and only after you've \
                    completed ALL hyperparameter tuning on the validation set.\
                    Note we are not requiring you to do this.")

# DO NOT CHANGE THIS (setting the random seed makes experiments deterministic, 
# which helps for reproducibility)
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')

args = parser.parse_args(args=[])
argsdict = args.__dict__
argsdict['code_file'] = sys.argv[0]

# Use the model, optimizer, and the flags passed to the script to make the 
# name for the experimental dir
print("\n########## Setting Up Experiment ######################")
flags = [flag.lstrip('--') for flag in sys.argv[1:]]
experiment_path = os.path.join(args.save_dir+'_'.join([argsdict['model'],
                                         argsdict['optimizer']] 
                                         ))

# Increment a counter so that previous results with the same args will not
# be overwritten. Comment out the next four lines if you only want to keep
# the most recent results.
i = 0
while os.path.exists(experiment_path + "_" + str(i)):
    i += 1
experiment_path = experiment_path + "_" + str(i)

# Creates an experimental directory and dumps all the args to a text file
os.mkdir(experiment_path)
print ("\nPutting log in %s"%experiment_path)
argsdict['save_dir'] = experiment_path
with open (os.path.join(experiment_path,'exp_config.txt'), 'w') as f:
    for key in sorted(argsdict):
        f.write(key+'    '+str(argsdict[key])+'\n')

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

# Use the GPU if you have one
if torch.cuda.is_available():
    print("Using the GPU")
    device = torch.device("cuda") 
else:
    print("WARNING: You are about to run on cpu, and this will likely run out \
      of memory. \n You can try setting batch_size=1 to reduce memory usage")
    device = torch.device("cpu")


###############################################################################
#
# DATA LOADING & PROCESSING
#
###############################################################################

# HELPER FUNCTIONS
def _read_words(filename):
    with open(filename, "r") as f:
      return f.read().replace("\n", "<eos>").split()

def _build_vocab(filename):
    data = _read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict((v, k) for k, v in word_to_id.items())

    return word_to_id, id_to_word

def _file_to_word_ids(filename, word_to_id):
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

# Processes the raw data from text files
def ptb_raw_data(data_path=None, prefix="ptb"):
    train_path = os.path.join(data_path, prefix + ".train.txt")
    valid_path = os.path.join(data_path, prefix + ".valid.txt")
    test_path = os.path.join(data_path, prefix + ".test.txt")

    word_to_id, id_2_word = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    return train_data, valid_data, test_data, word_to_id, id_2_word

# Yields minibatches of data
def ptb_iterator(raw_data, batch_size, num_steps):
    raw_data = np.array(raw_data, dtype=np.int32)

    data_len = len(raw_data)
    batch_len = data_len // batch_size
    data = np.zeros([batch_size, batch_len], dtype=np.int32)
    for i in range(batch_size):
        data[i] = raw_data[batch_len * i:batch_len * (i + 1)]

    epoch_size = (batch_len - 1) // num_steps

    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        x = data[:, i*num_steps:(i+1)*num_steps]
        y = data[:, i*num_steps+1:(i+1)*num_steps+1]
        yield (x, y)


class Batch:
    "Data processing for the transformer. This class adds a mask to the data."
    def __init__(self, x, pad=0):
        self.data = x
        self.mask = self.make_mask(self.data, pad)
    
    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide future words."

        def subsequent_mask(size):
            """ helper function for creating the masks. """
            attn_shape = (1, size, size)
            subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
            return torch.from_numpy(subsequent_mask) == 0

        mask = (data != pad).unsqueeze(-2)
        mask = mask & Variable(
            subsequent_mask(data.size(-1)).type_as(mask.data))
        return mask


# LOAD DATA
print('Loading data from '+args.data)
raw_data = ptb_raw_data(data_path=args.data)
train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
vocab_size = len(word_to_id)
print('  vocabulary size: {}'.format(vocab_size))


###############################################################################
# 
# MODEL SETUP
#
###############################################################################

# NOTE ==============================================
# This is where your model code will be called. You may modify this code
# if required for your implementation, but it should not typically be necessary,
# and you must let the TAs know if you do so.
if args.model == 'RNN':
    model = RNN(emb_size=args.emb_size, hidden_size=args.hidden_size, 
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers, 
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'GRU':
    model = GRU(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'TRANSFORMER':
    if args.debug:  # use a very small model
        model = TRANSFORMER(vocab_size=vocab_size, n_units=16, n_blocks=2)
    else:
        # Note that we're using num_layers and hidden_size to mean slightly
        # different things here than in the RNNs.
        # Also, the Transformer also has other hyperparameters
        # (such as the number of attention heads) which can change it's behavior.
        model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                            n_blocks=args.num_layers, dropout=1.-args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    print("Model type not recognized.")


model = model.to(device)

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE    
lr = args.initial_lr
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0 # we will not touch lr for the first m_flat_lr epochs


###############################################################################
# 
# DEFINE COMPUTATIONS FOR PROCESSING ONE EPOCH
#
###############################################################################

def repackage_hidden(h):
    """
    Wraps hidden states in new Tensors, to detach them from their history.
    
    This prevents Pytorch from trying to backpropagate into previous input 
    sequences when we use the final hidden states from one mini-batch as the 
    initial hidden states for the next mini-batch.
    
    Using the final hidden states in this way makes sense when the elements of 
    the mini-batches are actually successive subsequences in a set of longer sequences.
    This is the case with the way we've processed the Penn Treebank dataset.
    """
    if isinstance(h, Variable):
        return h.detach_()
    else:
        return tuple(repackage_hidden(v) for v in h)


def run_epoch(model, data, is_train=False, lr=1.0):
    """
    One epoch of training/validation (depending on flag is_train).
    """
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.seq_len
    start_time = time.time()
    if args.model != 'TRANSFORMER':
        hidden = model.init_hidden()
        hidden = hidden.to(device)
    costs = 0.0
    iters = 0
    b_time, f_time = 0, 0
    losses = []

    # LOOP THROUGH MINIBATCHES
    for step, (x, y) in enumerate(ptb_iterator(data, model.batch_size, model.seq_len)):
        if args.model == 'TRANSFORMER':
            batch = Batch(torch.from_numpy(x).long().to(device))
            model.zero_grad()
            forward_time = time.time()
            outputs = model.forward(batch.data, batch.mask).transpose(1,0)
            f_time += time.time() - forward_time
            #print ("outputs.shape", outputs.shape)
        else:
            inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)#.cuda()
            model.zero_grad()
            hidden = repackage_hidden(hidden)
            
            forward_time = time.time()
            outputs, hidden = model(inputs, hidden)
            f_time += time.time() - forward_time

        targets = torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous().to(device)#.cuda()
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.seq_len))

        # LOSS COMPUTATION
        # This line currently averages across all the sequences in a mini-batch 
        # and all time-steps of the sequences.
        # For problem 5.3, you will (instead) need to compute the average loss 
        #at each time-step separately. 
        loss = loss_fn(outputs.contiguous().view(-1, model.vocab_size), tt)
        costs += loss.data.item() * model.seq_len
        losses.append(costs)
        iters += model.seq_len
        if args.debug:
            print(step, loss)
        if is_train:  # Only update parameters if training 
            backward_time = time.time()
            loss.backward()
            b_time += time.time() - backward_time
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            if args.optimizer == 'ADAM':
                optimizer.step()
            else: 
                for p in model.parameters():
                    if p.grad is not None:
                        p.data.add_(-lr, p.grad.data)
            if step % 10 == 0:
              print('\rstep: {}; loss: {:.5f}; costs: {:.2f}; speed (wps) {:.2f}; b_time = {:.2f}; f_time = {:.2f}'
                    ''.format(step, loss, costs, iters * model.batch_size / (time.time() - start_time), b_time, f_time),
                    end='')
    print('')          
    return np.exp(costs / iters), losses






########## Setting Up Experiment ######################

Putting log in drive/My Drive/dev_2_6135/GRU_SGD_LR_SCHEDULE_0
Using the GPU
Loading data from drive/My Drive/dev_2_6135//data
  vocabulary size: 10000


In [11]:
###############################################################################
#
# RUN MAIN LOOP (TRAIN AND VAL)
#
###############################################################################

print("\n########## Running Main Loop ##########################")
train_ppls = []
train_losses = []
val_ppls = []
val_losses = []
best_val_so_far = np.inf
times = []
wall_clock_time = []
time_start = time.time()

# In debug mode, only run one epoch
if args.debug:
    num_epochs = 1 
else:
    num_epochs = args.num_epochs

# MAIN LOOP
for epoch in range(num_epochs):
    t0 = time.time()
    print('\nEPOCH '+str(epoch)+' ------------------')
    if args.optimizer == 'SGD_LR_SCHEDULE':
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr * lr_decay # decay lr if it is time

    # RUN MODEL ON TRAINING DATA
    train_ppl, train_loss = run_epoch(model, train_data, True, lr)

    # RUN MODEL ON VALIDATION DATA
    with torch.no_grad():
      val_ppl, val_loss = run_epoch(model, valid_data)


    # SAVE MODEL IF IT'S THE BEST SO FAR
    if val_ppl < best_val_so_far:
        best_val_so_far = val_ppl
        if args.save_best:
            print("Saving model parameters to best_params.pt")
            torch.save(model.state_dict(), os.path.join(args.save_dir, 'best_params.pt'))
        # NOTE ==============================================
        # You will need to load these parameters into the same model
        # for a couple Problems: so that you can compute the gradient 
        # of the loss w.r.t. hidden state as required in Problem 5.2
        # and to sample from the the model as required in Problem 5.3
        # We are not asking you to run on the test data, but if you 
        # want to look at test performance you would load the saved
        # model and run on the test data with batch_size=1

    # LOC RESULTS
    train_ppls.append(train_ppl)
    val_ppls.append(val_ppl)
    train_losses.extend(train_loss)
    val_losses.extend(val_loss)
    wall_clock_time.append(time.time() - time_start)
    times.append(time.time() - t0)
    
    log_str = 'epoch: ' + str(epoch) + '\t' \
            + 'train ppl: ' + str(train_ppl) + '\t' \
            + 'val ppl: ' + str(val_ppl)  + '\t' \
            + 'best val: ' + str(best_val_so_far) + '\t' \
            + 'time (s) spent in epoch: ' + str(times[-1])
    print(log_str)
    with open (os.path.join(args.save_dir, 'log.txt'), 'a') as f_:
        f_.write(log_str+ '\n')

# SAVE LEARNING CURVES
lc_path = os.path.join(args.save_dir, 'learning_curves.npy')
print('\nDONE\n\nSaving learning curves to ' + lc_path)
np.save(lc_path, {'train_ppls': train_ppls, 
                  'val_ppls': val_ppls, 
                  'train_losses': train_losses,
                  'val_losses': val_losses,
                  'time_per_epoch': times,
                  'wall_clock_time': wall_clock_time})
# NOTE ==============================================
# To load these, run 
# >>> x = np.load(lc_path)[()]
# You will need these values for plotting learning curves (Problem 4)



########## Running Main Loop ##########################

EPOCH 0 ------------------
step: 10; loss: 7.86136; costs: 3571.56; speed (wps) 1334.25; b_time = 3.33; f_time = 0.83

KeyboardInterrupt: ignored

Generate samples (Number 5.3)

In [13]:
path_model = dir_path + 'GRU_SGD_LR_SCHEDULE_0_26_epochs_success/' + 'best_params.pt'

batch_size = 10
generate_len = 35

print('batch_size =', batch_size, 'length_generate =', generate_len)

model = GRU(emb_size=args.emb_size, hidden_size=args.hidden_size, 
            seq_len=args.seq_len, batch_size=batch_size,
            vocab_size=vocab_size, num_layers=args.num_layers, 
            dp_keep_prob=args.dp_keep_prob).to(device)

model.load_state_dict(torch.load(path_model))
model.eval()


with torch.no_grad():
  
  hidden = model.init_hidden()
  hidden = hidden.to(device)
    
  for step, (x, y) in enumerate(ptb_iterator(test_data, batch_size, model.seq_len)):
      inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)
      hidden = repackage_hidden(hidden)
      samples = model.generate(inputs[0], hidden, generate_len, temperature=1)    
      
        
      samples = samples.transpose(1, 0)
      print('inputs_shape =', inputs.shape, 
            'y_shape =', y.shape, 
            'sample shape =', samples.shape, '\n\n')
      # change from ints to strings
      for i, seq in enumerate(samples):
          # print starting word for context
          print(id_2_word[y[i, 0]], ' ', end='')
          for word in seq:
              print(id_2_word[word.item()] + ' ', end='')  
          
          print('\n')
        
      break


batch_size = 10 length_generate = 35
inputs_shape = torch.Size([35, 10]) y_shape = (10, 35) sample shape = torch.Size([10, 35]) 


it  longer admit as the present diamond of a marxist recommendation on the incinerator <eos> one black-and-white for example is <unk> nearly a more valuable health story that it will <unk> the bigger temperatures of free 

recent  new york contributed to this article <eos> this thick its training real estate formula <eos> many californians accuse young the founding of the columns and the amount is slipping chunks to do so it knows 

japanese  york the chicago city <eos> iowa is <unk> by grant service to push packages in the communities after the quake is a very dramatic blow <eos> a state official says its lawyers opened with exchange 

<unk>  n't excessive <eos> it represents the kind of state the greatest user over most of the <unk> and borrowers the white house said they should seek to see society corporate <eos> he said that the 

<unk>  the management 