<a href="https://colab.research.google.com/github/eyyupoglu/02456-deep-learning-with-PyTorch/blob/master/working_back_and_forth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
import os
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'ptb.train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'ptb.valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'ptb.test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:

            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [0]:
###############################################################################
# Load data
###############################################################################
batch_size = 20
corpus = Corpus('./gdrive/My Drive/nlp/data/raw/penn-treebank')

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [0]:

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = RNN_mehmet(ninp, nhid).to(device)
#         if rnn_type in ['LSTM', 'GRU']:
#             self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
#         else:
#             try:
#                 nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
#             except KeyError:
#                 raise ValueError( """An invalid option for `--model` was supplied,
#                                  options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
#             self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))  
        ext_output = decoded.view(output.size(0), output.size(1), decoded.size(1))
        return ext_output, hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [0]:
###############################################################################
# Build the model
###############################################################################
emsize = 200
nhid = 200
nlayers = 1
dropout = 0.5
tied = False

ntokens = len(corpus.dictionary)
model = RNNModel('LSTM', ntokens, emsize, nhid, nlayers, dropout, tied).to(device)
print(model)

criterion = nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(10000, 200)
  (rnn): RNN_mehmet()
  (decoder): Linear(in_features=200, out_features=10000, bias=True)
)


In [0]:
###############################################################################
# Training code
###############################################################################

bptt = 35
clip = 0.25
log_interval = 100
epochs = 25

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)


# get_batch subdivides the source data into chunks of length bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
#     print('N-tokens', ntokens)
    hidden, _ = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, 2 - 1, bptt)):
        data, targets = get_batch(train_data, i)
#         print(data.shape)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        print('output ', output.size())
        print('output view', output.view(-1, ntokens).size())
        print('targets', targets.size())
        loss = criterion(output.view(-1, ntokens), targets)
        print('loss', loss)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
           
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = 20
best_val_loss = None
save_file = 'best_model'

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save_file, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


RuntimeError: ignored

In [0]:

# coding: utf-8

# In[1]:

import itertools
import operator
from datetime import datetime
import sys
from torch import FloatTensor
from torch.autograd import Variable
from torch import nn


vocabulary_size = 8000


class RNN_mehmet(nn.Module):
    def __init__(self, word_dim, hidden_dim = 100, activation = 'sigmoid'):
        super(RNN_mehmet, self).__init__()

        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.weights_hh = self.init_weights((hidden_dim, hidden_dim))
        self.weights_xh = self.init_weights((hidden_dim, word_dim))
        self.weights_o = self.init_weights((word_dim, hidden_dim))
        self.activation = getattr(torch, activation)


  
    def init_weights(self, dim):
        return nn.Parameter(torch.FloatTensor(dim[0], dim[1]).uniform_(-np.sqrt(1./dim[0]), np.sqrt(1./dim[1])), requires_grad=True)
    def softmax(self, x):
        xt = torch.exp(x - torch.max(x))
        return xt / xt.sum()
      
    
    def init_hidden(self, batch_size, dim):
        layer = torch.zeros((1, batch_size, self.hidden_dim),  requires_grad = True)
        if dim > 1:
           layer = (layer.clone(), layer.clone())
        return layer
      
    def step(self, lr):
        for p in self.parameters():
            p.data.add_(-lr, p.grad.data)
      
    def forward_step(self, xt, hidden_t_1):
        # clone weights
        weights_xh = self.weights_xh.clone()
        weights_hh = self.weights_hh.clone()
        weights_o = self.weights_o.clone()
        
        # calculate left and right terms
        left_term = F.linear(xt, weights_xh)
        right_term = F.linear(hidden_t_1, weights_hh)
        
        # sum terms
        sum_ = left_term + right_term
        
        # activation for hidden state
        hidden_t = self.activation(sum_)
        
        # calculate output
        output = F.linear(hidden_t, weights_o)
        return output, hidden_t

    def forward_propagation(self, x, hidden_t_1):
        # Get sequence length (bptt), batch_size from the input
        bptt, batch_size, _ = x.size()
        output = torch.zeros((bptt, batch_size, self.word_dim))
        
        # loop over sequence
        for t in torch.arange(bptt): 
            xt = x[t,:,:]
         
            output[t], hidden_t_1 = self.forward_step(xt, hidden_t_1)
        return [output, hidden_t_1]
      
    def __call__(self, x, hidden_t_1):
        return self.forward_propagation(x, hidden_t_1)

fake_net = RNN_mehmet(emsize, hidden_dim = 130)

#forward_prop
# test forward pass
x = np.random.normal(0, 1, (bptt, batch_size, emsize)).astype('float32')
x = torch.Tensor(torch.from_numpy(x))

hidden = fake_net.init_hidden(batch_size, 1)

y = np.random.normal(0, 1, (bptt * batch_size, emsize)).astype('float32')
y = torch.Tensor(torch.from_numpy(y)).long()


print(result[0].size())



#example backward and 10 steps

for i in range(10):
    fake_net.zero_grad()
    result = fake_net.forward_propagation(x, hidden)
    output =  result[0].view(-1, emsize)

    loss_fn = nn.MSELoss()
    loss = loss_fn(output, y.float())
    print(loss)
    loss.backward()
    fake_net.step(0.01)



torch.Size([35, 20, 200])
tensor(0.8565, grad_fn=<MseLossBackward>)
tensor(0.8537, grad_fn=<MseLossBackward>)
tensor(0.8509, grad_fn=<MseLossBackward>)
tensor(0.8481, grad_fn=<MseLossBackward>)
tensor(0.8454, grad_fn=<MseLossBackward>)
tensor(0.8426, grad_fn=<MseLossBackward>)
tensor(0.8399, grad_fn=<MseLossBackward>)
tensor(0.8372, grad_fn=<MseLossBackward>)
tensor(0.8346, grad_fn=<MseLossBackward>)
tensor(0.8319, grad_fn=<MseLossBackward>)


In [0]:
# New RNN Layer

import torch.nn.functional as F

class RNNCustom(nn.Module):
  
  def __init__(self, ninput, nhid, activation = 'sigmoid'):
      self.ninput = ninput
      self.nhid = nhid
      self.weights_hh = self.init_weights((nhid, nhid))
      self.weights_xh = self.init_weights((nhid, ninput))
      self.activation = getattr(F, activation)
      self._modules = {}
    
  def init_weights(self, dimensions):
      return torch.zeros(dimensions, requires_grad = True).to(device)
      
  def init_hidden(self):
      layer = torch.zeros((1, batch_size, self.nhid),  requires_grad = True).to(device)
      return (layer, layer)
    
  def step(self, xt, ht_1):
      # calculate product of weights and inputs
      print('xt', xt.size())
      print('weights', self.weights_xh.size())
      xt = F.linear(xt, self.weights_xh)
      ht_1 = F.linear(ht_1, self.weights_hh)
      
      # return activation of concatenated products
      return self.activation(xt + ht_1), ht_1
      
    
  def forward(self, x, hidden):
      # Get sequence length (bptt), batch_size from the input
      bptt, batch_size, _ = x.size()
      
      # intialize output
      output = torch.zeros((bptt, batch_size, self.ninput), requires_grad = True).to(device)
      
      # hidden layers
      ht_1, ht = hidden
      
      # loop over sequence
      for i in range(bptt):
        
        # slice input 
        xt = x[i,:,:]
       
        # store step output
        output[i,:,:], ht = self.step(xt, ht_1)
        
        # update hidden states
        ht_1 = ht
      
      # return output, (hidden, hidden)
      return output, (ht_1, ht)
    
  def __call__(self, x, hidden):
      return self.forward(x, hidden)
    

# test forward pass
x = np.random.normal(0, 1, (bptt, batch_size, emsize)).astype('float32')

# double hidden only necesary for LSTM
hidden_layer = np.zeros((1, batch_size, nhid)).astype('float32')
hidden_layer_tensor = torch.Tensor(torch.from_numpy(hidden_layer)).to(device)
hidden = (hidden_layer_tensor, hidden_layer_tensor)


# output is still very different
rnn_pt = nn.RNN(emsize, nhid)
output, h = rnn_pt(torch.Tensor(torch.from_numpy(x)))
# print('RNN TORCH', output)

rnn_cust = RNNCustom(ninput = emsize, nhid = nhid)
output, h = rnn_cust(torch.Tensor(torch.from_numpy(x)).to(device), hidden)
# print('RNN cust', output)

xt torch.Size([20, 200])
weights torch.Size([300, 200])




RuntimeError: ignored

# Dimensionality Analysis

I used the implementation above to go through all the different dimensions. Maybe this helps you understanding how to implement the RNN / LSTM layer.

I also tried looking at: https://www.quora.com/In-LSTM-how-do-you-figure-out-what-size-the-weights-are-supposed-to-be

Also, look here: https://github.com/pytorch/pytorch/blob/v0.3.0/torch/nn/_functions/rnn.py

### Some variables

* bptt = "backpropagation through time", but here used as the sequence length that we feed at once. It's given in dim=0 aling the 'seq_len' dimension in the LSTM given in the original code. Given above to be 35.
* bsz = "batch size", number of sequences looked at at once, in our case set to 20 in dim=1
* ntokens = len(vocab), total number of different tokens in the data
* len(text) = total number of tokens of the whole text
* nhid = number of values in hidden layer
* emsize = embedding size

### Step by step

1. in **corpus** only the index of every word is kept, so every word goes from dimension *ntokens* to a scalar value:   
      dim token: [ntokens] -> [1]

2. after **batchify(training_data)** , dividing the total text by the batch size and having *batch_size* many sequences:  
      dim: [len(text)] -> [len(text) / bsz, bsz]

3. after **get_batch(data, i)** get on sequence of size bptt for every batch:   
    dim data: [len(text) / bsz,  bsz] -> [bptt, bsz]  
    dim target: [len(text)] -> [bptt * bsz]
    
4. In the Net  
    1. Input dim: [bptt, bsz]
    2. hidden layer dim: (ht-1, ht): ([1, bsz, nhid], [1, bsz, nhid])
    3. embedding layer dim: [bptt, bsz, emsize]
    4. lstm layer dim: [bptt, bsz, emsize], hidden layer

In [0]:
# [method_name for method_name in dir(nn.RNN)
#  if callable(getattr(nn.RNN, method_name))]

for i in rnn_pt.named_parameters():
  print(i[0], i[1])

print(rnn_pt.forward)

for el in dir(rnn_pt):
  print(el)

weight_ih_l0 Parameter containing:
tensor([[ 0.0570,  0.0325,  0.0189,  ...,  0.0003,  0.0020, -0.0051],
        [-0.0652, -0.0458, -0.0119,  ...,  0.0604,  0.0518, -0.0349],
        [ 0.0103, -0.0330,  0.0539,  ..., -0.0286, -0.0249, -0.0535],
        ...,
        [-0.0342,  0.0368, -0.0072,  ..., -0.0362, -0.0624, -0.0534],
        [-0.0208, -0.0699, -0.0090,  ...,  0.0071, -0.0498,  0.0638],
        [ 0.0366,  0.0317,  0.0204,  ..., -0.0263,  0.0156,  0.0087]],
       requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[-0.0653,  0.0089, -0.0366,  ...,  0.0704, -0.0615,  0.0476],
        [-0.0279, -0.0296, -0.0317,  ...,  0.0018, -0.0658,  0.0076],
        [ 0.0561,  0.0579,  0.0167,  ..., -0.0332, -0.0495, -0.0416],
        ...,
        [ 0.0557, -0.0258, -0.0673,  ..., -0.0306,  0.0201, -0.0108],
        [-0.0630, -0.0141,  0.0312,  ..., -0.0008,  0.0000, -0.0391],
        [-0.0467,  0.0008,  0.0345,  ...,  0.0381, -0.0057, -0.0267]],
       requires_grad=True)
bias_ih_