In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
!git clone "https://github.com/leondz/emerging_entities_17"

Cloning into 'emerging_entities_17'...
remote: Enumerating objects: 53, done.[K
Unpacking objects:   1% (1/53)   Unpacking objects:   3% (2/53)   Unpacking objects:   5% (3/53)   Unpacking objects:   7% (4/53)   Unpacking objects:   9% (5/53)   Unpacking objects:  11% (6/53)   Unpacking objects:  13% (7/53)   Unpacking objects:  15% (8/53)   Unpacking objects:  16% (9/53)   Unpacking objects:  18% (10/53)   Unpacking objects:  20% (11/53)   Unpacking objects:  22% (12/53)   Unpacking objects:  24% (13/53)   Unpacking objects:  26% (14/53)   Unpacking objects:  28% (15/53)   Unpacking objects:  30% (16/53)   Unpacking objects:  32% (17/53)   Unpacking objects:  33% (18/53)   Unpacking objects:  35% (19/53)   Unpacking objects:  37% (20/53)   Unpacking objects:  39% (21/53)   Unpacking objects:  41% (22/53)   Unpacking objects:  43% (23/53)   Unpacking objects:  45% (24/53)   Unpacking objects:  47% (25/53)   Unpacking objects:  49% (26/53)   Unpacking objects:

### Imports

In [16]:
from collections import Counter
import itertools
from functools import reduce
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.init
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

### Utility Functions

In [4]:
def read_word_tags(file, caseless=True):
  """Read a two-column, tab-separated tagged corpus into sentences.

  Every non-blank line must contain exactly two tab-separated fields: the
  token and its tag. Whitespace-only lines separate sentences; consecutive
  blank lines do not produce empty sentences.

  Args:
    file: path of the file to read.
    caseless: when True the tokens are lower-cased. Tags are only stripped of
      surrounding whitespace, never case-folded.

  Returns:
    (words, tags): two parallel lists holding one inner list per sentence.

  Raises:
    AssertionError: if a non-blank line does not split into exactly two fields.
  """
  words, tags, temp_w, temp_t = [], [], [], []

  # Decode explicitly as UTF-8 so the result does not depend on the platform's
  # default locale encoding, and stream the file instead of loading it whole.
  with open(file, 'r', encoding='utf-8') as f:
    for line in f:
      if not line.isspace():
        feats = line.split('\t')
        assert len(feats) == 2, f"expected 'token<TAB>tag', got {line!r}"
        temp_w.append(feats[0].lower() if caseless else feats[0])
        temp_t.append(feats[1].strip())
      elif temp_w:
        # Blank line closes the sentence collected so far.
        words.append(temp_w)
        tags.append(temp_t)
        temp_w, temp_t = [], []

  # The file may end without a trailing blank line.
  if temp_w:
    words.append(temp_w)
    tags.append(temp_t)

  return words, tags

In [None]:
# Smoke test: parse the training file from the repository cloned above.
# NOTE(review): the absolute /content/... path assumes the clone was made with
# /content as the working directory — confirm before running elsewhere.
x,y = read_word_tags('/content/emerging_entities_17/wnut17train.conll')

In [5]:
def create_maps(words, tags, min_word_freq=5, min_char_freq=1):
  """Build the word, character and tag index maps for a tagged corpus.

  Index 0 is reserved for '<pad>' in all three maps; real entries start at 1
  and the remaining special symbols are appended after them.

  Args:
    words: list of sentences, each a list of token strings.
    tags: list of tag sequences parallel to `words`.
    min_word_freq: a word is kept only if it occurs strictly more often than
      this; other words are expected to be encoded as '<unk>'.
    min_char_freq: same strict threshold for characters. Words of a sentence
      are counted as if joined by single spaces, so ' ' is a character too.

  Returns:
    (word_map, char_map, tag_map): dicts from symbol to integer index.
    word_map/char_map contain '<pad>', '<unk>', '<end>'; tag_map contains
    '<pad>', '<start>', '<end>'.
  """
  word_freq, char_freq = Counter(), Counter()
  tag_set = set()
  for w, t in zip(words, tags):
    word_freq.update(w)
    # Characters of the sentence with one space between consecutive words.
    char_freq.update(' '.join(w))
    tag_set.update(t)

  word_map = {k: v + 1 for v, k in enumerate(w for w in word_freq if word_freq[w] > min_word_freq)}
  char_map = {k: v + 1 for v, k in enumerate(c for c in char_freq if char_freq[c] > min_char_freq)}
  # Sorted so tag indices are reproducible across runs (set order is not).
  tag_map = {k: v + 1 for v, k in enumerate(sorted(tag_set))}

  word_map['<pad>'] = 0
  word_map['<unk>'] = len(word_map)
  word_map['<end>'] = len(word_map)
  char_map['<pad>'] = 0
  char_map['<unk>'] = len(char_map)
  char_map['<end>'] = len(char_map)
  # '<pad>' must be inserted before '<start>': without it len(tag_map) equals
  # the highest real tag index and '<start>' would collide with that tag.
  tag_map['<pad>'] = 0
  tag_map['<start>'] = len(tag_map)
  tag_map['<end>'] = len(tag_map)

  return word_map, char_map, tag_map

In [6]:
def create_input_tensors(words, tags, word_map, char_map, tag_map):
  """Encode sentences and tags as padded LongTensors for a PyTorch dataset.

  Args:
    words: list of sentences, each a non-empty list of token strings.
    tags: list of tag sequences parallel to `words`.
    word_map: word -> index, must contain '<pad>', '<unk>', '<end>'.
    char_map: char -> index, must contain ' ', '<pad>', '<unk>', '<end>'.
    tag_map: tag -> index, must contain '<start>', '<end>'. '<pad>' is used
      when present; otherwise index 0 is used for padding.

  Returns:
    Tuple of LongTensors, one row per sentence:
      padded_wmaps      word indices followed by '<end>', padded
      padded_cmaps_f    forward character stream followed by '<end>', padded
      padded_cmaps_b    backward character stream followed by '<end>', padded
      padded_cmarkers_f positions of word boundaries (spaces) and of '<end>'
                        in the forward stream, padded with 0
      padded_cmarkers_b same for the backward stream, in word order
      padded_tmaps      prev_tag * len(tag_map) + cur_tag for every position
                        (recover cur_tag with `% len(tag_map)`), padded
      wmap_lengths      unpadded length of each row of padded_wmaps
      cmap_lengths      unpadded length of each row of padded_cmaps_f

  Raises:
    KeyError: if a tag or a required special symbol is missing from a map.
    ValueError: if `words` is empty.
  """
  word_unk, word_end, word_pad = word_map['<unk>'], word_map['<end>'], word_map['<pad>']
  char_unk, char_end, char_pad = char_map['<unk>'], char_map['<end>'], char_map['<pad>']
  space = char_map[' ']
  n_tags = len(tag_map)
  # Tag indices start at 1, so 0 is a safe pad value if the map has no '<pad>'.
  tag_pad = tag_map.get('<pad>', 0)

  # Encode sentences into word maps with <end>
  wmaps = [[word_map.get(w, word_unk) for w in s] + [word_end] for s in words]

  # Forward and backward character streams; every word is followed by a space
  # in its own reading direction.
  chars_f = [list(' '.join(s)) + [' '] for s in words]
  chars_b = [list(reversed([' '] + list(' '.join(s)))) for s in words]

  # Encode streams into forward and backward maps with <end>
  cmaps_f = [[char_map.get(c, char_unk) for c in s] + [char_end] for s in chars_f]
  cmaps_b = [[char_map.get(c, char_unk) for c in s] + [char_end] for s in chars_b]

  # Positions of spaces and of <end> (the last index of each stream). The
  # backward markers are reversed so that marker i still belongs to word i.
  cmarkers_f = [[i for i, c in enumerate(s) if c == space] + [len(s) - 1] for s in cmaps_f]
  cmarkers_b = [[i for i, c in enumerate(s) if c == space][::-1] + [len(s) - 1] for s in cmaps_b]

  # Encode tags into tag indices with <end>
  tmaps = [[tag_map[t] for t in s] + [tag_map['<end>']] for s in tags]

  # The CRF scores form a prev_tag x cur_tag matrix, so store the flattened
  # index of each transition; the first transition starts from <start>.
  tmaps = [[tag_map['<start>'] * n_tags + s[0]] +
           [s[i - 1] * n_tags + s[i] for i in range(1, len(s))]
           for s in tmaps]

  word_pad_len = max(len(s) for s in wmaps)
  # Forward and backward streams of a sentence have the same length.
  char_pad_len = max(len(s) for s in cmaps_f)

  # Sanity check
  assert word_pad_len == max(len(s) for s in tmaps)

  padded_wmaps, padded_cmaps_f, padded_cmaps_b = [], [], []
  padded_cmarkers_f, padded_cmarkers_b, padded_tmaps = [], [], []
  wmap_lengths, cmap_lengths = [], []

  for w, cf, cb, cmf, cmb, t in zip(wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps):
    # Sanity checks: one marker and one tag per word (plus <end>), and
    # equally long character streams for this sentence.
    assert len(w) == len(cmf) == len(cmb) == len(t)
    assert len(cf) == len(cb)

    word_fill = word_pad_len - len(w)
    char_fill = char_pad_len - len(cf)

    padded_wmaps.append(w + [word_pad] * word_fill)
    padded_cmaps_f.append(cf + [char_pad] * char_fill)
    padded_cmaps_b.append(cb + [char_pad] * char_fill)

    # Markers are padded with index 0
    padded_cmarkers_f.append(cmf + [0] * word_fill)
    padded_cmarkers_b.append(cmb + [0] * word_fill)

    padded_tmaps.append(t + [tag_pad] * word_fill)

    wmap_lengths.append(len(w))
    cmap_lengths.append(len(cf))

    # Sanity check
    assert len(padded_wmaps[-1]) == len(padded_tmaps[-1]) == len(padded_cmarkers_f[-1]) == len(padded_cmarkers_b[-1]) == word_pad_len
    assert len(padded_cmaps_f[-1]) == len(padded_cmaps_b[-1]) == char_pad_len

  padded_wmaps = torch.LongTensor(padded_wmaps)
  padded_cmaps_f = torch.LongTensor(padded_cmaps_f)
  padded_cmaps_b = torch.LongTensor(padded_cmaps_b)
  padded_cmarkers_f = torch.LongTensor(padded_cmarkers_f)
  padded_cmarkers_b = torch.LongTensor(padded_cmarkers_b)
  padded_tmaps = torch.LongTensor(padded_tmaps)
  wmap_lengths = torch.LongTensor(wmap_lengths)
  cmap_lengths = torch.LongTensor(cmap_lengths)

  return padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths

In [7]:
def init_embedding(input_embedding):
  """Fill `input_embedding` in place with values drawn from U(-b, b).

  The bound is b = sqrt(3 / input_embedding.size(1)). Nothing is returned.
  """
  emb_dim = input_embedding.size(1)
  bound = np.sqrt(3.0 / emb_dim)
  nn.init.uniform_(input_embedding, a=-bound, b=bound)

In [8]:
def load_embeddings(emb_file, word_map, expand_vocab=True):
  """Load pre-trained word embeddings from a space-separated text file.

  Each line of `emb_file` is expected to hold a word followed by its vector
  components. The embedding size is taken from the first line.

  Args:
    emb_file: path of the embedding file.
    word_map: word -> index map of the corpus. When `expand_vocab` is True it
      is extended IN PLACE with the out-of-corpus words found in the file.
    expand_vocab: whether words that are in the file but not in `word_map`
      get an embedding row (appended after the in-corpus rows).

  Returns:
    (embeddings, word_map, lm_vocab_size): a FloatTensor with one row per
    entry of the returned word_map, the (possibly extended) word_map, and
    the size of word_map before it was extended.

  Raises:
    ValueError: if a vector component cannot be parsed as a float.
  """
  def parse_vector(fields):
    # Drop empty/whitespace-only tokens (e.g. from trailing spaces).
    return [float(t) for t in fields if t and not t.isspace()]

  with open(emb_file, 'r', encoding='utf-8') as f:
    emb_len = len(parse_vector(f.readline().split(' ')[1:]))

  print(f"Embedding length is {emb_len}")

  # In-corpus rows start out uniformly initialised; words found in the file
  # overwrite their row below, the rest keep the random initialisation.
  ic_embs = torch.FloatTensor(len(word_map), emb_len)
  init_embedding(ic_embs)

  ooc_words, ooc_embeds = [], []
  if expand_vocab:
    print("You have selected to include out of corpus embeddings")
  else:
    print("Not including out of corpus word embeddings")

  print("\nLoading Embeddings :)")

  # Context manager so the file handle is closed even if parsing fails.
  with open(emb_file, 'r', encoding='utf-8') as f:
    for line in f:
      fields = line.split(' ')
      emb_word = fields[0]
      in_corpus = emb_word in word_map

      if not in_corpus and not expand_vocab:
        continue

      embedding = parse_vector(fields[1:])
      if in_corpus:
        ic_embs[word_map[emb_word]] = torch.FloatTensor(embedding)
      else:
        ooc_words.append(emb_word)
        ooc_embeds.append(embedding)

  # Size of the corpus vocabulary, measured before any expansion.
  lm_vocab_size = len(word_map)

  embeddings = ic_embs
  if expand_vocab:
    print("Updating Word Map")
    for word in ooc_words:
      word_map[word] = len(word_map)
    if ooc_embeds:
      ooc_embs = torch.from_numpy(np.asarray(ooc_embeds, dtype=np.float32))
      embeddings = torch.cat([ic_embs, ooc_embs], 0)

  assert embeddings.size(0) == len(word_map)

  print(f'Process completed successfully.\nEmbedding vocabulary: {len(word_map)}\n Language Model Vocabulary: {lm_vocab_size}')

  return embeddings, word_map, lm_vocab_size

In [9]:
def clip_gradient(optimizer, grad_clip):
  """Clamp every existing gradient of `optimizer`'s parameters, in place,
  to the range [-grad_clip, grad_clip]. Parameters without a gradient are
  left untouched."""
  lower, upper = -grad_clip, grad_clip
  all_params = (p for group in optimizer.param_groups for p in group['params'])
  for p in all_params:
    if p.grad is None:
      continue
    p.grad.data.clamp_(lower, upper)

In [10]:
def save_checkpoint(epoch, model, optimizer, val_f1, word_map, char_map, tag_map, lm_vocab_size, is_best):
  """Write the training state to 'checkpoint_lm_lstm_crf.pth.tar' in the
  current working directory; when `is_best` is true, write a second copy
  prefixed with 'BEST_'.

  The saved dict holds the epoch, the validation F1 (under key 'f1'), the
  model and optimizer objects, the three index maps and lm_vocab_size.
  """
  filename = 'checkpoint_lm_lstm_crf.pth.tar'

  state = dict(epoch=epoch,
               f1=val_f1,
               model=model,
               optimizer=optimizer,
               word_map=word_map,
               tag_map=tag_map,
               char_map=char_map,
               lm_vocab_size=lm_vocab_size)

  targets = [filename, 'BEST_' + filename] if is_best else [filename]
  for target in targets:
    torch.save(state, target)

In [11]:
class AverageMeter(object):
  """Running statistics of a metric: the latest value (`val`), the weighted
  total (`sum`), the number of samples (`count`) and their mean (`avg`)."""

  def __init__(self):
    self.reset()

  def reset(self):
    """Return all statistics to zero."""
    self.val = self.avg = self.sum = self.count = 0

  def update(self, val, n=1):
    """Record `val` as the latest value, observed over `n` samples."""
    self.val = val
    self.count += n
    self.sum += val * n
    self.avg = self.sum / self.count

In [12]:
def adjust_learning_rate(optimizer, new_lr):
  """Set the learning rate of every parameter group of `optimizer` to
  `new_lr`, printing a notice before and the first group's rate after."""
  print("Decaying Learning rate")

  groups = optimizer.param_groups
  for idx in range(len(groups)):
    groups[idx]['lr'] = new_lr

  print(f"The new learning rate is now {optimizer.param_groups[0]['lr']}")

In [13]:
def log_sum_exp(tensor, dim):
  """Return log(sum(exp(tensor))) along `dim` (that dimension is removed).

  The per-slice maximum is subtracted before exponentiating and added back
  afterwards, which keeps the exponentials from overflowing.
  """
  peak = torch.max(tensor, dim)[0]
  shifted = tensor - peak.unsqueeze(dim).expand_as(tensor)
  return peak + torch.log(torch.sum(torch.exp(shifted), dim))

### Dataset Class

In [15]:
class WCDataset(Dataset):
  """Dataset over the tensors produced for the word/character tagger.

  All eight tensors are indexed along their first dimension, so they must
  have the same number of rows; the dataset length is `wmaps.size(0)`.
  """

  def __init__(self, wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps, wmap_lengths, cmap_lengths):
    self.wmaps = wmaps
    self.cmaps_f = cmaps_f
    self.cmaps_b = cmaps_b
    self.cmarkers_f = cmarkers_f
    self.cmarkers_b = cmarkers_b
    self.tmaps = tmaps
    self.wmap_lengths = wmap_lengths
    # Must be spelled exactly as __getitem__ reads it; the previous
    # 'cmap_lenghts' typo made every item access raise AttributeError.
    self.cmap_lengths = cmap_lengths
    self.data_size = self.wmaps.size(0)

  def __getitem__(self, i):
    """Return row `i` of each tensor, in constructor-argument order."""
    return self.wmaps[i], self.cmaps_f[i], self.cmaps_b[i], self.cmarkers_f[i], self.cmarkers_b[i], self.tmaps[i], self.wmap_lengths[i], self.cmap_lengths[i]

  def __len__(self):
    return self.data_size

### Testing Dynamic RNN

In [17]:
# Toy batch: three sequences of different lengths, right-padded to length 7
# with the pad index 25.
seqs = torch.LongTensor([[0,1,2,3,25,25,25],
                         [4,5,25,25,25,25,25],
                         [6,7,8,9,10,11,25]])

In [19]:
# Unpadded length of each row of seqs (number of non-pad entries)
seq_lens = torch.LongTensor([4,2,6])

In [20]:
# Sort the lengths in decreasing order and reorder the rows of seqs with the
# same permutation so lengths and sequences stay aligned.
seq_lens, sort_ind = seq_lens.sort(dim = 0, descending=True)
seqs = seqs[sort_ind]

In [23]:
# 26 ids (0-25) embedded in 10 dimensions; id 25 is declared as the padding index
embeds = nn.Embedding(26,10, padding_idx=25)

In [24]:
# Unidirectional LSTM: input size 10 (the embedding size), hidden size 50, batch-first tensors
lstm = nn.LSTM(10, 50, bidirectional=False, batch_first=True)

In [27]:
# Without dynamic batching: the LSTM runs over the full padded tensor,
# pad positions included.
embeddings = embeds(seqs)
out_static, _ = lstm(embeddings)

In [28]:
# The static output keeps the padded time dimension of the input
assert out_static.size(1) == embeddings.size(1)

In [29]:
# Last time step of row 1 (a sequence of length 4), i.e. a pad position:
# the static LSTM still produced a non-zero output there.
out_static[1,-1]

tensor([-0.0119,  0.0027,  0.0061,  0.0786,  0.1278, -0.0046,  0.0359,  0.0562,
         0.0026,  0.0846, -0.0501, -0.0274, -0.0561, -0.0109, -0.0523, -0.0353,
         0.0968, -0.0874, -0.0330,  0.1150,  0.0598, -0.0217, -0.0538, -0.0602,
        -0.1130,  0.0985,  0.0642, -0.0596, -0.0416, -0.0739, -0.0075, -0.0026,
        -0.1013, -0.0197, -0.0187,  0.0071,  0.0074, -0.0375,  0.1002, -0.0072,
        -0.0076, -0.0448,  0.0792,  0.0015, -0.0398,  0.0789,  0.0195, -0.0549,
        -0.0794, -0.0272], grad_fn=<SelectBackward>)

In [33]:
# With dynamic batching: pack the embedded batch together with the sorted
# true lengths.
packed_seqs = pack_padded_sequence(embeddings, seq_lens.tolist(), batch_first=True)

In [34]:
# Run the same LSTM on the packed input, then unpack the result back into a
# padded batch-first tensor; `lens` receives the lengths returned by the unpacking.
out_dynamic, _ = lstm(packed_seqs)
out_dynamic, lens = pad_packed_sequence(out_dynamic, batch_first=True)

In [37]:
# The unpacked output no longer has the original padded time dimension
assert out_dynamic.size(1) != embeddings.size(1)

In [38]:
# The time dimension is now the length of the longest sequence (6), not the
# original padded length (7)
out_dynamic.shape

torch.Size([3, 6, 50])

In [39]:
# Same row as inspected for the static run, at its last (pad) time step:
# with packing the output at this position is all zeros.
out_dynamic[1,-1]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], grad_fn=<SelectBackward>)

In [40]:
# The id tensor after sorting by length (longest sequence first)
seqs

tensor([[ 6,  7,  8,  9, 10, 11, 25],
        [ 0,  1,  2,  3, 25, 25, 25],
        [ 4,  5, 25, 25, 25, 25, 25]])

In [41]:
# Pack the raw id tensor itself to make the packed layout visible. Element 0
# is the flattened data: the ids are laid out time step by time step across
# the batch, with the pad id (25) left out.
# NOTE: this rebinds packed_seqs, which previously held the packed embeddings.
packed_seqs = pack_padded_sequence(seqs, seq_lens, batch_first=True)
packed_seqs[0]

tensor([ 6,  0,  4,  7,  1,  5,  8,  2,  9,  3, 10, 11])

In [42]:
# Element 1 holds the batch sizes: how many sequences of seqs are still
# active (not yet at a pad) at each time step.
packed_seqs[1]

tensor([3, 3, 2, 2, 1, 1])