In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import os
from tqdm import tqdm_notebook, tqdm

import torch
import torch.nn as nn
from matplotlib import pyplot as plt

import torchtext
import collections

import re

### Verify if CUDA is available

In [2]:
# Report the CUDA devices visible to PyTorch, or state that there are none.
if not torch.cuda.is_available():
    print('No CUDA devices')
else:
    print('CUDA devices:')
    device_count = torch.cuda.device_count()
    for device in range(device_count):
        device_name = torch.cuda.get_device_name(device)
        print('\t{} - {}'.format(device, device_name))

CUDA devices:
	0 - GeForce RTX 2060


### Define encoder and decoder

In [3]:
class Encoder(nn.Module):
  """Single-layer, unidirectional LSTM that encodes a sequence of embeddings.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    encoding_size: size of the LSTM hidden state (the encoding).
    use_cuda: accepted for backward compatibility; it is not used by this class.
  """

  def __init__(self, embedding_size, encoding_size, use_cuda=False):
    super(Encoder, self).__init__()

    self.encoding_size = encoding_size

    # nn.LSTM applies dropout only *between* stacked layers, so with
    # num_layers=1 a non-zero value has no effect and only triggers a
    # UserWarning. dropout=0.0 leaves the computation and the state_dict
    # keys (and therefore saved checkpoints) unchanged.
    self.encoder = nn.LSTM(
      input_size=embedding_size,
      hidden_size=encoding_size,
      num_layers=1,
      bias=True,
      batch_first=True,
      dropout=0.0,
      bidirectional=False
    )

  def forward(self, embeddings):
    """Encode `embeddings` and return the LSTM's final states.

    Args:
      embeddings: tensor laid out as (batch, sequence, embedding_size),
        because the LSTM is built with batch_first=True.

    Returns:
      A `(hidden, memory)` tuple with the final hidden state and the final
      cell state, each of shape (1, batch, encoding_size). The per-step
      outputs of the LSTM are discarded.
    """
    _, (hidden, memory) = self.encoder(embeddings)
    return hidden, memory

In [4]:
class Decoder(nn.Module):
  """Single-layer LSTM decoder that maps its outputs back to embedding space.

  Args:
    encoding_size: size of the LSTM hidden state (must match the encoder's
      encoding size, since the encoder states initialise this LSTM).
    embedding_size: size of the input embeddings and of the produced outputs.
    use_cuda: accepted for backward compatibility; it is not used by this class.
  """

  def __init__(self, encoding_size, embedding_size, use_cuda=False):
    super(Decoder, self).__init__()

    self.encoding_size = encoding_size

    # nn.LSTM applies dropout only *between* stacked layers, so with
    # num_layers=1 a non-zero value has no effect and only triggers a
    # UserWarning. dropout=0.0 leaves the computation and the state_dict
    # keys (and therefore saved checkpoints) unchanged.
    self.decoder = nn.LSTM(
      input_size=embedding_size,
      hidden_size=encoding_size,
      num_layers=1,
      bias=True,
      batch_first=True,
      dropout=0.0,
      bidirectional=False
    )

    # Projects each LSTM output back to embedding_size; Tanh bounds it to [-1, 1].
    self.dim_linear = nn.Linear(encoding_size, embedding_size)
    self.dim_fn = nn.Tanh()

  def forward(self, embeddings, init_hidden, init_memory):
    """Run the decoder LSTM from the given initial states.

    Args:
      embeddings: input tensor laid out as (batch, sequence, embedding_size),
        because the LSTM is built with batch_first=True.
      init_hidden: initial hidden state, shape (1, batch, encoding_size).
      init_memory: initial cell state, shape (1, batch, encoding_size).

    Returns:
      A `(predictions, hidden, memory)` tuple: the tanh of the linear
      projection of every LSTM output, shape (batch, sequence,
      embedding_size), followed by the LSTM's final hidden and cell states.
    """
    output_, (hidden, memory) = self.decoder(embeddings, (init_hidden, init_memory))
    linear = self.dim_linear(output_)

    return self.dim_fn(linear), hidden, memory

### Defining functions related to transforming data

In [5]:
# Transform each sentence by adding a start and an end marker for encoder and decoder training
def get_transformation(vocabulary, embedding_size, max_length=15):
    """Build a function that turns a list of word indices into a batch of embeddings.

    Args:
        vocabulary: object whose `vectors` attribute can be indexed by word
            index to obtain that word's embedding tensor.
        embedding_size: length of each embedding; used to build the markers.
        max_length: maximum number of rows kept, counting the start and end
            markers. Defaults to 15 (the previously hard-coded value); pass
            `None` to disable truncation.

    Returns:
        `transform_example(example)`, which returns a tensor of shape
        (1, n, embedding_size) with n <= max_length. Row 0 is the start marker
        (all ones); the end marker (all zeros) is the last row unless the
        sequence was truncated.
    """
    first_embedding = torch.ones((embedding_size,))
    last_embedding = torch.zeros((embedding_size,))

    def transform_example(example):
        rows = [first_embedding]
        rows.extend(vocabulary.vectors[idx] for idx in example)
        rows.append(last_embedding)

        # Truncation is applied after the markers are added, so a sentence with
        # more than max_length - 2 words loses its end marker (original behaviour).
        transformed = torch.stack(rows)[:max_length]

        # Add the leading batch dimension expected by the batch_first LSTMs.
        return transformed.unsqueeze(0)

    return transform_example

In [6]:
# Get word indices from predicted word vectors
def get_index_fn(vectors):
    """Build a nearest-neighbour lookup over the rows of `vectors`.

    Args:
        vectors: 2-D tensor with one embedding per row.

    Returns:
        `get_index(prediction)`, which iterates over the first dimension of
        `prediction` and, for each item, returns the index of the row of
        `vectors` with the smallest Euclidean distance to it. The result is a
        1-D long tensor with one index per item; an empty `prediction` yields
        an empty tensor instead of raising inside `torch.stack`.
    """
    def get_index(prediction):
        # The L2 norm is sign-independent, so no torch.abs() is needed first.
        indices = [
            torch.argmin((vectors - vector).norm(2, dim=1))
            for vector in prediction
        ]

        if not indices:
            # torch.stack([]) raises; return an empty index tensor instead.
            return torch.empty(0, dtype=torch.long, device=vectors.device)

        return torch.stack(indices)

    return get_index

In [7]:
# Recover a sentence from a sequence of word indices
def get_text_fn(itos):
    """Build a function that joins the words looked up in `itos`.

    `itos` is indexed with each element of the example; the resulting words
    are joined with single spaces.
    """
    def get_text(example):
        return ' '.join(itos[word_idx] for word_idx in example)
    return get_text

In [8]:
# This function returns the vocabulary indices of the tokens in a string
def get_example_fn(stoi):
    """Build a tokenizer that maps a string to a list of vocabulary indices.

    Args:
        stoi: mapping from token to index; it must contain '<unk>' if any
            token of the input can be missing from it.

    Returns:
        `get_example(string)`, which splits the string on whitespace, keeps
        each listed punctuation mark as its own token, and returns the index
        of every token (the '<unk>' index for tokens not in `stoi`). A literal
        '#' in the input acts as a separator and is dropped.
    """
    # Wrap every whitespace character and listed punctuation mark in '#'.
    delimiter_pattern = re.compile(r'(\s|\-|\]|\^|\$|\*|\.|\\|\'|,|")')
    # Collapse runs of '#' and whitespace into one separator. Raw string:
    # '\s' in a normal string literal is an invalid escape sequence.
    separator_pattern = re.compile(r'(#|\s)+')

    def get_example(string):
        marked = delimiter_pattern.sub(r'#\1#', string)
        parts = separator_pattern.sub('#', marked).split('#')

        example = []
        for part in parts:
            # Splitting leaves '' at the edges when the string starts or ends
            # with a delimiter (or is empty); those are not tokens and must
            # not be turned into spurious '<unk>' entries.
            if not part:
                continue

            if part in stoi:
                example.append(stoi[part])
            else:
                example.append(stoi['<unk>'])

        return example
    return get_example

### Define variables related to loading information and training/validation data

In [9]:
# Number of pre-trained word vectors to load (passed as `max_vectors` when loading FastText).
VECTORS_LOADED = 20000

### Loading word vectors and training/validation dataset

In [10]:
# Load the first VECTORS_LOADED English FastText vectors, caching the files on disk.
fasttext = torchtext.vocab.FastText(
    language='en',
    max_vectors=VECTORS_LOADED,
    cache='../.vector_cache',
)

In [11]:
# Build a vocabulary from the FastText tokens (each counted once) and attach their vectors.
fasttext_token_counts = collections.Counter(fasttext.stoi.keys())
vocabulary = torchtext.vocab.Vocab(fasttext_token_counts)
vocabulary.set_vectors(
    fasttext.stoi,
    fasttext.vectors,
    fasttext.dim,
)

### Defining variables related to training

In [12]:
# Model dimensions: embeddings have the FastText vector size, encodings are 1024 wide.
EMBEDDING_SIZE = fasttext.dim
ENCODING_SIZE = 1024

# Run on the GPU whenever one is available.
USE_CUDA = torch.cuda.is_available()

### Load encoder and decoder

In [13]:
# Restore the trained weights and switch both modules to evaluation mode.
#
# map_location='cpu' makes the checkpoints loadable on a machine without CUDA
# even if they were saved from GPU tensors; the modules are moved to the GPU
# afterwards when one is available.
#
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
encoder = Encoder(EMBEDDING_SIZE, ENCODING_SIZE, use_cuda=USE_CUDA)
encoder.load_state_dict(torch.load('checkpoints/encoder.pt', map_location='cpu'))
encoder.eval()
decoder = Decoder(ENCODING_SIZE, EMBEDDING_SIZE, use_cuda=USE_CUDA)
decoder.load_state_dict(torch.load('checkpoints/decoder.pt', map_location='cpu'))
decoder.eval()

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()


In [14]:
# Helpers, in the order the pipeline uses them:
# text -> word indices -> embeddings, then predicted embeddings -> word indices -> text.
get_example = get_example_fn(vocabulary.stoi)
transformation_fn = get_transformation(vocabulary, EMBEDDING_SIZE)
get_indices = get_index_fn(vocabulary.vectors)
get_text = get_text_fn(vocabulary.itos)

### Predicting (Recovering)

In [15]:
# Sentence that is encoded below and then recovered with the decoder.
ORIGINAL_TEXT = 'He has called on authorities in Iran to restore the internet service'

#### Encoding

In [16]:
# Lower-case the sentence, map it to vocabulary indices and then to a batch of embeddings.
example = get_example(ORIGINAL_TEXT.lower())
embeddings = transformation_fn(example)
if USE_CUDA:
    embeddings = embeddings.cuda()

# Position 0 holds the start marker added by the transformation; the encoder
# is fed everything after it. No gradients are needed for inference.
with torch.no_grad():
    representation, memory = encoder(embeddings[:, 1:])

Here, `representation` contains the encoded sentence (the encoder's final hidden state).

In [17]:
# Display the encoder's final hidden state for the sentence.
representation

tensor([[[ 0.0029, -0.0010, -0.0162,  ..., -0.2869, -0.0137, -0.6091]]],
       device='cuda:0')

#### Decoding

In [18]:
# Autoregressive decoding: start from the all-ones start marker and the
# encoder's final states, then feed each predicted embedding back in as the
# next input.

# Number of decoding steps; 15 is the same length the transformation function
# truncates its sequences to.
MAX_DECODING_STEPS = 15

embedding = torch.ones((1, 1, EMBEDDING_SIZE))
hidden = representation
mem = memory
decodings = []

if USE_CUDA:
    embedding = embedding.cuda()
    hidden = hidden.cuda()
    mem = mem.cuda()

# Inference only: without no_grad() every step would extend an autograd graph
# through the decoder parameters, wasting memory and leaving `decodings`
# attached to that graph.
with torch.no_grad():
    for prediction_idx in range(MAX_DECODING_STEPS):
        embedding, hidden, mem = decoder(embedding, hidden, mem)
        decodings.append(embedding[0])

decodings = torch.stack(decodings)


In [19]:
# Map every decoded embedding to its nearest vocabulary vector, then join the words.
decoded_indices = get_indices(decodings.cpu())
get_text(decoded_indices)

'and has called on authorities in iran to but the the internet service <pad> and'