# Autoencoder

In [2]:
import os
import glob
import pickle
import logging
from datetime import datetime
from tqdm import tqdm
tqdm.monitor_interval = 0
import pandas as pd

# The log file is opened in append mode inside 'logs/'. Create the directory
# first: logging.basicConfig raises FileNotFoundError if it does not exist.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/autoencoder.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from masked_cross_entropy import *

# Device passed to torch.zeros() by the init_hidden() helpers of the models below.
device = torch.device('cpu')

# Presumably the directory where learned embeddings are written; it is not
# referenced anywhere in the visible part of this file -- TODO confirm.
OUT_DIR = 'embeddings'

In [3]:
# Size used for both the embedding and the LSTM hidden state.
dim = 8
# Language sub-directory under 'wikipedia/'.
lg = 'en'
# Sort the matches so the file order (and therefore which sentences end up in
# the demo batch) is reproducible; raw glob order depends on the filesystem.
# A list can also be iterated more than once, unlike the bare iglob iterator.
fnames = sorted(glob.iglob(os.path.join('wikipedia', lg, 'unk-articles/1111*.txt')))
fname = os.path.join('wikipedia', lg, 'unk-metadata.pkl')
# NOTE(review): pickle.load can execute arbitrary code -- only load metadata
# files that come from a trusted source.
with open(fname, 'rb') as f:
    obj = pickle.load(f)
word2id = obj['word2id']
# Append the special tokens after the regular vocabulary. setdefault keeps an
# existing id if the token is already present; unconditionally assigning
# len(word2id) in that case would hand out a duplicate or out-of-range id.
for token in ('SOS', 'EOS', 'PAD'):
    word2id.setdefault(token, len(word2id))
def read():
    """Yield each line of every file in the module-level ``fnames``.

    Files are opened as UTF-8 and every line is yielded with leading and
    trailing whitespace removed (blank lines are yielded as empty strings).
    """
    for path in fnames:
        with open(path, encoding='utf-8') as handle:
            yield from map(str.strip, handle)
# Materialise every line of the corpus files in memory.
sentences = [*read()]
# Ids of the special tokens registered in word2id.
SOS, EOS, PAD = (word2id[name] for name in ('SOS', 'EOS', 'PAD'))
# Total vocabulary size, special tokens included.
vocab_size = len(word2id)

def indices_from_sentence(sentence):
    """Map a space-separated sentence to its list of vocabulary ids.

    Empty tokens (produced by consecutive spaces) are skipped. A word that is
    missing from the module-level ``word2id`` raises ``KeyError``.
    """
    words = filter(None, sentence.split(' '))
    return [word2id[token] for token in words]
# Above is all the Trainer stuff

def pad_seq(seq, max_length, pad_value=None):
    """Return a copy of ``seq`` right-padded to ``max_length`` entries.

    Args:
        seq: Sequence of token ids. It is not modified.
        max_length: Target length. A sequence that is already at least this
            long is returned unchanged (as a new list); it is never truncated.
        pad_value: Value used for padding. Defaults to the module-level
            ``PAD`` id.

    Returns:
        A new list holding the items of ``seq`` followed by the padding.
    """
    if pad_value is None:
        pad_value = PAD
    # Build a new list instead of ``seq += ...`` so the caller's sequence is
    # not silently extended in place. A negative count yields no padding.
    return list(seq) + [pad_value] * (max_length - len(seq))

# prepare batch
BATCH_SIZE = 3
# Collect the first BATCH_SIZE non-empty sentences. Blank lines tokenize to an
# empty list, and a zero-length sequence makes pack_padded_sequence raise.
batch = []
for sentence in sentences:
    tokens = indices_from_sentence(sentence)
    if tokens:
        batch.append(tokens)
    if len(batch) == BATCH_SIZE:
        break
if len(batch) < BATCH_SIZE:
    raise ValueError(
        'need %d non-empty sentences to build a batch, found %d'
        % (BATCH_SIZE, len(batch))
    )
# pack_padded_sequence expects the batch sorted by decreasing length.
batch.sort(key=len, reverse=True)
# Record the true lengths before padding.
batch_lengths = [len(s) for s in batch]
max_length = max(batch_lengths)
padded_batch = [pad_seq(s, max_length) for s in batch]
# Shape (max_length, BATCH_SIZE): time-major, as the LSTMs below expect.
# Variable() is a no-op wrapper in the torch versions that provide
# torch.device, so the tensor is used directly.
input_var = torch.LongTensor(padded_batch).transpose(0, 1)
# end prepare batch

class Encoder(nn.Module):
    """Embed token ids and encode them with a single-layer LSTM.

    Args:
        input_size: Number of distinct token ids (embedding rows).
        hidden_size: Size of both the embedding and the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded, time-major batch.

        Args:
            input_seq: LongTensor of token ids, shape (seq_len, batch).
            input_lengths: True length of each sequence, in decreasing order
                (required by ``pack_padded_sequence`` with its defaults).
            hidden: Optional initial ``(h, c)`` state for the LSTM.

        Returns:
            ``(output_seq, hidden)``: the re-padded LSTM outputs and the
            final ``(h, c)`` state.
        """
        packed_input = pack_padded_sequence(self.embedding(input_seq), input_lengths)
        packed_output, hidden = self.lstm(packed_input, hidden)
        # The lengths returned by pad_packed_sequence are not needed here.
        output_seq, _ = pad_packed_sequence(packed_output)
        return output_seq, hidden

    def init_hidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)

class Decoder(nn.Module):
    """Single-step LSTM decoder that emits log-probabilities over the vocabulary.

    Args:
        hidden_size: Size of both the embedding and the LSTM hidden state.
        output_size: Vocabulary size (number of output classes).
        batch_size: Batch size used by ``init_hidden``.
    """

    def __init__(self, hidden_size, output_size, batch_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Normalise over the vocabulary (last) axis. The projected output is
        # (1, batch, vocab), so dim=1 would normalise across the batch and
        # couple unrelated samples.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_seq, last_hidden):
        """Run one decoding step.

        Args:
            input_seq: LongTensor holding one token id per batch element.
            last_hidden: ``(h, c)`` LSTM state from the previous step.

        Returns:
            ``(output, hidden)``: log-probabilities of shape
            (1, batch, output_size) and the new ``(h, c)`` state.
        """
        # S=1 x B x N. The batch size is inferred from the input, so a batch
        # of a different size than ``self.batch_size`` also works.
        embedded = self.embedding(input_seq).view(1, -1, self.hidden_size)
        rnn_output, hidden = self.lstm(embedded, last_hidden)
        output = self.softmax(self.out(rnn_output))
        return output, hidden

    def init_hidden(self):
        """Return a zero tensor of shape (1, batch_size, hidden_size) on ``device``."""
        return torch.zeros(1, self.batch_size, self.hidden_size, device=device)

In [7]:
# Learning rate shared by both SGD optimizers.
lr = 0.01
encoder = Encoder(vocab_size, dim)
decoder = Decoder(dim, vocab_size, BATCH_SIZE)
# One plain-SGD optimizer per network; start from cleared gradients.
encoder_optimizer = optim.SGD(encoder.parameters(), lr=lr)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=lr)
for optimizer in (encoder_optimizer, decoder_optimizer):
    optimizer.zero_grad()
# Encode the prepared batch; the LSTM state comes back as an (h, c) pair.
encoder_outputs, encoder_state = encoder(input_var, batch_lengths)
encoder_hidden, encoder_cell = encoder_state

In [8]:
# Every sequence in the batch starts decoding from the SOS token.
decoder_input = Variable(torch.LongTensor([SOS for _ in range(BATCH_SIZE)]))
# Buffer that collects the decoder output of every time step.
all_decoder_outputs = Variable(torch.zeros(max_length, BATCH_SIZE, vocab_size))
# The decoder starts from the encoder's final hidden state (first layer
# slice), while its cell state starts from decoder.init_hidden() (zeros).
decoder_hidden = encoder_hidden[0:1]
decoder_cell = decoder.init_hidden()

In [9]:
# Decode one time step at a time. Each step is fed the ground-truth token of
# the current position as the next input (teacher forcing).
for t in range(max_length):
    decoder_state = (decoder_hidden, decoder_cell)
    decoder_output, decoder_state = decoder(decoder_input, decoder_state)
    decoder_hidden, decoder_cell = decoder_state
    all_decoder_outputs[t] = decoder_output
    decoder_input = input_var[t]

In [None]:
# Swap the time and batch axes of both the predictions and the targets before
# handing them to masked_cross_entropy together with the true lengths.
# NOTE(review): masked_cross_entropy is a project-local helper; its expected
# argument types are not visible here -- confirm it accepts a plain list of lengths.
loss = masked_cross_entropy(
    torch.transpose(all_decoder_outputs, 0, 1).contiguous(),
    torch.transpose(input_var, 0, 1).contiguous(),
    batch_lengths,
)