In [1]:
import zipfile
import torch

def read_data(filename):
    z = zipfile.ZipFile(filename, 'r')

    lines = []
    with z.open(z.namelist()[0]) as f:
        i = 0
        for line in f:
            if i % 100 == 0:
                line = line.decode('utf-8').lower().replace("'", " ").replace(".", "").replace("?", "")\
                    .replace("!", "").replace(":", "").replace(";", "")
                lines.append(line)
            i += 1

    z.close()
    return lines

In [6]:
from collections import Counter
import numpy as np
import torch
import pickle


class LanguageLoader(object):
    def __init__(self, input_path, output_path, vocab_size, max_length):
        super(LanguageLoader, self).__init__()

        self.vocab_size, self.max_length = vocab_size, max_length

        try:
            self.input_dict = pickle.load(open("data/input_dict.p", "rb"))
            self.input_vecs = pickle.load(open("data/input_vecs.p", "rb"))
            self.input_size = len(self.input_dict)

            self.output_dict = pickle.load(open("data/output_dict.p", "rb"))
            self.output_vecs = pickle.load(open("data/output_vecs.p", "rb"))
            self.output_size = len(self.output_dict)
            print("Languages found and loaded.")
        except(IOError):
            self.input_dict, self.input_vecs, self.input_size = self.init_language(input_path)
            pickle.dump(self.input_dict, open("data/input_dict.p", "wb"))
            pickle.dump(self.input_vecs, open("data/input_vecs.p", "wb"))
            print("Input language loaded.")

            self.output_dict, self.output_vecs, self.output_size = self.init_language(output_path)
            pickle.dump(self.output_dict, open("data/output_dict.p", "wb"))
            pickle.dump(self.output_vecs, open("data/output_vecs.p", "wb"))
            print("Output language loaded.")

        self.input_vecs, self.output_vecs = self.filter(self.input_vecs, self.output_vecs)

    def init_language(self, path):
        dictionary = ["<SOS>", "<EOS>", "<UNK>"]

        corpus = read_data(path)
        #corpus = ["the negative log likelihood loss", "it is useful to train a classification problem with n classes" "if provided, the optional argument weights should be a 1D Tensor assigning weight to each of the classes", "this is particularly useful when you have an unbalanced training set"]
        words = " ".join(corpus).split()
        mc = Counter(words).most_common(self.vocab_size-3)
        dictionary.extend([word for word, _ in mc])
        vectors = [[self.vectorize(word, dictionary) for word in sentence.split()] for sentence in corpus]

        return dictionary, vectors, len(dictionary)

    def sentences(self, amount):
        indeces = np.random.choice(len(self.input_vecs), amount)
        sentences = [(self.input_vecs[i], self.output_vecs[i]) for i in indeces]

        return sentences

    def sentence_to_vec(self, sentence):
        vectors = [self.vectorize(word, self.input_dict) for word in sentence.lower().split()]
        return vectors

    def vec_to_sentence(self, vectors):
        sentence = " ".join([self.output_dict[vec[0, 0]] for vec in vectors])
        return sentence

    def vectorize(self, word, list):
        vec = torch.LongTensor(1, 1).zero_()
        index = 2 if word not in list else list.index(word)
        vec[0][0] = index
        return vec

    def filter(self, input_vecs, output_vecs):
        i = 0
        for _ in input_vecs:
            if len(input_vecs[i]) > self.max_length or len(output_vecs[i]) > self.max_length:
                input_vecs.pop(i)
                output_vecs.pop(i)
            else:
                i += 1

        return input_vecs, output_vecs

In [20]:
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size=500, hidden_size=1000):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, 1)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden_state = self.gru(embedded, hidden)
        return output, hidden_state

    def first_hidden(self):
        return Variable(torch.FloatTensor(1, 1, self.hidden_size).zero_())

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size=500, hidden_size=1000):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, 1)
        self.linear = nn.Linear(hidden_size, input_size)
        self.softmax = nn.Softmax()

    def forward(self, input, hidden):
        input = Variable(input)
        embedded = self.embedding(input)
        output, hidden_state = self.gru(embedded, hidden)
        output = output.view(1, output.size(2))
        linear = self.linear(output)
        softmax = self.softmax(linear)
        return output, softmax, hidden_state
    
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch

class RNN(object):
    def __init__(self, input_size, output_size):
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)

        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())

        sos, eos = torch.LongTensor(1, 1).zero_(), torch.LongTensor(1, 1).zero_()
        sos[0, 0], eos[0, 0] = 0, 1

        self.sos, self.eos = sos, eos


    def train(self, input, target):
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        # Decoder
        target.insert(0, self.sos)
        target.append(self.eos)
        total_loss = 0
        
        for i in range(len(target) - 1):
            o, softmax, hidden_state = self.decoder.forward(target[i], hidden_state)
            print(o)
            print(softmax)
            print(torch.sum(softmax))
            print(gr)
            total_loss += self.loss(softmax, Variable(target[i+1][0]))

        total_loss.backward()

        self.decoder_optimizer.step()
        self.encoder_optimizer.step()

        return total_loss

    def eval(self, input):
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(ivec, hidden_state)

        outputs = []
        output = self.sos
        # Decoder
        while output is not self.eos:
            output, _, hidden_state = self.decoder.forward(output, hidden_state)
            outputs += output

        return outputs

In [21]:
en_path = 'data/en.zip'
fr_path = 'data/fr.zip'

max_length = 3
num_batches = 10
vocab_size = 15

def main():
    data = LanguageLoader(en_path, fr_path, vocab_size, max_length)
    rnn = RNN(data.input_size, data.output_size)

    losses = []
    iter = 0
    for input, target in data.sentences(num_batches):
        loss = rnn.train(input, target)
        if iter % 100 is 0:
            print(loss.data[0])
        iter += 1

main()

Languages found and loaded.
Variable containing:
( 0  ,.,.) = 
  0.0551  0.1808 -0.0945  ...  -0.1549 -0.0182  0.0851
[torch.FloatTensor of size 1x1x1000]

Variable containing:
 0.0633  0.1629  0.1551  ...   0.1290  0.3100  0.2841
[torch.FloatTensor of size 1x1000]

Variable containing:
 0.1535  0.1543  0.1757  0.1811  0.1476  0.1878
[torch.FloatTensor of size 1x6]

Variable containing:
 1
[torch.FloatTensor of size 1]



NameError: global name 'gr' is not defined