In [1]:
import os
import glob
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
import numpy as np
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# logging.basicConfig opens the log file immediately, so the target directory
# has to exist first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/autoencoder.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Sizes of the autoencoders to train; each value is used as both the embedding
# width and the LSTM width.
DIMS = [5, 10]
# Number of sentences collected before a training batch is emitted.
BATCH_SIZE = 4
# Number of epochs passed to fit_generator for every autoencoder.
EPOCHS = 2

Using TensorFlow backend.


In [2]:
# Bind the Keras backend module to the alias K.
from keras import backend as K
# Query the TensorFlow backend for the GPUs it reports (the recorded output of
# this cell is an empty list).
# NOTE(review): `_get_available_gpus` is a private helper (leading underscore)
# reached through `tensorflow_backend`; presumably it is only available with the
# TensorFlow backend of this Keras version — confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

[]

In [6]:
class Trainer:
    """Train one autoencoder per size in ``DIMS`` for a single language.

    The vocabulary is read from ``wikipedia/<lg>/unk-metadata.pkl`` and extended
    with the special tokens ``PAD``, ``SOS`` and ``EOS``. Training sentences are
    read from ``wikipedia/<lg>/unk-articles/*.txt`` and the trained encoders are
    written to ``embeddings/<lg>``.
    """

    def __init__(self, lg, steps_per_epoch=100):
        """Load the vocabulary and derive the input/output locations.

        Args:
            lg: Language code used as a directory name under ``wikipedia`` and
                ``embeddings``.
            steps_per_epoch: Number of batches per epoch. Defaults to 100 (a
                short smoke-test run). Pass ``None`` for a real run to derive it
                from the corpus size as ``num_sents // BATCH_SIZE`` (at least 1).
        """
        self.lg = lg
        self.word2id = self.read_word2id()
        # NOTE(review): this assumes the loaded ids do not already use 0 for a
        # real word and that they are contiguous, so that len(word2id) is a
        # free id for SOS/EOS — verify against the file that builds word2id.
        self.word2id['PAD'] = 0
        self.word2id['SOS'] = len(self.word2id)
        self.word2id['EOS'] = len(self.word2id)
        self.vocab_size = len(self.word2id)
        self.path_to_articles = os.path.join('wikipedia', self.lg, 'unk-articles', '*.txt')
        self.out_dir = os.path.join('embeddings', self.lg)
        if steps_per_epoch is None:
            # Real run: one epoch covers the corpus once.
            steps_per_epoch = max(1, self.read_num_sents() // BATCH_SIZE)
        self.steps_per_epoch = steps_per_epoch

    def read_num_sents(self):
        """Return ``num_sents`` from ``wikipedia/<lg>/metadata.pkl``."""
        fname = os.path.join('wikipedia', self.lg, 'metadata.pkl')
        # SECURITY: pickle.load executes arbitrary code; only use trusted files.
        with open(fname, 'rb') as f:
            obj = pickle.load(f)
        return obj['num_sents']

    def read_word2id(self):
        """Return the ``word2id`` mapping from ``wikipedia/<lg>/unk-metadata.pkl``."""
        fname = os.path.join('wikipedia', self.lg, 'unk-metadata.pkl')
        # SECURITY: pickle.load executes arbitrary code; only use trusted files.
        with open(fname, 'rb') as f:
            obj = pickle.load(f)
        return obj['word2id']

    def train(self):
        """Train and save an autoencoder for every size in ``DIMS``.

        The elapsed time of each run (training plus saving) is written to the
        log.
        """
        # Create the output directory up front so a missing directory is
        # detected before the training run rather than after it.
        os.makedirs(self.out_dir, exist_ok=True)
        for d in DIMS:
            start = datetime.now()
            autoencoder = Autoencoder(d, self.word2id, self.path_to_articles)
            autoencoder.train(self.steps_per_epoch)
            autoencoder.save(self.out_dir)
            elapsed = datetime.now() - start
            logging.info('Training %s autoencoder with %s dimensions took %s',
                         self.lg, d, elapsed)
    
class Autoencoder:
    """LSTM sequence-to-sequence autoencoder over sequences of word ids.

    ``self.model`` is the full encoder/decoder network used for training;
    ``self.encoder`` shares the encoder layers and outputs the encoder LSTM
    state ``h``, which is what ``encode`` returns and ``save`` persists.
    """

    def __init__(self, dim, word2id, path):
        """Store the configuration and build the Keras models.

        Args:
            dim: Width used for both the embedding layers and the LSTMs.
            word2id: Mapping from token to integer id; must contain the keys
                ``'PAD'``, ``'SOS'`` and ``'EOS'``. A shallow copy is kept.
            path: Glob pattern matching the UTF-8 text files to train on, one
                sentence per line.
        """
        self.dim = dim
        self.word2id = word2id.copy()
        self.PAD = self.word2id['PAD']
        self.SOS = self.word2id['SOS']
        self.EOS = self.word2id['EOS']
        self.vocab_size = len(self.word2id)
        self.path = path
        self.model, self.encoder = self.build_models()

    def indices_from_sentence(self, sent):
        """Convert a space-separated sentence to word ids followed by EOS.

        Empty tokens (produced by consecutive spaces or an empty string) are
        skipped, so an empty sentence yields ``[EOS]``.

        Raises:
            KeyError: If a token is not present in ``word2id``.
        """
        ids = [self.word2id[w] for w in sent.split(' ') if w]
        ids.append(self.EOS)
        return ids

    def build_models(self):
        """Build the training model and the encoder-only model.

        Returns:
            A ``(model, encoder_model)`` tuple. ``model`` takes
            ``[encoder_inputs, decoder_inputs]`` and outputs a softmax over the
            vocabulary for every decoder step; ``encoder_model`` maps
            ``encoder_inputs`` to the encoder LSTM state ``h``.
        """
        # Encoder: embed the ids (id 0 is treated as padding and masked) and
        # keep only the final LSTM states.
        encoder_inputs = Input(shape=(None,))
        encoder_embedding = Embedding(self.vocab_size, self.dim, mask_zero=True)(encoder_inputs)
        encoder = LSTM(self.dim, return_state=True)
        encoder_outputs, h, c = encoder(encoder_embedding)
        # Decoder: starts from the encoder states and emits one vocabulary
        # distribution per time step.
        decoder_inputs = Input(shape=(None,))
        decoder_embedding = Embedding(self.vocab_size, self.dim, mask_zero=True)(decoder_inputs)
        decoder = LSTM(self.dim, return_sequences=True)
        decoder_outputs = decoder(decoder_embedding, initial_state=[h, c])
        dense = Dense(self.vocab_size, activation='softmax')
        decoder_outputs = dense(decoder_outputs)
        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[decoder_outputs])
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        encoder_model = Model(encoder_inputs, h)
        return model, encoder_model

    def _vectorize_batch(self, batch):
        """Turn a list of sentences into one ``(inputs, targets)`` training pair."""
        # Index every sentence once and derive all three arrays from it.
        indexed = [self.indices_from_sentence(sent) for sent in batch]
        encoder_input_data = pad_sequences(indexed, padding='post', value=self.PAD)
        # Decoder input is the sentence shifted right by a leading SOS ...
        decoder_input_data = pad_sequences([[self.SOS] + ids for ids in indexed],
                                           padding='post', value=self.PAD)
        # ... and the target gets one trailing PAD so both have equal length.
        decoder_target_data = pad_sequences([ids + [self.PAD] for ids in indexed],
                                            padding='post', value=self.PAD)
        decoder_target_data = to_categorical(decoder_target_data, self.vocab_size)
        return ([encoder_input_data, decoder_input_data], decoder_target_data)

    def generate_batches(self):
        """Yield training batches forever, cycling over the matched files.

        Files are visited in sorted order so repeated runs see the same batch
        sequence. Every ``BATCH_SIZE`` consecutive lines form one batch; lines
        left over at the end of a pass (fewer than ``BATCH_SIZE``) are dropped.

        Yields:
            ``([encoder_input, decoder_input], decoder_target)`` as produced by
            ``_vectorize_batch``.

        Raises:
            ValueError: If the glob pattern matches no files, or a full pass
                over the files does not contain enough lines for one batch.
                Without this check the generator would loop forever without
                yielding anything.
        """
        while True:
            fnames = sorted(glob.glob(self.path))
            if not fnames:
                raise ValueError('No training files match {!r}'.format(self.path))
            yielded = False
            batch = []
            for fname in fnames:
                with open(fname, encoding='utf-8') as f:
                    for line in f:
                        batch.append(line.strip())
                        if len(batch) == BATCH_SIZE:
                            yield self._vectorize_batch(batch)
                            yielded = True
                            batch = []
            if not yielded:
                raise ValueError(
                    'Files matching {!r} contain fewer lines than one batch'.format(self.path))

    def train(self, num_steps):
        """Fit the full model for ``EPOCHS`` epochs of ``num_steps`` batches each."""
        batches = self.generate_batches()
        self.model.fit_generator(batches, steps_per_epoch=num_steps, epochs=EPOCHS)

    def encode(self, sentences):
        """Encode each sentence with the encoder model.

        Args:
            sentences: Iterable of space-separated sentences.

        Returns:
            Dict mapping each sentence to the flattened encoder output.
            Duplicate sentences share a single entry.
        """
        result = {}
        for s in sentences:
            # The encoder is called on a batch holding this single sentence.
            encoder_input_data = np.array([self.indices_from_sentence(s)])
            h = self.encoder.predict(encoder_input_data)
            result[s] = h.flatten()
        return result

    def save(self, dirname):
        """Save the encoder model to ``<dirname>/autoencoder-<dim>.model``.

        The directory is created when it does not exist yet, so a missing
        directory no longer discards a finished training run.
        """
        os.makedirs(dirname, exist_ok=True)
        fname = os.path.join(dirname, 'autoencoder-{}.model'.format(self.dim))
        self.encoder.save(fname)

    def load(self, dirname):
        """Replace the encoder model with the one saved under ``dirname``."""
        fname = os.path.join(dirname, 'autoencoder-{}.model'.format(self.dim))
        self.encoder = load_model(fname)

In [7]:
# Build the trainer for the 'en' data directory and train one autoencoder per
# entry in DIMS.
t = Trainer(lg='en')
t.train()

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
