In [6]:
import os
import glob
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
import numpy as np
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.utils import to_categorical, Sequence
from keras import optimizers

# --- Hyperparameters -------------------------------------------------------
# One autoencoder is trained for every entry in DIMS; the value is used both as
# the embedding size and as the LSTM state size.
DIMS = [100]
# Must equal the batch size recorded in the pre-batched data (checked by Trainer).
BATCH_SIZE = 32
# Applied as both input dropout and recurrent dropout of the LSTMs.
DROPOUT = 0.1
LR = 0.01
OPTIMIZER = optimizers.Adagrad(lr=LR)
EPOCHS = 1

# --- Input / output locations ---------------------------------------------
# Flip to True to use the FloydHub mount points instead of the working directory.
on_floyd = False
IN_DIR, OUT_DIR = ('/floyd/input/', '/output') if on_floyd else ('.', '.')

# --- Logging ---------------------------------------------------------------
# filemode='w' truncates the log file each time this cell is executed.
log_fname = os.path.join(OUT_DIR, 'autoencoder.log')
logging.basicConfig(
    filename=log_fname,
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)

In [13]:
class Trainer:
    """Drive autoencoder training for one language.

    On construction the pickled metadata stored under
    ``IN_DIR/wikipedia/<lg>/`` is read and the output directory
    ``OUT_DIR/embeddings/<lg>/`` is created. ``train`` then builds, fits and
    saves one ``Autoencoder`` per entry of ``DIMS``.
    """

    def __init__(self, lg):
        """Load metadata for ``lg`` and prepare input/output paths.

        Raises:
            AssertionError: if the batch size stored with the data differs
                from ``BATCH_SIZE``.
        """
        self.lg = lg
        stored_batch_size = self.read_batch_size()
        assert stored_batch_size == BATCH_SIZE, "Batch size in data doesn't match this notebook"
        # Integer division: a trailing partial batch is not counted as a step.
        self.steps_per_epoch = self.read_num_sents() // BATCH_SIZE
        self.word2id = self.read_word2id()
        self.vocab_size = len(self.word2id)
        # Glob pattern (not a concrete file) matching every pre-batched CSV.
        self.path_to_articles = os.path.join(
            IN_DIR, 'wikipedia', self.lg, 'batched-articles', '*.csv'
        )
        self.out_dir = os.path.join(OUT_DIR, 'embeddings', self.lg)
        os.makedirs(self.out_dir, exist_ok=True)

    def read_pickle(self, fname, key):
        """Return ``obj[key]`` where ``obj`` is unpickled from this language's ``fname``.

        NOTE: ``pickle.load`` can execute arbitrary code; only use it on
        trusted data files.
        """
        full_path = os.path.join(IN_DIR, 'wikipedia', self.lg, fname)
        with open(full_path, 'rb') as handle:
            return pickle.load(handle)[key]

    def read_num_sents(self):
        """Return the ``num_sents`` entry of ``metadata.pkl``."""
        return self.read_pickle('metadata.pkl', 'num_sents')

    def read_word2id(self):
        """Return the ``word2id`` entry of ``batched-metadata.pkl``."""
        return self.read_pickle('batched-metadata.pkl', 'word2id')

    def read_batch_size(self):
        """Return the ``batch_size`` entry of ``batched-metadata.pkl``."""
        return self.read_pickle('batched-metadata.pkl', 'batch_size')

    def train(self):
        """Train and save one autoencoder per dimension, logging the wall-clock time of each."""
        for dim in DIMS:
            started = datetime.now()
            model = Autoencoder(dim, self.word2id, self.path_to_articles, self.steps_per_epoch)
            model.train(self.steps_per_epoch)
            model.save(self.out_dir)
            elapsed = datetime.now() - started
            logging.info(
                'Training {} autoencoder with {} dimensions took {}'.format(self.lg, dim, elapsed)
            )

class MySequence(Sequence):
    """Keras ``Sequence`` that serves one pre-batched CSV file per training step.

    Args:
        path: glob pattern matching the batch CSV files.
        steps_per_epoch: value reported by ``len()``, i.e. the number of
            batches Keras will request per epoch.
        vocab_size: number of classes used to one-hot encode the targets.

    Batches are looked up by ``index`` in a sorted file list, so the same
    index always yields the same file. This matters because Keras may call
    ``__getitem__`` from several worker processes, each holding its own copy
    of this object; a shared iterator would hand the same files to every
    worker.
    """

    def __init__(self, path, steps_per_epoch, vocab_size):
        self.path = path
        self.steps_per_epoch = steps_per_epoch
        self.vocab_size = vocab_size
        self.fnames = self._list_files()

    def _list_files(self):
        """Return the files currently matching ``self.path`` in a stable order."""
        return sorted(glob.glob(self.path))

    def __len__(self):
        return self.steps_per_epoch

    def on_epoch_end(self):
        # Re-scan so files added between epochs are picked up.
        self.fnames = self._list_files()

    def __getitem__(self, index):
        """Load batch ``index`` and return ``([encoder_in, decoder_in], one_hot_targets)``.

        Raises:
            FileNotFoundError: if no file matches the glob pattern.
        """
        if not self.fnames:
            raise FileNotFoundError('No batch files match {!r}'.format(self.path))
        # Wrap around when more steps are requested than there are files.
        fname = self.fnames[index % len(self.fnames)]
        # ndmin=2 keeps a single-row file two-dimensional so the column slicing works.
        data = np.loadtxt(fname, delimiter=',', dtype=int, ndmin=2)
        # presumably each row is [SOS, tokens..., EOS, padding...] — TODO confirm
        encoder_input_data = data[:, 1:-1].copy()   # drop first and last column
        decoder_input_data = data[:, :-1].copy()    # drop last column
        decoder_target_data = data[:, 1:]           # decoder input shifted left by one
        decoder_target_data = to_categorical(decoder_target_data, self.vocab_size)
        return ([encoder_input_data, decoder_input_data], decoder_target_data)
    
class Autoencoder:
    """LSTM encoder-decoder trained to reproduce its input token sequences.

    Args:
        dim: size used for both the embeddings and the LSTM state.
        word2id: mapping from token to integer id; copied on construction.
            Must contain the key ``'EOS'``.
        path: glob pattern matching the pre-batched CSV files.
        steps_per_epoch: number of batches per epoch, passed to ``MySequence``.

    Attributes:
        model: full encoder-decoder model used for training.
        encoder: model mapping encoder inputs to the encoder's final hidden state.
    """

    def __init__(self, dim, word2id, path, steps_per_epoch):
        self.dim = dim
        self.word2id = word2id.copy()
        self.vocab_size = len(self.word2id)
        # generate_batches() reads self.path, so it has to be stored here.
        self.path = path
        self.sequence = MySequence(path, steps_per_epoch, self.vocab_size)
        self.model, self.encoder = self.build_models()

    def build_models(self):
        """Build and return ``(training_model, encoder_model)``.

        The decoder LSTM is initialised with the encoder's final ``[h, c]``
        state and followed by a softmax over the vocabulary. Only the training
        model is compiled.
        """
        encoder_inputs = Input(shape=(None,))
        encoder_embedding = Embedding(self.vocab_size, self.dim, mask_zero=True)(encoder_inputs)
        encoder = LSTM(self.dim, return_state=True, dropout=DROPOUT, recurrent_dropout=DROPOUT)
        encoder_outputs, h, c = encoder(encoder_embedding)
        decoder_inputs = Input(shape=(None,))
        decoder_embedding = Embedding(self.vocab_size, self.dim, mask_zero=True)(decoder_inputs)
        decoder = LSTM(self.dim, return_sequences=True, dropout=DROPOUT, recurrent_dropout=DROPOUT)
        decoder_outputs = decoder(decoder_embedding, initial_state=[h, c])
        dense = Dense(self.vocab_size, activation='softmax')
        decoder_outputs = dense(decoder_outputs)
        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[decoder_outputs])
        # NOTE(review): OPTIMIZER is one module-level instance reused by every model
        # built here — confirm that is intended when DIMS has several entries.
        model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy')
        encoder_model = Model(encoder_inputs, h)
        return model, encoder_model

    def indices_from_sentence(self, sent):
        """Return the ids of the space-separated words in ``sent`` followed by the ``'EOS'`` id.

        Empty tokens produced by repeated spaces are skipped.

        Raises:
            KeyError: if a word (or ``'EOS'``) is missing from ``word2id``.
        """
        return [self.word2id[w] for w in sent.split(' ') if w] + [self.word2id['EOS']]

    def generate_batches(self):
        """Yield ``([encoder_in, decoder_in], one_hot_targets)`` forever, one CSV file per batch.

        The file list is re-globbed at the start of every pass.

        Raises:
            FileNotFoundError: if a pass finds no file matching ``self.path``
                (otherwise the loop would spin forever without yielding).
        """
        while True:
            found_any = False
            for fname in glob.iglob(self.path):
                found_any = True
                # ndmin=2 keeps a single-row file two-dimensional.
                data = np.loadtxt(fname, delimiter=',', dtype=int, ndmin=2)
                # presumably each row is [SOS, tokens..., EOS, padding...] — TODO confirm
                encoder_input_data = data[:, 1:-1].copy()
                decoder_input_data = data[:, :-1].copy()
                decoder_target_data = data[:, 1:]
                decoder_target_data = to_categorical(decoder_target_data, self.vocab_size)
                yield ([encoder_input_data, decoder_input_data], decoder_target_data)
            if not found_any:
                raise FileNotFoundError('No batch files match {!r}'.format(self.path))

    def train(self, num_steps):
        """Fit the training model on ``self.sequence`` for ``EPOCHS`` epochs of ``num_steps`` steps."""
        self.model.fit_generator(self.sequence, steps_per_epoch=num_steps, epochs=EPOCHS, workers=4,
                                 use_multiprocessing=True)

    def encode(self, sentences):
        """Return a dict mapping each sentence to its flattened encoder hidden state.

        Sentences are encoded one at a time (batch of size one).
        """
        result = {}
        for s in sentences:
            encoder_input_data = self.indices_from_sentence(s)
            encoder_input_data = np.array([encoder_input_data])
            h = self.encoder.predict(encoder_input_data)
            result[s] = h.flatten()
        return result

    def save(self, dirname):
        """Save the encoder model (not the full training model) to ``dirname``."""
        fname = os.path.join(dirname, 'autoencoder-{}.model'.format(self.dim))
        self.encoder.save(fname)

    def load(self, dirname):
        """Replace ``self.encoder`` with the model previously saved in ``dirname`` for this ``dim``."""
        fname = os.path.join(dirname, 'autoencoder-{}.model'.format(self.dim))
        self.encoder = load_model(fname)

In [15]:
# Build the trainer for the 'en' data set. The constructor reads the pickled
# metadata under IN_DIR and creates the embeddings output directory.
t = Trainer('en')
# Training is left disabled here; uncomment the call below to start it.
#t.train()