In [1]:
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2
# The static inline backend is disabled in favour of the interactive notebook backend.
#%matplotlib inline
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
# Show which matplotlib backend is active after the magic above.
print(plt.get_backend())

nbAgg


In [2]:
from torch.nn.functional import normalize
import torch.optim as optim
import torch
from abae_pytorch.train import max_margin_loss, orthogonal_regularization, plotter
import collections
import matplotlib.cm as cm
import numpy as np
import tqdm


def sample_aspects(projection, i2w, n=8):
    """Print, for every row of `projection`, the words with the `n` largest scores.

    Args:
        projection: 2-D tensor; each row is sorted independently along dim 1.
        i2w: mapping from column index to the word printed for it.
        n: how many of the highest-scoring columns to list per row.

    Each row is printed as ``Aspect <k>: w1, w2, ...`` with k starting at 1 and
    the words listed in ascending score order (best word last).
    """
    _, ranked_columns = torch.sort(projection, dim=1)
    for aspect_number, row_order in enumerate(ranked_columns, start=1):
        # Ascending sort puts the strongest columns at the tail of the row.
        top_columns = row_order[-n:].detach().cpu().numpy()
        listing = ', '.join(i2w[column] for column in top_columns)
        print('Aspect %2d: %s' % (aspect_number, listing))


def validate(ab, dl, device='cuda', split='val', epochsize=100, batchsize=100, negsize=20, ortho_reg=0.1):
    """Estimate the mean regularized loss of `ab` over `epochsize` batches of a split.

    Args:
        ab: model called as ``ab(pos, neg)`` returning ``(r_s, z_s, z_n)`` and
            exposing ``ab.T.weight`` for the orthogonality penalty.
        dl: data loader providing
            ``batch_generator(split, batchsize=..., negsize=..., device=...)``.
        device: device string forwarded to the batch generator.
        split: name of the split to draw batches from.
        epochsize: number of batches to evaluate.
        batchsize: batch size forwarded to the generator; also scales the penalty.
        negsize: negative-sample count forwarded to the generator.
        ortho_reg: weight of the orthogonal-regularization term.

    Returns:
        Mean of the per-batch values ``J + ortho_reg * batchsize * U``, the same
        objective that `train` optimizes.
    """
    losses = []
    batches = dl.batch_generator(split, batchsize=batchsize, negsize=negsize, device=device)
    # No backward pass happens here, so disable autograd tracking: the forward
    # results are identical but no graph is built or kept alive per batch.
    with torch.no_grad():
        with tqdm.tqdm(range(epochsize), total=epochsize, desc='validating') as pbar:
            for b in pbar:
                pos, neg = next(batches)
                r_s, z_s, z_n = ab(pos, neg)
                J = max_margin_loss(r_s, z_s, z_n).item()
                U = orthogonal_regularization(ab.T.weight).item()
                losses.append((J + ortho_reg * batchsize * U))
                x = (b + 1, np.mean(losses))
                pbar.set_description('VAL BATCH: %d | MEAN-VAL-LOSS: %0.5f' % x)
    return np.mean(losses)


def train(ab, dl, device='cuda',
          epochs=5, epochsize=100, initial_lr=0.02, batchsize=100, negsize=20, ortho_reg=0.1):
    """Train `ab` with Adam, validating, printing aspects and plotting after each epoch.

    Args:
        ab: model called as ``ab(pos, neg)`` returning ``(r_s, z_s, z_n)``; must
            expose ``parameters()``, ``T.weight`` and ``aspects()``.
        dl: data loader providing ``batch_generator(...)`` and a ``w2i`` mapping
            from word to index.
        device: device string forwarded to the batch generator.
        epochs: number of training epochs.
        epochsize: number of batches per epoch (also used for validation).
        initial_lr: starting learning rate, decayed linearly towards 0 over
            ``epochs * epochsize`` optimizer steps.
        batchsize: batch size; also scales the orthogonality penalty.
        negsize: negative-sample count forwarded to the generator.
        ortho_reg: weight of the orthogonal-regularization term.

    Returns:
        None. Progress is reported through tqdm, printed aspects and the plot.
    """
    batches = dl.batch_generator('train', batchsize=batchsize, negsize=negsize, device=device)
    # Invert the word->index vocabulary so aspect indices can be printed as words.
    i2w = dict((dl.w2i[w], w) for w in dl.w2i)

    opt = optim.Adam(ab.parameters(), lr=initial_lr)

    plot = plotter()

    epoch_losses = collections.defaultdict(list)

    def _report(train_loss):
        """Validate, record both losses, print the current aspects and redraw the plot."""
        val_loss = validate(ab, dl, device, 'val', epochsize, batchsize, negsize, ortho_reg)
        epoch_losses['Training Loss'].append(train_loss)
        epoch_losses['Validation Loss'].append(val_loss)
        sample_aspects(ab.aspects(), i2w)
        plot(epoch_losses)

    # Baseline before any update; there is no training loss yet, hence the inf placeholder.
    _report(float('inf'))

    total_steps = epochs * epochsize

    for e in range(epochs):
        train_losses = []
        with tqdm.trange(epochsize) as pbar:
            for b in pbar:
                pos, neg = next(batches)
                r_s, z_s, z_n = ab(pos, neg)
                J = max_margin_loss(r_s, z_s, z_n)
                U = orthogonal_regularization(ab.T.weight)
                loss = J + ortho_reg * batchsize * U
                opt.zero_grad()
                loss.backward()
                opt.step()

                train_losses.append(loss.item())
                # Show the running mean so the value matches its MEAN-TRAIN-LOSS
                # label (and the way validate() reports its mean).
                x = (e + 1, opt.param_groups[0]['lr'], np.mean(train_losses))
                d = 'TRAIN EPOCH: %d | LR: %0.5f | MEAN-TRAIN-LOSS: %0.5f' % x
                pbar.set_description(d)

                # Refresh the rate whenever the samples seen so far in this epoch
                # (b * batchsize) are a multiple of 100.
                if b * batchsize % 100 == 0:
                    # Linear decay over the global step count. Using the product
                    # (e + 1) * (b + 1) as progress would reset the rate upward at
                    # the start of every epoch instead of decaying monotonically.
                    step = e * epochsize + (b + 1)
                    lr = initial_lr * (1.0 - 1.0 * step / total_steps)
                    for pg in opt.param_groups:
                        pg['lr'] = lr

        _report(np.mean(train_losses))

In [3]:
import os
import time
from abae_pytorch.data import dataloader, preprocess
from abae_pytorch import aspect_model


# Corpus selection: the commented-out paths are alternative datasets.
#data = './data/wiki_01'
#data = './data/beer.train.txt'
data = './data/restaurant.train.txt'
prep = data + '.prep'
# Run preprocessing only when the .prep file does not exist yet.
if not os.path.isfile(prep):
    preprocess(data, prep)


# Settings handed to aspect_model below.
# NOTE(review): min_count is presumably a minimum word frequency for the
# vocabulary — confirm against aspect_model.
min_count = 10
d_embed = 100
n_aspects = 10
device = 'cpu'
w2v = prep + '.w2v'
aspector = aspect_model(prep, w2v, min_count, d_embed, n_aspects, device)


# Report the sizes exposed by the model's w2v component.
x = (aspector.w2v.n_vocab, aspector.w2v.d_embed, aspector.w2v.n_aspects)
print('n_vocab: %d | d_embed: %d | n_aspects: %d' % x)


# Fraction assigned to each named split; passed to the dataloader.
split = {'train': 0.9, 'val': 0.05, 'test': 0.05}
with dataloader(aspector.w2v.w2i, prep, split=split) as dl:

    epochs = 20
    epochsize = 100
    batchsize = 100
    negsize = 20
    initial_lr = 0.001
    # ortho_reg is not passed, so train() falls back to its default of 0.1.
    # NOTE(review): the recorded output of this cell shows the call failing in
    # plotter() (abae_pytorch/train.py) with "NameError: name 'plt' is not defined".
    train(aspector.ab, dl, 
          device=device,
          epochs=epochs, 
          epochsize=epochsize,
          batchsize=batchsize,
          negsize=negsize,
          initial_lr=initial_lr)

preprocessing "./data/restaurant.train.txt": 100%|██████████| 281989/281989 [00:18<00:00, 15495.08it/s]
training: 162432it [00:01, 134259.84it/s]
training: 162432it [00:02, 73693.64it/s]
training: 162432it [00:02, 73951.99it/s]
training: 162432it [00:02, 75765.28it/s]
training: 162432it [00:02, 71514.02it/s]
training: 162432it [00:02, 75130.41it/s]


n_vocab: 8311 | d_embed: 100 | n_aspects: 10


Traceback (most recent call last):
  File "<ipython-input-3-51e6cb131f1e>", line 41, in <module>
    initial_lr=initial_lr)
  File "<ipython-input-2-d66d2e8cde61>", line 41, in train
    plot = plotter()
  File "/srv/cluster/abae/abae_pytorch/train.py", line 20, in plotter
    f, ax = plt.subplots(1, 1, figsize=figsize)
NameError: name 'plt' is not defined


# TODO
    preprocessing script for some known datasets
        num tag for preprocessing
    
    
    model wrap class

        given preprocessed data path
            train w2v models
                word embeddings trained on partitions too...
                optionally use different w2v training corpus
            initialize aspect matrix
                inferring n_aspects?
                    downweight specificity?

        given preprocessed data path, train abae model
        
        given sentences, provide aspect predictions

        save and load combinations of components


    break into package
        cli
        documentation
        setup.py
        requirements.txt

# TRASH 

In [None]:
# NOTE(review): `wd` is not defined in any cell visible in this notebook, so this
# scratch cell fails on a fresh kernel; it shells out to `wc` on wd.prep_path.
!{'wc %s' % wd.prep_path}

    class dedicated to making a structured dataset from a particular data source
        raw text -> preprocessing -> splitting
        creates a vocab
        vocab trains word embeddings

    class data loader which serves batches of training/evaluation data
        requires preprocessed text to serve
        requires predetermined vocab
        vocab requires word embeddings

    class model just the neural network parts

    class wrap model with interface
        training
        evaluation
        deployment

    cli script covering interface