In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import baseline
from encoder import *
from baseline import *
from decoder import *
from alphabet import *
from train import *
import torch
from matplotlib import pyplot as plt

torch.__version__

'1.4.0'

In [2]:
LANGUAGE = 'Python'

def filter_ascii(strings, max_length=80):
    '''Returns the strings that are pure ASCII and at most `max_length` characters long.

    Args:
        strings: iterable of str to filter.
        max_length: longest string (in characters) that is kept. Defaults to
            80, the limit that used to be hard-coded here. Pass None to
            disable the length filter.

    Returns:
        A new list holding the accepted strings in their original order.
    '''
    kept = []
    for s in strings:
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII strings are dropped on purpose; this is not an error.
            continue
        if max_length is None or len(s) <= max_length:
            kept.append(s)
    return kept

# Parse the corpus first. The file handle is only needed by json.load, so it is
# closed before any filtering happens. JSON text is UTF-8, so the encoding is
# pinned instead of depending on the platform default.
with open('dataset/medium.json', encoding='utf-8') as f:
    multilang_dataset = json.load(f)

# Keep only the selected language and replace each split with its filtered version.
dataset = multilang_dataset[LANGUAGE]
dataset['train'] = filter_ascii(dataset['train'])
dataset['dev'] = filter_ascii(dataset['dev'])
dataset['test'] = filter_ascii(dataset['test'])

# Every split of the tiny dataset is the same first 50 training examples.
# NOTE(review): presumably intentional (overfitting sanity check) -- confirm.
tiny_dataset = {
    'train': dataset['train'][:50],
    'dev': dataset['train'][:50],
    'test': dataset['train'][:50],
}

print('{} training examples, {} validation examples, {} test examples'.format(
    len(dataset['train']),
    len(dataset['dev']),
    len(dataset['test'])))

76738 training examples, 9590 validation examples, 9616 test exampless


In [3]:
def generate_dumb_dataset(size=200, length=10):
    '''Returns a toy dataset of single-letter strings.

    Every example is one lowercase ASCII letter, drawn at random, repeated
    `length` times (e.g. 'ffffffffff'). Examples differ from one another only
    in which letter was drawn.

    Args:
        size: number of examples to generate (default 200).
        length: number of characters in every example (default 10).

    Returns:
        A dict with the keys 'train', 'dev' and 'test', all three referring
        to the same list object.
    '''
    examples = [random.choice('abcdefghijklmnopqrstuvwxyz') * length
                for _ in range(size)]
    return {'train': examples, 'dev': examples, 'test': examples}

dumb_dataset = generate_dumb_dataset()

In [5]:
# Use CUDA device 0 when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(0)
else:
    device = torch.device('cpu')
print(device)

# The alphabet is created on the chosen device and shared by the decoder and
# the neural encoder; the baseline encoder is built independently of it.
alphabet = AsciiOneHotEncoding(device)
encoder = baseline.UniformEncoder(0.9)
decoder = AutoCompleteDecoderModel(alphabet, hidden_size=64)
nencoder = NeuralEncoder(alphabet, epsilon=0.2, hidden_size=64)

cuda:0


In [146]:
def expected_initial_loss(input_string, epsilon, alphabet, lam):
    '''Evaluates len/2 + lam * (len * ln(alphabet size) - epsilon) for one string.

    Args:
        input_string: string whose length enters the formula.
        epsilon: value subtracted inside the lam-weighted term.
        alphabet: object exposing size(), the number of symbols.
        lam: weight applied to the second term.

    Returns:
        The value of the expression as a float.
    '''
    length = len(input_string)
    symbol_count = alphabet.size()
    weighted_term = lam * (length * math.log(symbol_count) - epsilon)
    return length / 2.0 + weighted_term

print(expected_initial_loss('ddddd',0.5,alphabet,10))

240.10151319598083


In [12]:
# Settings handed to train() as a single dict.
parameters = {
    'learning_rate': 1e-4,
    'verbose': True,
    'batch_size': 64,
    'init_scale': 0.01,
    'epochs': 10000,
    'initial_lambda': 50,
    # NOTE(review): nencoder was constructed with epsilon=0.2 while this is 0.1;
    # which of the two train() actually uses is not visible here -- confirm.
    'epsilon': 0.1,
}

# Long-running cell: the recorded log estimates well over an hour, and the saved
# run ended with a KeyboardInterrupt, so that run reached neither the assignment
# below nor the plot.
train_loss_history, train_avg_kept, train_reconstruction_loss = train(nencoder, decoder, dumb_dataset, parameters, device)
plt.plot(train_loss_history)

Initial lambda: tensor(50., device='cuda:0', requires_grad=True)
Epoch 0 iteration 0: loss = 1542.217, lambda: 50.308, % kept: 0.484, rec_loss: 4.852, avg likelihood: -6.930tp = 502.72 lines/s, ETA 01h24m52s
Epoch 25 iteration 0: loss = 2179.743, lambda: 80.643, % kept: 0.506, rec_loss: 4.269, avg likelihood: -6.932tp = 513.35 lines/s, ETA 01h22m54s
Epoch 50 iteration 0: loss = 2168.913, lambda: 103.394, % kept: 0.495, rec_loss: 3.311, avg likelihood: -6.931tp = 514.84 lines/s, ETA 01h22m27s
Epoch 75 iteration 0: loss = 2565.568, lambda: 124.440, % kept: 0.494, rec_loss: 3.254, avg likelihood: -6.931tp = 513.57 lines/s, ETA 01h22m27s
Epoch 100 iteration 0: loss = 2964.909, lambda: 145.159, % kept: 0.489, rec_loss: 3.224, avg likelihood: -6.931tp = 511.96 lines/s, ETA 01h22m30s
Epoch 125 iteration 0: loss = 3452.394, lambda: 165.792, % kept: 0.478, rec_loss: 3.288, avg likelihood: -6.930tp = 511.98 lines/s, ETA 01h22m17s
Epoch 150 iteration 0: loss = 3871.032, lambda: 186.369, % kept: 0

Epoch 1400 iteration 0: loss = 3889.176, lambda: 967.623, % kept: 0.059, rec_loss: 2.842, avg likelihood: -1.690tp = 510.59 lines/s, ETA 01h11m51s
Epoch 1425 iteration 0: loss = 2881.319, lambda: 971.116, % kept: 0.041, rec_loss: 2.827, avg likelihood: -1.268tp = 509.24 lines/s, ETA 01h11m50s


KeyboardInterrupt: 

In [123]:
# Scratch experiment: inspect what autograd exposes after sampling from `ps`.

# Part 1: Bernoulli samples drawn directly from the probabilities.
ps = torch.tensor([0.3, 0.8], requires_grad=True)
bs = torch.bernoulli(ps)
s = bs.sum()
print(s)
s.backward()
print(bs)
# `bs` is the result of an operation rather than a leaf tensor; the recorded
# output for this line is None.
print(bs.grad)

from torch.distributions import Categorical

# Part 2: sample through a distribution object and differentiate the log-probability.
# `ps` does not sum to 1; the recorded log_prob (-0.3185) equals log(0.8 / 1.1),
# i.e. the probabilities were normalized by Categorical.
b = Categorical(ps)
mask = b.sample()
# next_state, reward = env.step(action)
# loss = -m.log_prob(action) * reward
# NOTE(review): the two commented-out lines above look like a leftover
# log_prob * reward template; here the log-probability alone is used as the loss.
loss = b.log_prob(mask)
loss.backward()
print(loss)
# `mask` is the sampled value itself; the recorded output for this line is None.
print(mask.grad)

tensor(2., grad_fn=<SumBackward0>)
tensor([1., 1.], grad_fn=<BernoulliBackward0>)
None
tensor(-0.3185, grad_fn=<SqueezeBackward1>)
None


In [109]:
bs._grad

# End-to-end samples

In [47]:
# Which split of dumb_dataset to sample from; also read by the accuracy cell below.
SPLIT = 'train'

# NOTE(review): `copy` is not used anywhere in this cell.
import copy

# Round-trip one randomly chosen example through the baseline encoder and the decoder.
s = random.choice(dumb_dataset[SPLIT])
compressed = encoder.encode(s)
# The decoder is called with a list of strings, hence the one-element list here
# and the [0] index when printing.
decompressed = decoder([compressed])

print('String:', repr(s))
print('Encoded:', repr(compressed))
print('Decoded:', repr(decompressed[0]))
# Number of decoded outputs (1 in the recorded run, matching the single input).
print(len(decompressed))

String: 'ffffffffff'
Encoded: 'ffffffffff'
Decoded: 'd}{}\x13d}}|d}\x13\x13d})dddA}}\x13})}7d}}}|}}\x13d}\x13d}A|\x13}}Z(AA|\x13}}{}}}K}}AP}d}}\x13}d}}}}\x13d}j}}\x13Z\x13A}j}}}\x13dd})}\x13}Z\x13}|d\x13\x13}}}t})}\x13d}}dZ}}|}}d}Z}}dZAA}\x13})}dd}}dt}\x04d}dA}}}}A})d}}d}d}A}}\x04}A}A\x13d|A}}}}\x13A}\x13d}}}dA}}})d}d}{)\x13|A'
1


In [9]:
def top1accuracy(dataset):
    '''Fraction of strings that survive an encode/decode round trip unchanged.

    Each string is compressed with the global `encoder`, handed to the global
    `decoder` as a one-element list, and counted as correct when the first
    decoded string equals the original.

    Args:
        dataset: sized collection of strings to evaluate.

    Returns:
        A float in [0, 1]. An empty dataset yields 0.0 (it used to raise
        ZeroDivisionError).
    '''
    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0
    # Count matches directly instead of materializing a filtered list.
    correct = sum(1 for s in dataset if s == decoder([encoder.encode(s)])[0])
    return correct / total
print(top1accuracy(dumb_dataset[SPLIT]))

0.25
