In [1]:
import numpy as np
import tqdm
from sklearn.model_selection import train_test_split
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import *

import matplotlib.pyplot as plt
%matplotlib inline

Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'C:\\Users\\tumanov\\AppData\\Local\\Temp\\try_flags_rfq1z6g_.c:4:19: fatal error: cudnn.h: No such file or directory\r\ncompilation terminated.\r\n'
Mapped name None to device cuda0: GeForce GTX 1070 (0000:01:00.0)


## Utils

In [2]:
def non_trainable(net):
    """Strip the 'trainable' and 'regularizable' tags from every parameter of `net`.

    `net.params` is read as a mapping whose values are mutable tag sets; each
    set is edited in place. The same `net` object is returned, so the call can
    wrap a layer constructor directly.
    """
    frozen_tags = {'trainable', 'regularizable'}
    for param_tags in net.params.values():
        param_tags.difference_update(frozen_tags)
    return net

## Parameters

## Generating data

In [22]:
def generate_data(size, digits, problem = lambda a, b: a+b, problem_operator='{}+{}'):
    """Generate `size` unique arithmetic questions together with their answers.

    Each operand is built from 1..`digits` random decimal digits, so it lies in
    [0, 10**digits - 1]. A pair (a, b) is only used once, and (b, a) counts as
    the same pair.

    Parameters
    ----------
    size : int
        Number of questions to generate.
    digits : int
        Maximum number of decimal digits per operand (must be >= 1).
    problem : callable
        Maps the two integer operands to the answer.
    problem_operator : str
        Format string with two `{}` slots used to render the question text.

    Returns
    -------
    tuple
        (source_seqs, source_letters, source_letter_to_ix,
         target_seqs, target_letters, target_letter_to_ix) where
        source_seqs is an array of question strings, target_seqs is an array of
        the raw `problem(a, b)` results, the `*_letters` lists hold the sorted
        distinct characters of the questions / of `str(answer)`, and the
        `*_letter_to_ix` dicts map each character to its list position.

    Raises
    ------
    ValueError
        If `digits` < 1, or if `size` exceeds the number of distinct operand
        pairs (the sampling loop could never finish).
    """
    if digits < 1:
        raise ValueError('digits must be at least 1, got {}'.format(digits))

    # Unordered pairs (with repetition) over the 10**digits possible operands.
    n_values = 10 ** digits
    max_questions = n_values * (n_values + 1) // 2
    if size > max_questions:
        raise ValueError('cannot generate {} unique questions from {}-digit operands '
                         '(at most {})'.format(size, digits, max_questions))

    def random_operand():
        # Draw the digit count first, then that many digits; leading zeros collapse in int().
        n_digits = np.random.randint(1, digits + 1)
        return int(''.join(np.random.choice(list('0123456789')) for _ in range(n_digits)))

    source_seqs = []
    target_seqs = []
    seen = set()
    print('Generating data...')
    while len(source_seqs) < size:
        a, b = random_operand(), random_operand()
        # Skip any question we've already seen.
        # Also skip any such that x+Y == Y+x (hence the sorting).
        key = tuple(sorted((a, b)))
        if key in seen:
            continue
        seen.add(key)

        source_seqs.append(problem_operator.format(a, b))
        target_seqs.append(problem(a, b))

    print('Total addition questions:', len(source_seqs))

    # Sorted so the character -> index mapping does not depend on set ordering.
    source_letters = sorted({token for word in source_seqs for token in word})
    source_letter_to_ix = {l: i for i, l in enumerate(source_letters)}

    target_letters = sorted({token for ans in target_seqs for token in str(ans)})
    target_letter_to_ix = {l: i for i, l in enumerate(target_letters)}

    return (np.array(source_seqs), source_letters, source_letter_to_ix,
            np.array(target_seqs), target_letters, target_letter_to_ix)

In [23]:
def as_matrix(sequences, token_to_i, max_len=None, PAX_ix=-1):
    """
    Convert several sequences of tokens to an integer matrix, edible for a neural network.

    Crops at `max_len` (if given; otherwise the longest sequence sets the width)
    and pads shorter sequences with `PAX_ix` (default -1). Tokens missing from
    `token_to_i` are encoded as index 0.

    Returns an int32 array of shape (len(sequences), max_len); an empty batch
    yields a matrix with zero rows.
    """
    # `default=0` keeps an empty batch from crashing inside max().
    max_len = max_len or max(map(len, sequences), default=0)

    # int32 so that vocabularies with more than 127 entries are stored without overflow.
    matrix = np.full((len(sequences), max_len), PAX_ix, dtype='int32')
    for i, seq in enumerate(sequences):
        row_ix = [token_to_i.get(token, 0) for token in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix


In [24]:
# Symbolic int32 matrices shared by the model-building and compilation code:
# `input_sequence` feeds the encoder's input layer, `output_sequence` supplies the targets.
input_sequence = T.matrix(name='token sequence', dtype='int32')
output_sequence = T.matrix(name='target target_letters', dtype='int32')

In [7]:
def handle_model(nn, learning_rate=0.001):
    """Compile the training, loss and prediction functions for network `nn`.

    Relies on the module-level symbolic variables `input_sequence` and
    `output_sequence`, and on `target_letters` for the number of output classes.

    Parameters
    ----------
    nn : lasagne layer
        Output layer of the network; its output is sliced as a 3-D tensor
        (batch, time, classes).
    learning_rate : float
        Step size handed to the Adam optimiser.

    Returns
    -------
    (train, compute_cost, probs)
        `train` performs one Adam step and returns the loss, `compute_cost`
        returns the loss without updating, `probs` returns the network output
        at the last time step. All three take (input_sequence, output_sequence).
    """
    # Only parameters still tagged 'trainable' are optimised, so layers frozen
    # with non_trainable() keep their initial values.
    weights = get_all_params(nn, trainable=True)
    network_output = get_output(nn)

    # Prediction at step t is scored against the target token at step t + 1.
    predictions_flat = network_output[:, :-1, :].reshape([-1, len(target_letters)])
    targets = output_sequence[:, 1:].ravel()

    # do not count loss for '-1' (padding) tokens
    mask = T.nonzero(T.neq(targets, -1))

    loss = lasagne.objectives.categorical_crossentropy(predictions_flat[mask], targets[mask]).mean()
    updates = lasagne.updates.adam(loss, weights, learning_rate=learning_rate)

    # training
    train = theano.function([input_sequence, output_sequence], loss,
                            updates=updates, allow_input_downcast=True)

    # computing loss without training
    compute_cost = theano.function([input_sequence, output_sequence], loss,
                                   allow_input_downcast=True)

    # probabilities for the next token given the previous text
    last_probas = network_output[:, -1]

    # The output may not depend on `output_sequence`; keep the two-argument
    # signature but tell Theano not to reject the unused input.
    probs = theano.function([input_sequence, output_sequence], last_probas,
                            allow_input_downcast=True, on_unused_input='ignore')
    return train, compute_cost, probs

In [8]:
class CerMemory(lasagne.layers.Layer):
    """Memory layer that projects a query onto a learned matrix and back.

    The parameter `M` has shape (query_size, memory_size), where query_size is
    the second dimension of the incoming layer's output shape. On the forward
    pass every column of `M` is scaled to unit L2 norm; the output is
    `input . m . m^T`, which has the same shape as the input.

    Parameters
    ----------
    incoming : lasagne layer
        Layer producing the (batch, query_size) queries.
    memory_size : int
        Number of columns (memory slots) in `M`.
    M : initializer, array or shared variable
        Specification for the memory matrix, passed to `add_param`.
    """
    def __init__(self, incoming, memory_size, M=lasagne.init.Orthogonal(), **kwargs):
        super(CerMemory, self).__init__(incoming, **kwargs)
        self.query_shape = self.input_shape[1]
        self.memory_size = memory_size
        self.M = self.add_param(M, (self.query_shape, memory_size), name='M')

    def get_output_for(self, input, **kwargs):
        # keepdims=True leaves a broadcastable leading axis, so the per-column
        # norms divide M directly. (Theano's reshape takes the whole shape as
        # ONE argument; a second positional argument is `ndim`, not a dimension.)
        column_norms = T.sqrt(T.sqr(self.M).sum(axis=0, keepdims=True))
        m = self.M / column_norms
        weights = T.dot(input, m)
        return T.dot(weights, m.T)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.query_shape)

In [9]:
class EvcNormalizer(lasagne.layers.Layer):
    """Scale every row of a 2-D input to unit Euclidean (L2) norm.

    The output shape equals the input shape. No epsilon is added, so an
    all-zero row divides by zero.
    """
    def get_output_for(self, input, **kwargs):
        # keepdims=True yields a broadcastable (batch, 1) column of norms, so no
        # transpose/reshape is needed. (Theano's reshape takes the whole shape as
        # ONE argument; a second positional argument is `ndim`, not a dimension.)
        row_norms = T.sqrt(T.sqr(input).sum(axis=1, keepdims=True))
        return input / row_norms

In [13]:
def bazal_model(query_size, memory_size, hidden_size, memory_benchmark=False, bidir_features=False):
    """Build the encoder + memory network and return its output and memory layers.

    Relies on the module-level `input_sequence` symbolic variable and on
    `source_letters` for the vocabulary size.

    Parameters
    ----------
    query_size : int
        Units of the query projection (and of the dense layers that stand in
        for the memory when `memory_benchmark` is true).
    memory_size : int
        Number of memory slots given to CerMemory.
    hidden_size : int
        Units of the LSTM encoder(s) and of the first decoder dense layer.
    memory_benchmark : bool
        If true, replace the query/normalise/CerMemory path by two dense layers.
    bidir_features : bool
        If true, concatenate a backward LSTM's final state to the forward one.

    Returns
    -------
    (l_out, memory)
        `l_out` is the final single-unit linear layer; `memory` is the
        CerMemory layer (or the dense stand-in).
    """
    ##ENCODER
    l_in = InputLayer(shape=(None, None), input_var=input_sequence)
    # Positions holding -1 are padding and are masked out in the recurrent layers.
    l_mask = InputLayer(shape=(None, None), input_var=T.neq(input_sequence, -1))

    # Fixed identity embedding: each token index becomes a one-hot vector.
    vocab_size = len(source_letters)
    l_emb = non_trainable(EmbeddingLayer(l_in, vocab_size, vocab_size, W=np.eye(vocab_size)))

    features = LSTMLayer(l_emb, hidden_size, only_return_final=True, mask_input=l_mask)
    if bidir_features:
        # The backward encoder is only built when it is actually used.
        features_backward = LSTMLayer(l_emb, hidden_size, only_return_final=True,
                                      mask_input=l_mask, backwards=True)
        features = ConcatLayer([features, features_backward])

    if not memory_benchmark:
        ## QUERY BUILDER
        query = DenseLayer(features, query_size, nonlinearity=None)
        query = EvcNormalizer(query)
        ## Memory
        memory = CerMemory(query, memory_size)
    else:
        memory = DenseLayer(DenseLayer(features, query_size), query_size)

    to_decode = ConcatLayer([features, memory])
    l_out = DenseLayer(to_decode, hidden_size)
    l_out = DenseLayer(l_out, 1, nonlinearity=lasagne.nonlinearities.linear)
    return l_out, memory

In [19]:
# Number of unique questions requested from generate_data.
TRAINING_SIZE = 100000
# Maximum number of decimal digits per operand.
DIGITS = 3

# Units of the dense layer that builds the memory query.
QUERY_SIZE = 16
# Number of memory slots (columns of CerMemory's matrix M).
MEMORY_SIZE = 64
# Units of the LSTM encoder and of the first decoder dense layer.
HIDDEN_SIZE = 128

In [20]:
# Build the multiplication data set: question strings, answers and their vocabularies.
(source_seqs, source_letters, source_letter_to_ix,
 target_seqs, target_letters, target_letter_to_ix) = generate_data(
    TRAINING_SIZE,
    DIGITS,
    problem=lambda a, b: a * b,
    problem_operator='{}*{}',
)

Generating data...
Total addition questions: 100000


In [21]:
# Memory-augmented variant (no dense benchmark) with bidirectional encoder features.
l_out, memory = bazal_model(
    query_size=QUERY_SIZE,
    memory_size=MEMORY_SIZE,
    hidden_size=HIDDEN_SIZE,
    memory_benchmark=False,
    bidir_features=True,
)