In [1]:
from discopy import *

# Define neural networks

We start by defining neurons, layers and neural nets as composites of scalar multiplication, bias, copy and merge. See arxiv:1711.10455.

In [2]:
from discopy.function import *

def mult(weight):
    """Build a one-input, one-output function that multiplies by ``weight``.

    The returned ``Function`` is named ``mult(<weight>)``.
    """
    label = 'mult({})'.format(str(weight))

    def scale(x):
        return weight * x

    return Function(label, 1, 1, scale)
    
def mults(dom, weights):
    """Tensor together ``dom`` scalar multiplications, one per input wire.

    Wire ``i`` is scaled by ``weights[i]``; only the first ``dom`` entries of
    ``weights`` are read. The composite is renamed ``mults(<weights>)``.
    """
    scaled_wires = Id(0)  # starting point of the tensor product
    for wire in range(dom):
        scaled_wires = scaled_wires @ mult(weights[wire])
    # Replace the name of the composite with a compact, readable label.
    scaled_wires._name = 'mults({})'.format(str(weights))
    return scaled_wires

def bias(weight):
    """Build a function from ``Dim(0)`` to ``Dim(1)`` emitting ``weight``.

    The returned ``Function`` is named ``bias(<weight>)``; its argument is
    ignored and it always returns the one-element array ``[weight]``.
    """
    label = 'bias({})'.format(str(weight))

    def constant(x):
        return np.array([weight])

    return Function(label, Dim(0), Dim(1), constant)

def merge(cod, copies):
    """Build a function that sums ``copies`` blocks of ``cod`` entries each.

    The input has ``cod * copies`` entries and the output has ``cod``.
    Output entry ``i`` is the sum of input entries
    ``i, i + cod, ..., i + cod * (copies - 1)``.
    """
    label = 'merge({}, {})'.format(cod, copies)

    @discofunc(cod * copies, cod, name=label)
    def add(x):
        totals = []
        for i in range(cod):
            # Gather the i-th entry of every block, then add them up.
            strided = [x[i + cod * j] for j in range(copies)]
            totals.append(np.sum(strided))
        return np.array(totals)

    return add

def _logistic(x):
    """Logistic function ``1 / (1 + exp(-x))``."""
    return 1/(1 + np.exp(-x))


# One-input, one-output box applying the logistic activation.
sigmoid = Function('sigmoid', 1, 1, _logistic)

In [3]:
def neuron(dom, weights, beta=0):
    """Build a single neuron with ``dom`` inputs.

    ``weights`` is a 1d array of length ``dom`` and ``beta`` is a scalar bias.
    Each input is scaled by its weight, the bias is placed next to them as an
    extra wire, the ``dom + 1`` wires are summed by ``merge`` and the sum is
    passed through ``sigmoid``.
    """
    weighted_inputs = mults(dom, weights) @ bias(beta)
    pre_activation = weighted_inputs >> merge(1, dom + 1)
    return pre_activation >> sigmoid

We can jit and take the gradient of neurons using Jax.

In [4]:
from jax import jit, grad

sample_input = np.array([0., 0.3, 1.2, 3.2])

# Forward pass of a 4-input neuron wrapped in jit.
compiled_neuron = jit(neuron(4, [0, 2.1, 0.3, 0.1]))
print(compiled_neuron(sample_input))


def neuron_output(x):
    """Scalar output of the same neuron, so that grad can differentiate it."""
    return neuron(4, [0., 2.1, 0.3, 0.1])(x)[0]


# Gradient of the neuron's output with respect to its four inputs.
print(grad(neuron_output)(sample_input))

[0.7875132]
[0.         0.35140604 0.05020086 0.01673362]


In [5]:
def layer(dom, cod, weights, biases):
    """Build a fully connected layer of ``cod`` neurons over ``dom`` inputs.

    ``weights`` is an array of shape ``(cod, dom)`` (one row per neuron) and
    ``biases`` is a 1d array of shape ``(cod,)``. The input is duplicated with
    ``Copy(dom, cod)`` and each copy is fed to one neuron.
    """
    stacked = Id(0)
    for index in range(cod):
        stacked = stacked @ neuron(dom, weights[index], biases[index])
    return Copy(dom, cod) >> stacked

In [6]:
def disconnected_layer(x):
    """Scalar output of a one-neuron layer whose three weights are all zero."""
    # cod=1, so the layer has exactly one neuron and takes exactly one bias.
    return layer(3, 1, [[0., 0., 0.]], [0.])(x)[0]


# With every weight set to zero the output cannot depend on the input,
# so the gradient with respect to the input must vanish.
assert np.all(grad(disconnected_layer)(np.array([2., 3.4, 1.])) == np.array([0., 0., 0.]))

In [7]:
def neural_net(layers, weights, biases):
    """Compose fully connected layers into a feed-forward network.

    ``layers[i]`` is the number of neurons in layer ``i``; each layer is fully
    connected to the adjacent ones.
    ``weights`` is a list of ``len(layers) - 1`` arrays, the i-th of shape
    ``(layers[i + 1], layers[i])``.
    ``biases`` is a list of ``len(layers) - 1`` 1d arrays, the i-th of shape
    ``(layers[i + 1],)`` (one bias per neuron of the receiving layer).
    """
    network = Id(layers[0])
    n_transitions = len(layers) - 1
    for depth in range(n_transitions):
        fan_in, fan_out = layers[depth], layers[depth + 1]
        network = network >> layer(fan_in, fan_out, weights[depth], biases[depth])
    return network

In [8]:
from random import uniform

layers = [2, 3, 1]
layer_sizes = list(zip(layers[:-1], layers[1:]))  # (inputs, outputs) of each layer

# One weight row per receiving neuron, one weight per incoming wire.
weights = [[[uniform(-10, 10) for _ in range(n_in)] for _ in range(n_out)]
           for n_in, n_out in layer_sizes]
# One bias per receiving neuron.
biases = [[uniform(-5, 5) for _ in range(n_out)] for _, n_out in layer_sizes]


def nnet(x):
    """Scalar output of the randomly initialised network on input ``x``."""
    return neural_net(layers, weights, biases)(x)[0]


# NOTE(review): the wrapper returned by jit is discarded here, so the two
# calls below run the plain, un-jitted nnet.
jit(nnet)
print(nnet([2.23, 4.2]))
print(grad(nnet)([2., 4.]))

0.74754167
[DeviceArray(-0.0095527, dtype=float32), DeviceArray(0.01538139, dtype=float32)]


# Generate a language of grammatical sentences

Now we fix a vocabulary and generate grammatical sentences from it.

In [9]:
from discopy.pivotal import Ty, Box, Diagram
from discopy.pregroup import Word

# The two basic types; every word below is typed in terms of them.
s = Ty('s')
n = Ty('n')

# Words of type n.
Alice = Word('Alice', n)
Bob = Word('Bob', n)

# Words whose types involve adjoints (.l / .r) of the basic types.
loves = Word('loves', n.r @ s @ n.l)
is_rich = Word('is rich', n.r @ s)
who = Word('who', n.r @ n @ s.l @ n)

vocab = [Alice, who, is_rich, loves, Bob]

In [10]:
from time import time
from discopy.pregroup import brute_force

n_sentences = 20
gen = brute_force(*vocab)
sentences = []   # generated sentences, in the order they are found
parsing = {}     # sentence -> diagram produced by brute_force

print("Brute force search for grammatical sentences:")

start = time()
for i in range(n_sentences):
    diagram = next(gen)
    # Keep only the Word boxes of the diagram to spell out the sentence.
    words = [str(box) for box in diagram.boxes if isinstance(box, Word)]
    sentence = ' '.join(words) + '.'
    sentences.append(sentence)
    parsing[sentence] = diagram
    print(sentence)

elapsed = time() - start
print("{:.2f} seconds to generate {} sentences.\n".format(elapsed, n_sentences))

Brute force search for grammatical sentences:
Alice is rich.
Bob is rich.
Alice loves Alice.
Alice loves Bob.
Bob loves Alice.
Bob loves Bob.
Alice who is rich is rich.
Bob who is rich is rich.
Alice who is rich loves Alice.
Alice who is rich loves Bob.
Alice who loves Alice is rich.
Alice who loves Bob is rich.
Bob who is rich loves Alice.
Bob who is rich loves Bob.
Bob who loves Alice is rich.
Bob who loves Bob is rich.
Alice who who is rich is rich is rich.
Alice who is rich who is rich is rich.
Alice who loves Alice loves Alice.
Alice who loves Alice loves Bob.
11.90 seconds to generate 20 sentences.



# Autonomization

In order to map grammatical sentences to neural networks, we need to turn the parsed sentences into diagrams that do not contain cups or caps. This procedure is known as autonomization; see arXiv:1411.3827.

In [11]:
from discopy.pivotal import Cup, Cap, PivotalFunctor

# Boxes that the images of the relational words are built from.
love_box = Box('love_box', n @ n, s)
is_rich_box = Box('is_rich_box', n, s)
who_box0 = Box('who_box0', n, n @ n)
who_box1 = Box('who_box1', n @ s, n)

# Basic types are mapped to themselves.
ob = {n: n, s: s}

# Image of each word: caps create the adjoint wires of the word's type,
# and the box consumes the remaining wires.
loves_image = (Cap(n.r, n) @ Cap(n, n.l)
               >> Diagram.id(n.r) @ love_box @ Diagram.id(n.l))
is_rich_image = Cap(n.r, n) >> Diagram.id(n.r) @ is_rich_box
who_inner = (who_box0
             >> Diagram.id(n) @ Cap(s, s.l) @ Diagram.id(n)
             >> who_box1 @ Diagram.id(s.l @ n))
who_image = Cap(n.r, n) >> Diagram.id(n.r) @ who_inner

ar = {
    Alice: Alice,
    Bob: Bob,
    loves: loves_image,
    is_rich: is_rich_image,
    who: who_image,
}

A = PivotalFunctor(ob, ar, ob_cls=Ty, ar_cls=Diagram)

In [12]:
# Apply the functor A to every parsed sentence and normalise the result.
autonomised_parsing = {
    sentence: A(parsing[sentence]).normal_form()
    for sentence in (sentences[i] for i in range(n_sentences))}

# Quantum model generates the distribution to be learned

We use a quantum CircuitModel to generate a corpus of true sentences.

In [13]:
from discopy import CircuitModel
from discopy.circuit import Circuit, sqrt, Ket, H, Rx, CX, SWAP

# Three-qubit circuit used below as the image of `who`, built step by step.
ghz_step0 = sqrt(2) @ Ket(0, 0, 0)
ghz_step1 = Circuit.id(1) @ H @ Circuit.id(1)
ghz_step2 = Circuit.id(1) @ CX
ghz_step3 = (SWAP >> CX) @ Circuit.id(1)
GHZ = ghz_step0 >> ghz_step1 >> ghz_step2 >> ghz_step3

def intransitive_ansatz(phase):
    """One-qubit circuit: ``Ket(0)`` followed by ``Rx(phase)``."""
    initial_state = Ket(0)
    rotation = Rx(phase)
    return initial_state >> rotation

def transitive_ansatz(phase):
    """Two-qubit circuit: ``sqrt(2) @ Ket(0, 0)``, then ``H @ Rx(phase)``, then ``CX``."""
    prepared = sqrt(2) @ Ket(0, 0)
    rotated = prepared >> H @ Rx(phase)
    return rotated >> CX

ob_q = {s: 0, n: 1}


def ar_q(params):
    """Map each word to its circuit.

    ``params`` is a dict holding the phases under the keys ``'loves'`` and
    ``'is_rich'``; the other words are mapped to fixed circuits.
    """
    return {
        Alice: Ket(0),
        loves: transitive_ansatz(params['loves']),
        Bob: Ket(1),
        who: GHZ,
        is_rich: intransitive_ansatz(params['is_rich'])}


model_phases = {'loves': 0.5, 'is_rich': 1.}
QModel = CircuitModel(ob_q, ar_q(model_phases))

In [14]:
# Value measured by the quantum model on the parse of every generated sentence.
corpus = {
    text: QModel(parsing[text]).measure()
    for text in sentences}

# Tolerance used by the (currently disabled) true/false listing below.
epsilon = 1e-2

# print("True sentences:\n{}\n".format('\n'.join(sentence
#     for sentence, probability in corpus.items() if probability > 1 - epsilon)))
# print("False sentences:\n{}".format('\n'.join(sentence
#     for sentence, probability in corpus.items() if probability < epsilon)))

# Train a neural network to reproduce the same corpus

In [15]:
from sklearn.model_selection import train_test_split

# Split the sentences into two halves; random_state is fixed so the split
# is the same on every run.
sentence_train, sentence_test = train_test_split(
    sentences,
    test_size=0.5,
    random_state=42,
)

# print("Training set:\n{}\n".format('\n'.join(sentence_train)))
# print("Testing set:\n{}".format('\n'.join(map(str, sentence_test))))

Define the loss function with respect to a NumpyFunctor.

In [34]:
from discopy.function import NumpyFunctor

# Layer sizes of the network behind each trainable box. The first and last
# sizes match the box's input and output dimensions under `ob` below
# (love_box: n @ n -> s, is_rich_box: n -> s, who_box1: n @ s -> n).
layers = {'loves': [4, 5, 1],
          'is_rich': [2, 3, 1],
          'who': [3, 2]}

# Dimension assigned to each basic type.
ob = {s: Dim(1), n: Dim(2)}


def ar(params):
    """Map every word and box to a function, given the network parameters.

    ``params`` is a dict with keys ``'loves'``, ``'is_rich'`` and ``'who'``;
    each value is a ``[weights, biases]`` pair passed to ``neural_net``.
    Alice and Bob are fixed vectors and ``who_box0`` is a plain copy.
    """
    return {
        Alice: Function('Alice', 0, 2, lambda x: np.array([1., 0.])),
        # Named 'Bob' so the two constant boxes can be told apart.
        Bob: Function('Bob', 0, 2, lambda x: np.array([0., 1.])),
        love_box: neural_net(layers['loves'], params['loves'][0], params['loves'][1]),
        is_rich_box: neural_net(layers['is_rich'], params['is_rich'][0], params['is_rich'][1]),
        who_box0: Copy(2, 2),
        who_box1: neural_net(layers['who'], params['who'][0], params['who'][1])}


def F(params):
    """Build the NumpyFunctor for ``params``, ordered as [loves, is_rich, who]."""
    return NumpyFunctor(ob, ar({'loves': params[0], 'is_rich': params[1], 'who': params[2]}))


def evaluate(F, sentence):
    """Apply functor ``F`` to the autonomised parse of ``sentence``.

    The resulting function is called on the empty input and the first entry
    of its output is returned.
    """
    return F(autonomised_parsing[sentence])([])[0]

In [38]:
def rand_params(layers):
    """Draw random parameters for a fully connected network.

    ``layers`` lists the layer sizes. Returns ``[weights, biases]`` where
    ``weights[i]`` is a nested list with ``layers[i + 1]`` rows of
    ``layers[i]`` values and ``biases[i]`` is a list of ``layers[i + 1]``
    values, every value drawn with ``uniform(-10, 10)``.
    """
    n_transitions = len(layers) - 1

    # All weights are drawn before any bias.
    weights = []
    for i in range(n_transitions):
        rows = []
        for _ in range(layers[i + 1]):
            rows.append([uniform(-10, 10) for _ in range(layers[i])])
        weights.append(rows)

    biases = []
    for i in range(n_transitions):
        biases.append([uniform(-10, 10) for _ in range(layers[i + 1])])

    return [weights, biases]

# One [weights, biases] pair per trainable box, in the order F reads them:
# loves, is_rich, who.
params0 = [rand_params(layers[word]) for word in ('loves', 'is_rich', 'who')]

In [39]:
def _mean_squared_error(params, sentence_set):
    """Mean squared difference between ``corpus`` values and model outputs.

    ``params`` is passed to ``F`` to build the functor; ``sentence_set`` is
    the collection of sentences to average over.
    """
    functor = F(params)  # built once: it does not depend on the sentence
    return np.mean(np.array([
        (corpus[sentence] - evaluate(functor, sentence)) ** 2
        for sentence in sentence_set]))


@jit
def training_loss(params):
    """Mean squared error of the model on ``sentence_train``."""
    return _mean_squared_error(params, sentence_train)


@jit
def testing_loss(params):
    """Mean squared error of the model on ``sentence_test``."""
    return _mean_squared_error(params, sentence_test)


training_loss(params0)

DeviceArray(0.5265648, dtype=float32)