# Rumelhart and Todd network (1993)

### Ethan Blackwood
### September 28, 2020

**Goal**: Simulate the Rumelhart & Todd connectionist semantic memory network shown in Rogers & McClelland (2008)
Figure 1, and replicate the results in Figure 3 regarding the similarity of internal item representations over time.

In [2]:
%matplotlib widget
%config IPCompleter.greedy=True

import itertools
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from scipy.cluster import hierarchy

import ptree

First, build the tree that contains all our inputs and outputs.

In [3]:
# can afford to use doubles for this
torch.set_default_tensor_type(torch.DoubleTensor)

rumeltree = ptree.from_xml('rumeltree.xml')

# Convert to lists so we have a canonical order for items, relations, and attributes.
items = list(rumeltree['items'])
relations = list(rumeltree['relations'])
attributes = list(rumeltree['attributes'])

# Now make our inputs and outputs.
item_vecs = torch.eye(len(items)).split(1)
rel_vecs = torch.eye(len(relations)).split(1)
xs = list(itertools.product(item_vecs, rel_vecs))
items_rep = torch.stack([x[0] for x in xs], axis=0)
rels_rep = torch.stack([x[1] for x in xs], axis=0)
xs_cat = (items_rep, rels_rep)

y = torch.zeros((len(xs), len(attributes)))

for kI in range(len(items)):
    for kR in range(len(relations)):

        # get attributes to associate
        my_attrs = rumeltree['nodes'][items[kI]].get_related_attributes(relations[kR])
        attr_inds = np.isin(attributes, list(my_attrs))
        y[kI*len(relations) + kR, attr_inds] = 1

ys = y.split(1)
ys_cat = y

# prepare for MultiLabelMarginLoss
# y_inds = torch.full((len(ys), len(attributes)), -1, dtype=torch.long)
# for i, y_vec in enumerate(ys):
#    y_ind = y_vec.nonzero(as_tuple=True)[1]
#    y_inds[i, :len(y_ind)] = y_ind

# y_inds = y_inds.split(1)

print('Items: ', items)
print('Relations: ', relations)
print('Attributes: ', attributes)
print()
print('Some examples:')
rng = np.random.default_rng()

for k in rng.choice(len(xs), size=4, replace=False):
    x = xs[k]
    item_hot = x[0].numpy().squeeze().nonzero()[0]
    item = items[item_hot[0]]
    rel_hot = x[1].numpy().squeeze().nonzero()[0]
    relation = relations[rel_hot[0]]
    
    y = ys[k]
    attrs_hot = y.numpy().squeeze().nonzero()[0]
    attrs = [attributes[i] for i in attrs_hot]
    
    print(f'{item} {relation}: {", ".join(attrs) if len(attrs) > 0 else "<nothing>"}')

Items:  ['rose', 'pine', 'sunfish', 'oak', 'canary', 'salmon', 'robin', 'daisy']
Relations:  ['is', 'has', 'can', 'ISA']
Attributes:  ['scales', 'skin', 'oak', 'fly', 'bark', 'salmon', 'sing', 'organism', 'living', 'red', 'move', 'rose', 'grow', 'pine', 'branches', 'green', 'pretty', 'bird', 'leaves', 'yellow', 'animal', 'flower', 'big', 'petals', 'plant', 'sunfish', 'swim', 'canary', 'feathers', 'gills', 'robin', 'roots', 'tree', 'fish', 'wings', 'daisy']

Some examples:
robin is: living, red
oak has: bark, branches, leaves, roots
sunfish has: scales, skin, gills
canary is: living, yellow


Now build the network and training function.

In [4]:
class RumelNet(nn.Module):
    def __init__(self, n_items, n_relations, n_attributes):
        super(RumelNet, self).__init__()
        
        self.n_items = n_items
        self.n_relations = n_relations
        self.n_attributes = n_attributes
        
        rep_size = 8
        hidden_size = 15
        
        # define layers
        self.item_to_rep = nn.Linear(n_items, rep_size)
        self.rep_to_hidden = nn.Linear(rep_size, hidden_size)
        self.rel_to_hidden = nn.Linear(n_relations, hidden_size, bias=False) # only need one hidden layer bias
        self.hidden_to_attr = nn.Linear(hidden_size, n_attributes)
        
        # make weights start small
        with torch.no_grad():
            for layer in (self.item_to_rep, self.rep_to_hidden, self.rel_to_hidden, self.hidden_to_attr):
                nn.init.normal_(layer.weight.data, std=0.1)
                if layer.bias is not None:
                    nn.init.normal_(layer.bias.data, std=0.1)

    def forward(self, x):
        # split into item and relation
        item, relation = x
        
        # flow inputs through network
        rep = torch.sigmoid(self.item_to_rep(item))
        hidden = torch.sigmoid(self.rep_to_hidden(rep) + self.rel_to_hidden(relation))
        attr = torch.sigmoid(self.hidden_to_attr(hidden))
        return attr

In [7]:
def train_network(net, optimizer, num_epochs=200, snap_freq=20, batch_size=4):
    
    n_snaps = num_epochs // snap_freq
    n_items = net.n_items
    n_rep = net.item_to_rep.out_features
    
    # Holds snapshots of input representation layer after probing with each item
    rep_snapshots = np.ndarray((n_snaps, n_items, n_rep))
    
    criterion = nn.MSELoss()
    
    n_batches = (len(xs)-1) // batch_size + 1
    
    for epoch in range(num_epochs):
        # collect snapshot
        if epoch % snap_freq == 0:
            k_snap = epoch // snap_freq
            
            with torch.no_grad():
                for k_item, item in enumerate(item_vecs):
                    act = torch.sigmoid(net.item_to_rep(item))
                    rep_snapshots[k_snap, k_item, :] = act
        
#         optimizer.zero_grad()
#         outputs = net(xs_cat)
#         loss = criterion(outputs, ys_cat)
#         loss.backward()
#         optimizer.step()
        
#         if epoch % 10 == 0:
#             with torch.no_grad():
#                 running_loss = loss.item()
#                 running_accuracy = torch.mean(((outputs > 0.5).to(torch.double) == ys_cat).to(torch.double)).item()

        running_loss = 0.0
        running_accuracy = 0.0

        order = rng.permutation(len(xs))
        for k_batch in range(n_batches):
            # train
            optimizer.zero_grad()
            batch_inds = order[k_batch*batch_size:(k_batch+1)*batch_size] 
            
            outputs = net((xs_cat[0][batch_inds], xs_cat[1][batch_inds]))
            loss = criterion(outputs, ys_cat[batch_inds])
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                running_loss += loss.item() * len(batch_inds)
                accuracy = torch.mean(((outputs > 0.5).to(torch.double) == ys_cat[batch_inds]).to(torch.double))
                running_accuracy += accuracy.item() * len(batch_inds)
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch} end: mean loss = {running_loss / len(xs):.3f}, mean accuracy = {running_accuracy / len(xs):.3f}')
        
    return rep_snapshots

Moment of truth, time to run it

In [8]:
net = RumelNet(len(items), len(relations), len(attributes))
optimizer = torch.optim.SGD(net.parameters(), lr=0.001)

rep_snapshots = train_network(net, optimizer, num_epochs=2000)

Epoch 0 end: mean loss = 0.254, mean accuracy = 0.489
Epoch 10 end: mean loss = 0.254, mean accuracy = 0.489
Epoch 20 end: mean loss = 0.253, mean accuracy = 0.489
Epoch 30 end: mean loss = 0.253, mean accuracy = 0.492
Epoch 40 end: mean loss = 0.252, mean accuracy = 0.495
Epoch 50 end: mean loss = 0.252, mean accuracy = 0.495
Epoch 60 end: mean loss = 0.251, mean accuracy = 0.502
Epoch 70 end: mean loss = 0.251, mean accuracy = 0.507
Epoch 80 end: mean loss = 0.250, mean accuracy = 0.507
Epoch 90 end: mean loss = 0.250, mean accuracy = 0.511
Epoch 100 end: mean loss = 0.250, mean accuracy = 0.513
Epoch 110 end: mean loss = 0.249, mean accuracy = 0.513
Epoch 120 end: mean loss = 0.249, mean accuracy = 0.513
Epoch 130 end: mean loss = 0.248, mean accuracy = 0.513
Epoch 140 end: mean loss = 0.248, mean accuracy = 0.518
Epoch 150 end: mean loss = 0.247, mean accuracy = 0.526
Epoch 160 end: mean loss = 0.247, mean accuracy = 0.533
Epoch 170 end: mean loss = 0.246, mean accuracy = 0.533
Epo

Epoch 1480 end: mean loss = 0.194, mean accuracy = 0.823
Epoch 1490 end: mean loss = 0.193, mean accuracy = 0.826
Epoch 1500 end: mean loss = 0.193, mean accuracy = 0.827
Epoch 1510 end: mean loss = 0.193, mean accuracy = 0.829
Epoch 1520 end: mean loss = 0.192, mean accuracy = 0.828
Epoch 1530 end: mean loss = 0.192, mean accuracy = 0.828
Epoch 1540 end: mean loss = 0.192, mean accuracy = 0.828
Epoch 1550 end: mean loss = 0.191, mean accuracy = 0.834
Epoch 1560 end: mean loss = 0.191, mean accuracy = 0.837
Epoch 1570 end: mean loss = 0.191, mean accuracy = 0.843
Epoch 1580 end: mean loss = 0.190, mean accuracy = 0.845
Epoch 1590 end: mean loss = 0.190, mean accuracy = 0.845
Epoch 1600 end: mean loss = 0.190, mean accuracy = 0.846
Epoch 1610 end: mean loss = 0.189, mean accuracy = 0.845
Epoch 1620 end: mean loss = 0.189, mean accuracy = 0.846
Epoch 1630 end: mean loss = 0.189, mean accuracy = 0.845
Epoch 1640 end: mean loss = 0.189, mean accuracy = 0.846
Epoch 1650 end: mean loss = 0.1

In [9]:
z = hierarchy.linkage(rep_snapshots[-1])
plt.figure()
hierarchy.dendrogram(z, labels=items)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
# Test trained net
with torch.no_grad():
    ind = 31
    
    item_vec, rel_vec = [t.squeeze().numpy() == 1 for t in xs[ind]]
    item = np.array(items)[item_vec][0]
    relation = np.array(relations)[rel_vec][0]
    
    fig, ax = plt.subplots(figsize=(8, 15))
    h1 = ax.barh(range(len(attributes)), net(xs[ind]).squeeze().numpy(),
                 align='edge', height=0.4, tick_label=attributes)
    h2 = ax.barh(range(len(attributes)), ys[ind].squeeze().numpy(),
                 align='edge', height=-0.4, tick_label=attributes)
    ax.legend([h1, h2], ['Actual', 'Expected'])
    ax.set_title(f'{item} {relation}...', size='x-large')
    fig.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …