# WordNet dataset generation

In [10]:
from nltk.corpus import wordnet as wn
from collections import defaultdict

In [None]:
def getDataForSynset(sname, fname):

    closure = defaultdict(set)

    def walk(node, ancestors):
        node_name = node.name()
        closure[node_name].update(ancestors)
        for s in node.hyponyms():
            walk(s, ancestors + [node_name])

    walk(wn.synset(sname), [sname])
    #Write out tsv
    with open(fname, 'w') as f:
        for n in closure:
            for a in closure[n]:
                f.write(n + '\t' + a + '\t' + "1\n")

In [None]:
getDataForSynset('entity.n.01', "data/wordnet_nouns.tsv")

In [None]:
getDataForSynset('mammal.n.01', "data/wordnet_mammals.tsv")

# Training

## Imports

In [1]:
import sys
import timeit
import gc
import logging
import argparse
import numpy as np
import torch as th
import torch.multiprocessing as mp
from torch.autograd import Variable
from collections import defaultdict as ddict
from sklearn.metrics import average_precision_score
from torch.utils.data import DataLoader

from model import LorentzEmbedding
from data import DatasetReader
from optimization import RiemannianSGD
from optimization import LorentzDistance

th.set_default_tensor_type(th.DoubleTensor)

## Helper functions

In [2]:
def evaluate(types, model):
    with th.no_grad():
        embs = th.from_numpy(model.embedding())
        embedding = Variable(embs)
        ranks = []
        ap_scores = []
        for s, tree in types.items():
            s_e = Variable(embs[s].unsqueeze(0))
            dists_curr = model.dist()(s_e, embedding).data.cpu().numpy().flatten()
            dists_curr[s] = 1e14
            labels = np.zeros(embedding.size(0))
            dists_masked = dists_curr.copy()
            ranks_curr = []
            
            for o in tree:
                dists_masked[o] = float('inf')
                labels[o] = 1 
            ap_scores.append(average_precision_score(labels, -dists_curr))
            for o in tree:
                d = dists_masked.copy()
                d[o] = dists_curr[o]
                r = np.argsort(d)
                ranks_curr.append(np.where(r == o)[0][0] + 1)
            ranks += ranks_curr
    return np.mean(ranks), np.mean(ap_scores)

## Constants

In [3]:
_lr_multiplier = 0.1                     #Burnin multiplier
dim = 3                                  #The dimensionality of the embdding
dataset = './data/wordnet_mammals.tsv'   #The dataset to learn
fout = './mammals.pth'                   #The output model file
lr_base = 1.0                            #The base learning rate
epochs = 1500                            #Max number of epochs
batchsize = 20                           #The batch size
negs = 50                                #Number of negative examples
eval_each = 10                           #The number of epochs between each evaluation
burnin = 20                              #The number of burnin epochs
nworkers = 5                             #The number of dataset readers

## Setup

In [4]:
#Dataset setup
data = DatasetReader(dataset, negs)
idx = data.samples

# create adjacency list for evaluation
adjacency = ddict(set)
for i in range(len(idx)):
    s, o, _ = idx[i]
    adjacency[s].add(o)
adjacency = dict(adjacency)

# initialize model and data
model = LorentzEmbedding(len(data.entities), dim, LorentzDistance)

# initialize optimizer
optimizer = RiemannianSGD(
    model.parameters(),
    lr=lr_base,
)

## Training

In [5]:
min_rank = (float('inf'), -1)
max_map = (0, -1)
loader = DataLoader(
        data,
        batch_size=batchsize,
        shuffle=True,
        num_workers=nworkers,
        collate_fn=data.collate
    )

for epoch in range(epochs):
    epoch_loss = []
    loss = None
    data.burnin = False
    lr = lr_base
    t_start = timeit.default_timer()
    if epoch < burnin:
        data.burnin = True
        lr = lr_base * _lr_multiplier
        
    #Training loop
    for inputs, targets in loader:
        inputs = Variable(th.from_numpy(np.vstack(inputs)))
        targets = Variable(th.from_numpy(np.vstack(targets))).squeeze()

        elapsed = timeit.default_timer() - t_start
        optimizer.zero_grad()
        preds = model(inputs)
        loss = model.loss(preds, targets, size_average=True)
        loss.backward()
        optimizer.step(lr=lr)
        epoch_loss.append(loss.data[0])
        th.save({
           'model': model.state_dict(),
           'epoch': epoch,
           'entities': data.entities
        }, 'run/%05d.pth'%epoch)
        
    #Evaluation
    if epoch == (epochs - 1) or epoch % eval_each == (eval_each - 1):
        th.save({
                'model': model.state_dict(),
                'epoch': epoch,
                'entities': data.entities
            }, fout)
        mrank, mAP = evaluate(adjacency, model)
        if mrank < min_rank[0]:
            min_rank = (mrank, epoch)
        if mAP > max_map[0]:
            max_map = (mAP, epoch)
        print(
            ('epoch: %d, '
             'loss: %.3f, '
             'mean rank: %.2f, '
             'mAP: %.4f, '
             'best rank: %.2f, '
             'best mAP: %.4f, '
             'time: %.2fs') % (
                 epoch, loss, mrank, mAP, min_rank[0], max_map[0], elapsed)
        )

    gc.collect()



epoch: 9, loss: 1.730, mean rank: 214.94, mAP: 0.0773, best rank: 214.94, best mAP: 0.0773, time: 1.51s
epoch: 19, loss: 1.623, mean rank: 202.15, mAP: 0.0829, best rank: 202.15, best mAP: 0.0829, time: 1.35s
epoch: 29, loss: 2.975, mean rank: 137.82, mAP: 0.1070, best rank: 137.82, best mAP: 0.1070, time: 2.35s
epoch: 39, loss: 2.399, mean rank: 110.76, mAP: 0.1390, best rank: 110.76, best mAP: 0.1390, time: 2.55s
epoch: 49, loss: 2.545, mean rank: 91.42, mAP: 0.1550, best rank: 91.42, best mAP: 0.1550, time: 2.25s
epoch: 59, loss: 2.715, mean rank: 77.13, mAP: 0.1661, best rank: 77.13, best mAP: 0.1661, time: 2.18s
epoch: 69, loss: 1.667, mean rank: 63.91, mAP: 0.1806, best rank: 63.91, best mAP: 0.1806, time: 2.57s
epoch: 79, loss: 1.579, mean rank: 54.00, mAP: 0.1939, best rank: 54.00, best mAP: 0.1939, time: 2.51s
epoch: 89, loss: 2.509, mean rank: 44.38, mAP: 0.2033, best rank: 44.38, best mAP: 0.2033, time: 2.22s
epoch: 99, loss: 0.875, mean rank: 34.73, mAP: 0.2223, best rank: 

Process Process-1717:
Process Process-1720:
Process Process-1719:
Process Process-1718:
Process Process-1716:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/dbw003/.conda/envs/jhubMachineLearning/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dbw003/.conda/envs/jhubMachineLearning/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dbw003/.conda/envs/jhubMachineLearning/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/dbw003/.conda/envs/jhubMachineLearning/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dbw003/.conda/envs/jhubMachineLearning/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dbw003/.conda/envs/jhubMachineLearning

KeyboardInterrupt: 

# Visualization

In [6]:
model = th.load('./mammals.pth')
lorentz_embeddings = model['model']['embeddings.weight']
dim0 = lorentz_embeddings[:,0].unsqueeze(1)
dimn = lorentz_embeddings[:,1:]

poincare_embeddings = dimn / (dim0 + 1)

In [11]:
links = defaultdict(set)
family_start = ['lynx.n.02']
for f in family_start:
    stack = [f]
    first = True
    while len(stack) > 0:
        v = stack.pop()
        for k in wn.synsets(v.split('.')[0]):
            if k.name() in model['entities']:
                for u in k.hypernyms():
                    if u.name() in model['entities']:                            
                        links[k.name()].add(u.name())
                        stack.append(u.name())

In [12]:
from torch.autograd import Function
eps = 1e-5
class Arcosh(Function):
    def __init__(self, eps=eps):
        super(Arcosh, self).__init__()
        self.eps = eps 

    def forward(self, x): 
        self.z = th.sqrt(x * x - 1)
        return th.log(x + self.z)

    def backward(self, g): 
        z = th.clamp(self.z, min=eps)
        z = g / z 
        return z

class PoincareDistance(Function):
    boundary = 1 - eps 

    def grad(self, x, v, sqnormx, sqnormv, sqdist):
        alpha = (1 - sqnormx)
        beta = (1 - sqnormv)
        z = 1 + 2 * sqdist / (alpha * beta)
        a = ((sqnormv - 2 * th.sum(x * v, dim=-1) + 1) / th.pow(alpha, 2)).unsqueeze(-1).expand_as(x)
        a = a * x - v / alpha.unsqueeze(-1).expand_as(v)
        z = th.sqrt(th.pow(z, 2) - 1)
        z = th.clamp(z * beta, min=eps).unsqueeze(-1)
        return 4 * a / z.expand_as(x)

    def forward(self, u, v): 
        self.save_for_backward(u, v)
        self.squnorm = th.clamp(th.sum(u * u, dim=-1), 0, self.boundary)
        self.sqvnorm = th.clamp(th.sum(v * v, dim=-1), 0, self.boundary)
        self.sqdist = th.sum(th.pow(u - v, 2), dim=-1)
        x = self.sqdist / ((1 - self.squnorm) * (1 - self.sqvnorm)) * 2 + 1 
        # arcosh
        z = th.sqrt(th.pow(x, 2) - 1)
        return th.log(x + z)

    def backward(self, g): 
        u, v = self.saved_tensors
        g = g.unsqueeze(-1)
        gu = self.grad(u, v, self.squnorm, self.sqvnorm, self.sqdist)
        gv = self.grad(v, u, self.sqvnorm, self.squnorm, self.sqdist)
        return g.expand_as(gu) * gu, g.expand_as(gv) * gv

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook
from matplotlib import animation, rc
from IPython.display import HTML, Image

from sklearn.decomposition import PCA
import glob
import time

rc('animation', html='html5')

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
ax.set_xlim(( -1.1, 1.1))
ax.set_ylim((-1.1, 1.1))
ax.set_axis_off()

bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=1)
ld = LorentzDistance()
pd = PoincareDistance()
s = [1 for n in range(len(model['entities']))]

links = {'lynx.n.02': {'feline.n.01'},
         'feline.n.01' : {'big_cat.n.01'},
         'big_cat.n.01': {'carnivore.n.01'},
         'carnivore.n.01': {'mammal.n.01'},
         'mammal.n.01': {}}
root = model['entities'].index('mammal.n.01')


In [None]:
fig_items = []
circle, = ax.plot([], [], linewidth=1, color='black')
fig_items.append(circle)
x = []
y = []
t = np.linspace(0,np.pi*2,1000)
x.extend(list(np.cos(t)))
y.extend(list(np.sin(t)))

scatter = ax.scatter([], [], s=s, color='darkblue')
fig_items.append(scatter)

label_items = {}
link_items = defaultdict(dict)
for l in links:
    t = ax.text([], [], l, bbox=bbox_props, ha="center", va="center")
    fig_items.append(t)
    label_items[l] = t
    for n in links[l]:
        p, = ax.plot([], [], 'k-', lw=0.75)
        fig_items.append(p)
        link_items[l][n] = p

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook
from sklearn.decomposition import PCA
import glob
import time
from scipy.interpolate import griddata

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=1)
ld = LorentzDistance()
pd = PoincareDistance()

x = []
y = []
for r in np.linspace(0, 0.99, 100):
    t = np.linspace(0,np.pi*2,1000 * r)
    x.extend(list(r*np.cos(t)))
    y.extend(list(r*np.sin(t)))
model = th.load('./mammals.pth')
lorentz_embeddings = model['model']['embeddings.weight']
dim0 = lorentz_embeddings[:,0].unsqueeze(1)
dimn = lorentz_embeddings[:,1:]
poincare_embeddings = dimn / (dim0 + 1)
p = poincare_embeddings[model['entities'].index('lynx.n.02')]
z = [pd(p, th.DoubleTensor([xp,yp]).cuda()).cpu().data.numpy() for xp,yp in zip(x,y)]

vmin=min(z)
vmax=max(z)

x = np.asarray(x)
y = np.asarray(y)
z = np.asarray(z)
x=x.ravel()              #Flat input into 1d vector
x=(x[x!=np.isnan])   #eliminate any NaN
y=y.ravel()
y=(y[y!=np.isnan])
z=z.ravel()
z=(z[z!=np.isnan])

ax.hexbin(x, y, C=z, cmap=plt.cm.jet, bins=None, vmin=0, vmax=vmax)
ax.set_axis_off()
ax.set_title("Heatmap showing the distance from the edge of the space to all other points.")
fig.canvas.draw()

In [None]:
plt.savefig('distance_from_edge.png')

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook
from sklearn.decomposition import PCA
import glob
import time
from scipy.interpolate import griddata

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=1)
ld = LorentzDistance()
pd = PoincareDistance()

x = []
y = []
for r in np.linspace(0, 0.99, 100):
    t = np.linspace(0,np.pi*2,1000 * r)
    x.extend(list(r*np.cos(t)))
    y.extend(list(r*np.sin(t)))
model = th.load('./mammals.pth')
lorentz_embeddings = model['model']['embeddings.weight']
dim0 = lorentz_embeddings[:,0].unsqueeze(1)
dimn = lorentz_embeddings[:,1:]
poincare_embeddings = dimn / (dim0 + 1)
p = th.DoubleTensor([0.0,0.0]).cuda()#poincare_embeddings[model['entities'].index('mammal.n.01')]
z = [pd(p, th.DoubleTensor([xp,yp]).cuda()).cpu().data.numpy() for xp,yp in zip(x,y)]

x = np.asarray(x)
y = np.asarray(y)
z = np.asarray(z)
x=x.ravel()              #Flat input into 1d vector
x=(x[x!=np.isnan])   #eliminate any NaN
y=y.ravel()
y=(y[y!=np.isnan])
z=z.ravel()
z=(z[z!=np.isnan])

ax.hexbin(x, y, C=z, cmap=plt.cm.jet, bins=None, vmin=0, vmax=vmax)
ax.set_title("Heatmap showing the distance from the center of the space to all other points.")
ax.set_axis_off()
fig.canvas.draw()

In [None]:
plt.savefig('distance_from_center.png')