In [2]:
# set up libraries and plotting UI
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
import seaborn
# Apply seaborn's styling with notebook-wide rc overrides: 10x8 figures and a base font size of 12.
seaborn.set(rc={"figure.figsize": (10, 8), "font.size": 12})

def matshow(mat, **kwargs):
    """Draw `mat` as a seaborn heatmap with square cells.

    The call always passes ``square=True`` and ``xticklabels=50`` /
    ``yticklabels=50`` (in seaborn an integer means "label every n-th
    row/column"). Any extra keyword arguments are forwarded to
    `seaborn.heatmap` unchanged. Nothing is returned.
    """
    fixed_options = dict(square=True, xticklabels=50, yticklabels=50)
    seaborn.heatmap(mat, **fixed_options, **kwargs)

In [4]:
from conceptnet_retrofitting.loaders import *
from conceptnet_retrofitting.word_vectors import WordVectors
from conceptnet_retrofitting.builders.build_assoc import build_relations_from_conceptnet
from ordered_set import OrderedSet
from sklearn.preprocessing import normalize

In [5]:
# Directory (relative to the notebook's working directory) that holds the label,
# vector and replacement files loaded in the cells below.
PATH = '../build-data/'

In [9]:
pwd

'/media/rspeer/wobbly_data/code/conceptnet-retrofitting-paper/code/notebook'

In [7]:
# Vocabulary of the ConceptNet-filtered GloVe build, wrapped in an OrderedSet.
labels = OrderedSet(load_labels(PATH + 'glove.840B.300d.filtered.conceptnet5.labels'))
# Build the relation data from the reduced ConceptNet CSV for that vocabulary.
# Later cells treat the result as a mapping: it is iterated with .items() and each
# value is sliced in two dimensions.
# NOTE(review): this is an absolute, machine-specific path, unlike PATH above —
# consider making it configurable.
sparse_rels = build_relations_from_conceptnet(labels, '/wobbly/data/conceptnet5/assoc/reduced.csv')

In [10]:
# cnglove: word vectors built from the '...filtered.conceptnet5' labels, matrix and
# replacements files (plotted later as the "after retrofitting" points).
# NOTE(review): the saved output of this cell is an AssertionError, so one of the two
# loads below failed when the notebook was last run — confirm the files exist and agree.
cnglove = load_word_vectors(
    PATH + 'glove.840B.300d.filtered.conceptnet5.labels',
    PATH + 'glove.840B.300d.l1.filtered.conceptnet5.npy',
    PATH + 'glove.840B.300d.filtered.conceptnet5.replacements.msgpack'
)
# glove: word vectors built from the '...standardized' labels and matrix files, with no
# replacements file (plotted later as the "before retrofitting" points).
glove = load_word_vectors(PATH + 'glove.840B.300d.standardized.labels', PATH + 'glove.840B.300d.l1.standardized.npy')

AssertionError: 

In [None]:
from conceptnet_retrofitting.builders.retrofit import dense_relation_array

In [None]:
# Build the dense relation array from the first 100000 rows of the vector matrix and the
# matching 100000 x 100000 corner of every relation matrix in sparse_rels.
# NOTE(review): 100000 is repeated three times; presumably a vocabulary cutoff to keep
# the computation tractable — confirm, and consider naming it.
rel_array = dense_relation_array(
    cnglove.vectors[:100000],
    {rel: sp[:100000, :100000] for (rel, sp) in sparse_rels.items()}
)

In [None]:
def en_filter(term):
    """Return True when `term` begins with the '/c/en/' prefix, False otherwise."""
    english_prefix = '/c/en/'
    return term.startswith(english_prefix)

In [None]:
# Sorted keys of sparse_rels; analogy() uses this list to label the relation strengths
# it prints. Presumably the order matches the first axis of rel_array — TODO confirm.
rel_labels = sorted(sparse_rels)

In [None]:
def read_analogies(filename):
    """Yield ``(inputs, answers)`` pairs from a tab-separated analogy file.

    Blank lines and lines starting with '#' are skipped. On every other line
    the first tab-separated field is ignored (presumably a question id —
    confirm against the data), fields 2-4 become `inputs`, and all remaining
    fields become `answers`.

    Parameters
    ----------
    filename : str
        Path of a UTF-8 encoded text file.

    Yields
    ------
    (list of str, list of str)
        The `inputs` and `answers` of one line. Either list may be shorter
        than expected (or empty) when the line has too few fields.
    """
    # The context manager closes the file once the generator is exhausted or
    # discarded, instead of leaving the handle for the garbage collector.
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            yield parts[1:4], parts[4:]

In [None]:
# Stack rel_array with a copy whose last two axes are swapped, along the first axis,
# giving twice as many slices (each original slice plus its transpose).
sym_rel_array = np.concatenate([rel_array, rel_array.swapaxes(1, 2)], axis=0)

In [None]:
# Sanity check: display the dimensions of the dense relation array.
rel_array.shape

In [None]:
# Baseline relation scores: the column-wise mean of all vectors multiplied through
# rel_array on both sides. which_relation() subtracts this from its scores.
any_rel = np.mean(cnglove.vectors, 0) @ rel_array @ np.mean(cnglove.vectors, 0)

In [None]:
def which_relation(wv, rar, c1, c2):
    """Estimate how strongly each relation in `rar` links `c1` to `c2`.

    The vector of `c2` is multiplied through `rar` against the vector of `c1`.
    The module-level baseline `any_rel` is subtracted from the scores,
    negative differences are clipped to zero, and the remainder is squared.
    When the squares have a positive sum they are divided by it, so the
    result sums to 1; otherwise the all-zero array is returned.

    Depends on the global `any_rel`, which must be broadcast-compatible with
    the scores computed from `rar`. Any exception raised by `wv.to_vector`
    propagates to the caller.
    """
    target_vec = wv.to_vector(c2)
    source_vec = wv.to_vector(c1)
    raw_scores = target_vec @ rar @ source_vec
    excess = np.maximum(0, raw_scores - any_rel) ** 2
    total = np.sum(excess)
    if total > 0:
        excess /= total
    return excess

In [None]:
def rank3_inner_product(vec, array3):
    """Collapse the first axis of `array3` into a sum weighted by `vec`.

    `vec` is reshaped with two trailing singleton axes so that ``vec[i]``
    scales ``array3[i]``; the scaled slices are then summed over axis 0.
    """
    weights = vec[:, None, None]
    scaled = array3 * weights
    return scaled.sum(axis=0)

In [None]:
def eval_analogy(wv, rar, c1, c2, c3, c4):
    """Score `c4` as the completion of the analogy ``c1 : c2 :: c3 : c4``.

    Returns 0. when `c4` equals one of the three given terms, or when a
    KeyError is raised while computing the relations or looking up vectors.
    Otherwise the relation profiles for (c1, c2) and (c1, c3) are each folded
    with `rar` into a single matrix, and the score is built from `c4`'s
    products with those matrices: the terms involving `c3` and `c2` go in the
    numerator, the terms involving `c1` in the denominator. Each term has 1
    added before they are combined.
    """
    if c4 in [c1, c2, c3]:
        return 0.
    try:
        profile_12 = which_relation(wv, rar, c1, c2)
        profile_13 = which_relation(wv, rar, c1, c3)
        mix_12 = rank3_inner_product(profile_12, rar)
        mix_13 = rank3_inner_product(profile_13, rar)
        vec1, vec2, vec3, vec4 = (wv.to_vector(term) for term in (c1, c2, c3, c4))
        toward_c3 = vec4 @ mix_12 @ vec3 + 1
        toward_c2 = vec4 @ mix_13 @ vec2 + 1
        away_via_12 = vec4 @ mix_12 @ vec1 + 1
        away_via_13 = vec4 @ mix_13 @ vec1 + 1
    except KeyError:
        return 0.
    else:
        # NOTE(review): the first numerator term is cubed here, while analogy()
        # squares the corresponding term — confirm which exponent is intended.
        return (toward_c3 ** 3 * toward_c2) / (away_via_12 + away_via_13)


In [None]:
def analogy(wv, rar, c1, c2, c3, num=20):
    """Rank vocabulary terms as completions of ``c1 : c2 :: c3 : ?``.

    Prints the relation profile inferred for (c1, c2) under "RelA" and for
    (c1, c3) under "RelB", pairing each strength (scaled by 1000) with a
    label from the doubled `rel_labels` list. Every row of `wv.vectors` is
    then scored with the same ratio used for single candidates, and the
    labels accepted by `en_filter` are collected in descending score order.

    Returns a list of ``(label, score)`` tuples. The loop stops as soon as the
    list holds at least `num` entries; that check runs after each candidate.
    """
    profile_a = which_relation(wv, rar, c1, c2)
    profile_b = which_relation(wv, rar, c1, c3)

    for heading, profile in (("RelA", profile_a), ("RelB", profile_b)):
        print(heading)
        for label, strength in zip(rel_labels + rel_labels, profile):
            print('\t%-20s\t% 7.1f' % (label, strength * 1000))

    mix_a = rank3_inner_product(profile_a, rar)
    mix_b = rank3_inner_product(profile_b, rar)
    vec1, vec2, vec3 = (wv.to_vector(term) for term in (c1, c2, c3))

    # One score component per vocabulary row; each is shifted by +1.
    toward_c3 = wv.vectors @ (mix_a @ vec3) + 1
    toward_c2 = wv.vectors @ (mix_b @ vec2) + 1
    away_via_a = wv.vectors @ (mix_a @ vec1) + 1
    away_via_b = wv.vectors @ (mix_b @ vec1) + 1
    scores = (toward_c3 ** 2 * toward_c2) / (away_via_a + away_via_b)

    hits = []
    for position in np.argsort(-scores):
        candidate = wv.labels[position]
        if en_filter(candidate):
            hits.append((candidate, scores[position]))
        if len(hits) >= num:
            break
    return hits

In [None]:
# Make NumPy raise FloatingPointError for every kind of floating-point error
# (divide, overflow, underflow, invalid) instead of warning. The setting stays in
# effect for the cells that follow.
np.seterr(all='raise')
# Example query: fire : hot :: snow : ?
analogy(cnglove, rel_array, 'fire', 'hot', 'snow', num=20)

In [None]:
def analogy_3cosmul(wv, c1, c2, c3, num=20):
    """Rank vocabulary terms for ``c1 : c2 :: c3 : ?`` with a multiplicative score.

    Each row of `wv.vectors` is scored as the product of its shifted
    similarities to `c2` and `c3`, divided by its shifted similarity to `c1`.
    The shift is +1 for `c2` and `c3`, and +1.000001 for `c1`.

    Returns ``(label, score)`` tuples for the labels accepted by `en_filter`,
    in descending score order. The loop stops once the list holds at least
    `num` entries; that check runs after each candidate.
    """
    def shifted_similarity(term, offset):
        return (wv.vectors @ wv.to_vector(term)) + offset

    sim_c1 = shifted_similarity(c1, 1.000001)
    sim_c2 = shifted_similarity(c2, 1)
    sim_c3 = shifted_similarity(c3, 1)
    scores = sim_c2 * sim_c3 / (sim_c1)

    hits = []
    for position in np.argsort(-scores):
        candidate = wv.labels[position]
        if en_filter(candidate):
            hits.append((candidate, scores[position]))
        if len(hits) >= num:
            break
    return hits


In [None]:
def eval_analogy_3cosmul(wv, c1, c2, c3, c4):
    """Score `c4` as the answer to ``c1 : c2 :: c3 : c4`` multiplicatively.

    Uses `c4`'s products with the vectors of the three given terms, shifted
    by +1 for `c2` and `c3` and by +1.000001 for `c1`. Returns 0. when any
    vector lookup raises KeyError.
    """
    shifts = ((c1, 1.000001), (c2, 1), (c3, 1))
    try:
        candidate_vec = wv.to_vector(c4)
        sim_c1, sim_c2, sim_c3 = [
            candidate_vec @ wv.to_vector(term) + shift for term, shift in shifts
        ]
    except KeyError:
        return 0.
    # NOTE(review): the c3 similarity is squared here but not in analogy_3cosmul()
    # — confirm the asymmetry is intended.
    return sim_c2 * (sim_c3 ** 2) / sim_c1

In [None]:
def eval_analogies(analogy_func, filename='/nfs/broadway/data/corpora/readtheory-analogies.txt'):
    """Score a multiple-choice analogy file with `analogy_func` and report accuracy.

    Parameters
    ----------
    analogy_func : callable
        Called as ``analogy_func(c1, c2, c3, candidate)`` for every candidate
        answer; must return a number, where larger means a better fit.
    filename : str
        Path handed to `read_analogies`.

    Returns
    -------
    float
        Fraction of questions answered correctly, or 0.0 when the file
        contains no questions.

    Every wrongly answered question is printed, followed by a summary line.
    """
    total = 0
    correct = 0
    for inputs, answers in read_analogies(filename):
        # The 'inputs' are the three given components of the analogy.
        # 'answers' are the multiple-choice answers, where the correct answer is first in the list.
        best_score = 0.
        best_answer = ''
        for answer in answers:
            score = analogy_func(*inputs, answer)
            # '>=' lets a later candidate win ties, so a question where every
            # candidate scores 0 is not credited to the (correct) first answer.
            if score >= best_score:
                best_score = score
                best_answer = answer
        total += 1
        if best_answer == answers[0]:
            correct += 1
        else:
            items = (*inputs, best_answer.upper(), answers[0].upper())
            print("%s : %s :: %s : %s (should be %s)" % items)

    if total == 0:
        # An empty (or comment-only) file used to fail with ZeroDivisionError.
        print("Score: n/a (no analogies found in %s)" % (filename,))
        return 0.0

    accuracy = correct / total
    print("Score: %2.2f%% (%d/%d)" % (accuracy * 100, correct, total))
    return accuracy

In [None]:
def curry_3cosmul(c1, c2, c3, c4):
    """Four-argument adapter: score one candidate with `eval_analogy_3cosmul` on the global `cnglove` vectors."""
    score = eval_analogy_3cosmul(cnglove, c1, c2, c3, c4)
    return score

In [None]:
def curry_inferred(c1, c2, c3, c4):
    """Four-argument adapter: score one candidate with `eval_analogy` on the global `cnglove` vectors and `rel_array`."""
    score = eval_analogy(cnglove, rel_array, c1, c2, c3, c4)
    return score

In [None]:
# Compare both scoring functions on the default analogy file of eval_analogies
# (the readtheory set) ...
print('\n3cosmul:')
eval_analogies(curry_3cosmul)
print('\ninferred relations:')
eval_analogies(curry_inferred)

# ... and again on the learningexpress set.
# NOTE(review): both corpora are referenced by absolute /nfs paths — consider a
# configurable data directory.
print('\n3cosmul:')
eval_analogies(curry_3cosmul, filename='/nfs/broadway/data/corpora/learningexpress-analogies.txt')
print('\ninferred relations:')
eval_analogies(curry_inferred, filename='/nfs/broadway/data/corpora/learningexpress-analogies.txt')

In [None]:
# Words (all transit-related) whose vectors are plotted by scatter_words below.
example_words = ['metro', 'railway', 'subway', 'transit', 'public transit', 'public transportation', 'busway', 'light rail']

In [None]:
def scatter_words(wv1, wv2, words):
    """Plot how each word's vector moves between two sets of word vectors.

    The vectors of `words` from `wv1` and `wv2` are stacked, and the two
    feature indices with the largest components in the first right-singular
    vector of that stack are used as the x and y axes (the chosen indices are
    printed). `wv1` points are drawn as "before retrofitting", `wv2` points
    as "after retrofitting" and labelled with their word, and an arrow runs
    from each "before" point towards its "after" point.

    Parameters
    ----------
    wv1, wv2 : objects with a ``to_vector(word)`` method
        Sources of the "before" and "after" vectors.
    words : sequence of str
        Words to plot; must be non-empty (`np.vstack` rejects an empty list).

    Returns
    -------
    The `matplotlib.pyplot` module, so callers can keep adjusting the figure.
    """
    vectors1 = np.vstack([wv1.to_vector(word) for word in words])
    vectors2 = np.vstack([wv2.to_vector(word) for word in words])

    # Only the right-singular vectors are needed to choose the plot axes.
    _, _, Vt = np.linalg.svd(np.concatenate([vectors1, vectors2], axis=0), full_matrices=False)
    axis_ranks = np.argsort(Vt[0])
    ax1, ax2 = axis_ranks[-1], axis_ranks[-2]
    print(ax1, ax2)

    xs1, ys1 = vectors1[:, ax1], vectors1[:, ax2]
    xs2, ys2 = vectors2[:, ax1], vectors2[:, ax2]
    plot1 = plt.scatter(xs1, ys1, marker='o', color='#8899ff', s=30, label='before retrofitting')
    plot2 = plt.scatter(xs2, ys2, marker='s', color='#338833', s=30, label='after retrofitting')
    plt.legend(handles=[plot1, plot2])

    # Only the "after" points are labelled, to keep the figure readable.
    for i, word in enumerate(words):
        plt.annotate(
            word, xy=(xs2[i], ys2[i]), xytext=(-3, 2),
            textcoords='offset points', ha='right', va='bottom',
            color='#333333'
        )

    # quiver takes whole coordinate arrays, so a single call draws every arrow.
    # It used to sit inside the loop above and redrew all arrows once per word.
    # Arrows start 5% of the way along and span 90% of the displacement, so they
    # do not touch the markers.
    plt.quiver(
        xs1 * .95 + xs2 * .05, ys1 * .95 + ys2 * .05, (xs2 - xs1) * .9, (ys2 - ys1) * .9,
        scale_units='xy', angles='xy', scale=1,
        width=.0005, headwidth=20, headlength=20, color='#777777'
    )
    return plt

In [None]:
# Plot the example words before (glove) and after (cnglove) retrofitting.
# scatter_words returns the pyplot module itself, so the imported `plt` is used
# directly rather than rebinding that name to the function's return value.
scatter_words(glove, cnglove, example_words)
plt.xlim(-0.20, 0.05)
plt.ylim(-0.20, 0.05)
# NOTE(review): scatter_words picks its axes by their weight in the first singular
# vector, which is not necessarily per-feature variance — confirm these labels.
plt.xlabel("Feature with most variance")
plt.ylabel("Feature with second-most variance")
# Render the figure. The previous `plt.plot()` with no data drew nothing and left an
# empty list as the cell output.
plt.show()