In [8]:
import string
import utils
import random
import scipy
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import pickle

from scipy import optimize
from utils import random_idx
from utils import utils
from utils import lang_vectors_utils as lvu
from utils import verb_mappings as vm
%matplotlib inline

# --- Experiment configuration -------------------------------------------
# NOTE(review): k is only passed to random_idx.generate_letter_id_vectors;
# presumably a sparsity parameter of the letter vectors -- confirm.
k = 500
# Length of every word vector (used as the second dimension of the
# pasts / presents / mapping_vecs arrays).
N = 1000
NUM_CLASSES = 2
# NOTE(review): presumably the n-gram size (3 => trigrams, matching the
# narrative in the cells that follow) -- confirm against vm.word_vec.
cluster_size = 3
# NOTE(review): presumably a flag selecting order-sensitive n-gram encoding
# -- confirm against vm.word_vec.
ordered = 1
# Use ascii_lowercase rather than string.lowercase: the latter is
# locale-dependent in Python 2 and does not exist in Python 3, whereas
# ascii_lowercase is always exactly 'a'..'z' in both.
alphabet = string.ascii_lowercase + ' '
# Per-letter index vectors for every symbol in `alphabet`; passed to
# vm.word_vec for each word encoded later in the notebook.
# NOTE(review): the name suggests these are randomly generated, but no seed is
# set anywhere in view -- if so, results will differ between kernel runs.
RI_letters = random_idx.generate_letter_id_vectors(N, k, alphabet)

def read_examples(filepath):
    """Return every whitespace-separated token in the file, in file order.

    Args:
        filepath: path of a text file to open for reading.

    Returns:
        A flat list of strings: the tokens of the first line, followed by
        the tokens of the second line, and so on. Blank lines contribute
        nothing.
    """
    with open(filepath, "r") as handle:
        return [token for row in handle for token in row.split()]

# File names of the available -ed verb sets. The label files use exactly the
# same names, so the second list is an independent copy of the first.
edsets = [
    "ed.txt",
    "ed10000.txt",
    "ed5000.txt",
    "ed3000.txt",
    "ed1000.txt",
]
edlabels = list(edsets)
# NOTE: the "-ed" verb files actually contain both -d and -ed verbs in the
# same bag of words; they are deliberately kept as a single class.

In [9]:
# only working with -ed verbs for now
# Both files are flat whitespace-separated word lists. Later cells pair them
# purely by position: present[i] is treated as the source form of past[i].
past = read_examples("wickle_train/edsets/ed1000.txt")
present = read_examples("wickle_train/edlabels/ed1000.txt")

# Fail fast if the two lists are misaligned. Everything downstream sizes its
# arrays and loops with len(past), so a longer `present` would be silently
# truncated and a shorter one would only surface later as an IndexError.
assert len(past) == len(present), (
    "past/present word lists differ in length (%d vs %d)"
    % (len(past), len(present))
)

In [10]:
"""
Mapping the present-tense verbs to past tense using
high-D vector arithmetic/algebra (rather than a neural
net) will take some experimenting because the simplest
idea is sure to fail.  First we make a Word Vector for
each basic verb and each past verb.  Word vectors are
hypervectors, and they are encoded as a sum of their
trigrams.  Thus a word vector is just like a language
vector of yore, with a word of L letters + two spaces
yielding L trigrams.

Now here is the simple idea (the one that will fail in
the end): form a mapping vector from a present-tense
verb to a past-tense verb by multiplying the two word
vectors.  This same mapping vector will turn the
past-tense verb to the present-tense verb.  These
mappings are not exact because the word vectors are
not binary +1s and -1s.

The next step is to add all the present-tense word
vectors into a single vector, call it the Present
Vector, and all the past-tense word vectors into a
single Past Vector, and then multiply the two.  That
will be the master mapping vector with which to start
experimenting (and it is the one that will have
problems).
"""
# One row per verb pair; each row is a length-N word vector.
num_pairs = len(past)
vec_shape = (num_pairs, N)
pasts = np.zeros(vec_shape)
presents = np.zeros(vec_shape)
mapping_vecs = np.zeros(vec_shape)

for row in range(num_pairs):
    # Encode the past-tense word first, then its present-tense partner, then
    # derive the pair's individual mapping vector from the two stored rows.
    pasts[row] = vm.word_vec(
        past[row], alphabet, RI_letters, cluster_size, N, ordered)
    presents[row] = vm.word_vec(
        present[row], alphabet, RI_letters, cluster_size, N, ordered)
    mapping_vecs[row] = vm.mapping_vec(presents[row], pasts[row])

# Superpose all word vectors of each tense (column-wise sum over the rows),
# then build the single master mapping from the two aggregate vectors.
past_tense_vec = pasts.sum(axis=0)
present_tense_vec = presents.sum(axis=0)
master = vm.master_map(present_tense_vec, past_tense_vec)

In [16]:
"""
Test number 1: see if the word vector for a
present-tense verb, when multiplied by the master
mapping vector, produces anything like the word vector
for the same word in the past tense.  Sometimes it
works and sometimes it fails.  We can then start
looking at when and how it fails, and try to fix the
problem.  There are lots of tricks in our bag to fix
it with!
"""
# Score each verb: multiply its present-tense vector by the master mapping to
# get an estimate of the past-tense vector (pastHat), then take the dot
# product of that estimate with the true past-tense vector.
dot_prods = np.zeros(len(past))
for i in range(len(past)):
    pastHat = presents[i]*master
    dot_prods[i] = pasts[i].dot(pastHat)
# Standardise the raw scores across the sample (zero mean, unit variance).
dot_prods = (dot_prods - np.mean(dot_prods))/float(np.std(dot_prods))

# NOTE(review): because the scores were just centred on their own mean,
# `dot_prods > 0` means "above the sample average", so this "accuracy" is the
# fraction of above-average pairs rather than a measure of correct mappings.
# Consider a per-pair normalisation (e.g. cosine similarity) compared against
# a chance baseline instead -- left unchanged here so downstream values of
# dot_prods / accuracies / accuracy stay the same.
accuracies = np.where(dot_prods>0, 1, 0)
accuracy = np.sum(accuracies)/float(len(past))

# Single-argument print(...) calls behave the same under Python 2 and 3
# (the original mixed a Python 2 print statement with tuple printing).
print("accuracy: %s" % accuracy)
# Header lists exactly the three columns printed per row; pastHat itself is a
# length-N vector and is not printed.
print("present, past, similarity")
for i in range(len(past)):
    print("%s, %s, %s" % (present[i], past[i], dot_prods[i]))

('accuracy', 0.437)
present, past, pastHat, similarity
('wist', 'wisted', -1.1258142992267717)
('soberiz', 'soberized', 0.032232009311273895)
('revegetat', 'revegetated', 1.2724957547803479)
('finess', 'finessed', -0.62057427505250484)
('impersonaliz', 'impersonalized', 2.114444697028707)
('cuss', 'cussed', -1.3037646364975732)
('overag', 'overaged', -0.3109877738272353)
('retailor', 'retailored', 0.74694334911814908)
('barrag', 'barraged', -0.23890024794850251)
('clott', 'clotted', -0.71653296713353287)
('nonlead', 'nonleaded', -0.0056613505193472127)
('disclaim', 'disclaimed', 0.48727682957335411)
('knarr', 'knarred', -0.73762811338148737)
('sparkplugg', 'sparkplugged', 1.2971380727363866)
('demount', 'demounted', 0.66560663536280285)
('croon', 'crooned', -0.65555246945199586)
('mutini', 'mutinied', -0.35222733416919416)
('pigg', 'pigged', -1.2919043574241886)
('attract', 'attracted', 0.075214814271957711)
('flench', 'flenched', -0.7787870445606796)
('institutionalis', 'institutional