In [44]:
import string
import utils
import random
import numpy as np
import matplotlib.pyplot as plt
import pickle
from scipy import optimize
from utils import random_idx
from utils import utils
from utils import lang_vectors_utils as lvu
%matplotlib inline

# Symbol inventory for the letter index vectors. '_' and '#' are the
# word-boundary markers that wicklefeaturize() wraps around every word, so they
# must be part of the alphabet alongside the letters and the space.
# string.ascii_lowercase is used because string.lowercase is locale-dependent
# and does not exist on Python 3.
alphabet = string.ascii_lowercase + ' _#'
# k and N are passed straight to random_idx.generate_letter_id_vectors; N is
# also the length of every feature vector built below.
# NOTE(review): k is presumably the number of non-zero entries per index
# vector -- confirm against random_idx.
k = 5000
N = 10000

cluster_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
ordered = 1
# NOTE(review): these vectors are regenerated on every kernel start; features
# pickled in an earlier session will not match them unless the generator is
# seeded -- confirm how random_idx draws its random numbers.
RI_letters = random_idx.generate_letter_id_vectors(N, k, alphabet)
lower_n_cutoff = .85

def read_examples(filepath):
    """Return the set of distinct whitespace-separated tokens in a text file.

    filepath -- path of the file to read (opened in text mode).

    Every line is split on whitespace; duplicates collapse because the result
    is a set. An empty file yields an empty set.
    """
    with open(filepath, "r") as source:
        return {token for row in source for token in row.split()}

def wicklefeaturize(past_tense_word, cluster_size, filepath="wickle_train/"):
    """
    Build the wicklefeature matrix and vector of a word and pickle the vector.

    The word is wrapped in the boundary markers "_" (start) and "#" (end) and
    cut into every overlapping window of `cluster_size` characters. For "jump"
    with cluster_size 3 that is:

    _ j u
    j u m
    u m p
    m p #

    Each window is encoded with random_idx.id_vector and stored as one row of
    the matrix; the rows are summed into a single length-N vector, which is
    pickled to `filepath + past_tense_word`.

    past_tense_word -- the word to encode; also used as the output file name.
    cluster_size    -- window length in characters (callers in this notebook
                       pass 3).
    filepath        -- prefix prepended to the file name, so a directory must
                       end with its separator.

    Returns (wicklefeatures, wicklefeature): the (n_windows, N) matrix and the
    summed length-N vector.
    """
    word = "_" + past_tense_word + "#"
    # A string of length L holds L - cluster_size + 1 windows. The original
    # count omitted the +1 and dropped the final window ("mp#" above).
    # Clamp at zero so a window longer than the padded word gives no rows.
    n_windows = max(len(word) - cluster_size + 1, 0)
    wicklefeatures = np.zeros((n_windows, N))
    for i in range(n_windows):
        ngram = word[i:i+cluster_size]
        wicklefeatures[i,:] = random_idx.id_vector(N, ngram, alphabet, RI_letters, ordered)
    wicklefeature = np.sum(wicklefeatures, axis=0)
    # Context manager so the handle is flushed and closed even if dump fails.
    with open(filepath+past_tense_word, "wb") as out:
        pickle.dump(wicklefeature, out)
    return wicklefeatures, wicklefeature
   

    
def get_training_set(set_size, folderpath="wickle_train/", verbtype="ed",filepath="ed.txt"):
    """Load the pickled feature vectors of a random sample of words.

    set_size   -- number of words to draw (without replacement).
    folderpath -- directory prefix holding the word list and feature folders.
    verbtype   -- sub-folder of `folderpath` containing one pickle per word.
    filepath   -- name of the word-list file inside `folderpath`.

    Returns a (set_size, N) array with one feature vector per row.
    Raises ValueError (from random.sample) if the list has fewer than
    `set_size` distinct words.
    """
    words = read_examples(folderpath + filepath)
    # random.sample needs a sequence (newer Pythons reject sets). Sorting
    # first also makes the draw reproducible under a fixed random seed.
    chosen = random.sample(sorted(words), set_size)
    X = np.zeros((set_size,N))
    for i, word in enumerate(chosen):
        # Binary mode matches the "wb" used when the vectors were written.
        # NOTE: pickle.load can execute arbitrary code; only load files this
        # project produced itself.
        with open(folderpath+verbtype+"/"+word, "rb") as source:
            X[i,:] = pickle.load(source)
    return X
    
def get_test_set(filepath):
    """Unfinished placeholder for loading a test set.

    Only creates two empty local lists and falls off the end, so it returns
    None; `filepath` is not read yet.
    """
    # Y: wicklefeatures of the test set
    # y_labels: 0 for not -ed verb, 1 for -ed verb
    Y, y_labels = [], []

In [45]:
# stage 1. just knowing
def typed_training_set():
    """Featurize every simple-past word (all verb types) into wickle_train/typed/.

    Both word lists are read, but only the past-tense list is featurized; the
    present-tense list is loaded and not used further.
    """
    # all types
    present_forms = read_examples("wickle_train/simple_present.txt")
    past_forms = read_examples("wickle_train/simple_past.txt")

    for verb in past_forms:
        wicklefeaturize(verb, cluster_size=3, filepath="wickle_train/typed/")

def training_set(folderpath="wickle_train/", verbtype="ed", filepath="ed.txt"):
    """Featurize every word of a word list into `folderpath + verbtype + "/"`.

    folderpath -- directory prefix holding the word list and output folders.
    verbtype   -- name of the output sub-folder (one pickle per word).
    filepath   -- name of the word-list file inside `folderpath`.

    Each word is encoded as trigrams (cluster size 3) by wicklefeaturize.
    """
    # Write under `folderpath` rather than a hard-coded "wickle_train/", so the
    # files land where get_training_set(folderpath, verbtype) looks for them.
    output_dir = folderpath + verbtype + "/"
    for word in read_examples(folderpath + filepath):
        wicklefeaturize(word, 3, output_dir)

def ed_training_set(folderpath="wickle_train/", verbtype="ed", filepath="ed.txt"):
    """Featurize the "-ed" word list; a thin wrapper that forwards to training_set."""
    training_set(folderpath=folderpath, verbtype=verbtype, filepath=filepath)
    
def bad_examples_training_set(folderpath="wickle_train/", verbtype="non", filepath="nouns5499.txt"):
    """Featurize the negative examples (words that aren't verbs) via training_set."""
    training_set(folderpath=folderpath, verbtype=verbtype, filepath=filepath)

In [46]:
# stage 2. initial prediction
# https://github.com/stephencwelch/Neural-Networks-Demystified
#New complete class, with changes:
class Neural_Network(object):
    """Fully connected network with one sigmoid hidden layer and a sigmoid output.

    Trained on a squared-error cost with L2 weight regularization. Inputs are
    (n_examples, inputLayerSize) arrays and targets are
    (n_examples, outputLayerSize) arrays.
    """

    def __init__(self, Lambda=0, inputLayerSize=2, hiddenLayerSize=3, outputLayerSize=1):
        """Create the network with standard-normal random weights.

        Lambda          -- L2 regularization strength.
        inputLayerSize  -- number of features per example.
        hiddenLayerSize -- number of hidden units.
        outputLayerSize -- number of outputs per example.

        The size defaults reproduce the previously hard-coded 2-3-1 layout.
        """
        #Define Hyperparameters
        self.inputLayerSize = inputLayerSize
        self.outputLayerSize = outputLayerSize
        self.hiddenLayerSize = hiddenLayerSize

        #Weights (parameters)
        self.W1 = np.random.randn(self.inputLayerSize,self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize,self.outputLayerSize)

        #Regularization Parameter:
        self.Lambda = Lambda

    def forward(self, X):
        """Propagate inputs through the network and return the predictions.

        Stores the intermediate values z2, a2 and z3 on the instance because
        costFunctionPrime reuses them.
        """
        self.z2 = np.dot(X, self.W1)
        self.a2 = self.sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        yHat = self.sigmoid(self.z3)
        return yHat

    def sigmoid(self, z):
        """Apply the sigmoid 1/(1+exp(-z)) to a scalar, vector, or matrix."""
        # Written as exp(-log(1+exp(-z))): logaddexp does not overflow for
        # large |z|, whereas exp(-z) overflows for very negative z.
        return np.exp(-np.logaddexp(0, -z))

    def sigmoidPrime(self,z):
        """Gradient of the sigmoid, computed as s*(1-s)."""
        # The quotient form exp(-z)/(1+exp(-z))**2 evaluates to inf/inf = nan
        # for very negative z; s*(1-s) stays finite everywhere.
        s = self.sigmoid(z)
        return s*(1-s)

    def costFunction(self, X, y):
        """Return the scalar cost for X, y using the weights stored on the instance."""
        self.yHat = self.forward(X)
        # np.sum reduces over every element and returns a scalar; the built-in
        # sum only folds the first axis and produced a length-1 array.
        data_term = 0.5*np.sum((y-self.yHat)**2)/X.shape[0]
        # 0.5*Lambda avoids integer division when Lambda is an int on Python 2.
        reg_term = 0.5*self.Lambda*(np.sum(self.W1**2)+np.sum(self.W2**2))
        return data_term + reg_term

    def costFunctionPrime(self, X, y):
        """Return (dJdW1, dJdW2), the cost gradients with respect to W1 and W2."""
        self.yHat = self.forward(X)

        delta3 = np.multiply(-(y-self.yHat), self.sigmoidPrime(self.z3))
        #Add gradient of regularization term:
        dJdW2 = np.dot(self.a2.T, delta3)/X.shape[0] + self.Lambda*self.W2

        delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
        #Add gradient of regularization term:
        dJdW1 = np.dot(X.T, delta2)/X.shape[0] + self.Lambda*self.W1

        return dJdW1, dJdW2

    #Helper functions for interacting with other methods/classes
    def getParams(self):
        """Return W1 and W2 flattened and concatenated into one vector."""
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params

    def setParams(self, params):
        """Set W1 and W2 from a single vector laid out as getParams returns it."""
        W1_start = 0
        W1_end = self.hiddenLayerSize*self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], \
                             (self.inputLayerSize, self.hiddenLayerSize))
        W2_end = W1_end + self.hiddenLayerSize*self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end], \
                             (self.hiddenLayerSize, self.outputLayerSize))

    def computeGradients(self, X, y):
        """Return the gradients flattened in the same order as getParams."""
        dJdW1, dJdW2 = self.costFunctionPrime(X, y)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))
    
##Need to modify trainer class a bit to check testing error during training:
class trainer(object):
    """Fit a network's weights with scipy.optimize.minimize and record the cost.

    The network must provide getParams, setParams, costFunction and
    computeGradients, as Neural_Network does.
    """

    def __init__(self, N):
        """Keep a reference to the network to train (weights are updated in place)."""
        #Make Local reference to network:
        self.N = N

    def callbackF(self, params):
        """Per-iteration optimizer callback: record training (and test) cost."""
        self.N.setParams(params)
        self.J.append(self.N.costFunction(self.X, self.y))
        # The test curve is only tracked when train() was given a test set.
        if self.testX is not None:
            self.testJ.append(self.N.costFunction(self.testX, self.testY))

    def costFunctionWrapper(self, params, X, y):
        """Objective for minimize(jac=True): returns (cost, gradient) at `params`."""
        self.N.setParams(params)
        cost = self.N.costFunction(X, y)
        grad = self.N.computeGradients(X,y)

        return cost, grad

    def train(self, trainX, trainY, testX=None, testY=None, method='BFGS', maxiter=200):
        """Optimize the network weights on (trainX, trainY).

        trainX, trainY -- training inputs and targets.
        testX, testY   -- optional held-out data; when given, its cost after
                          every iteration is appended to self.testJ.
        method         -- scipy.optimize.minimize method name. The default
                          'BFGS' keeps a dense n_params x n_params matrix, which
                          is impractical for tens of thousands of weights; a
                          limited-memory method such as 'L-BFGS-B' avoids that.
        maxiter        -- iteration cap handed to the optimizer.

        After the call self.J (and self.testJ) hold the cost history and
        self.optimizationResults holds the scipy result object.
        Raises ValueError if only one of testX / testY is supplied.
        """
        if (testX is None) != (testY is None):
            raise ValueError("testX and testY must be given together")

        #Make an internal variable for the callback function:
        self.X = trainX
        self.y = trainY

        self.testX = testX
        self.testY = testY

        #Make empty list to store training costs:
        self.J = []
        self.testJ = []

        params0 = self.N.getParams()

        options = {'maxiter': maxiter, 'disp' : True}
        _res = optimize.minimize(self.costFunctionWrapper, params0, jac=True, method=method, \
                                 args=(trainX, trainY), options=options, callback=self.callbackF)

        self.N.setParams(_res.x)
        self.optimizationResults = _res


In [52]:
# ed is type 1, non ed is type 0
# Only "-ed" examples are loaded here, so every row gets the label 1.
X = get_training_set(10, folderpath="wickle_train/", verbtype="ed", filepath="ed.txt")
y = np.ones((X.shape[0], 1))
print(X.shape)
print(y.shape)

(10, 10000)
(10, 1)


In [48]:
#Lambda: regularization parameter
NN = Neural_Network(Lambda=0.0001)
# The network is created with a 2-unit input layer, but every row of X has
# X.shape[1] features. Resize the first weight matrix so forward(X) is
# shape-compatible; setParams reads inputLayerSize, so it is updated too.
NN.inputLayerSize = X.shape[1]
NN.W1 = np.random.randn(NN.inputLayerSize, NN.hiddenLayerSize)

In [49]:
# Wrap the network in the trainer, which keeps a reference to NN.
T = trainer(NN)

In [50]:
# trainer.train takes training AND test arrays. No held-out split exists yet,
# so the training data is passed for both and T.testJ will simply mirror T.J.
# TODO: replace the second pair with a real test set once get_test_set is done.
# NOTE(review): with thousands of input features, BFGS's dense
# n_params x n_params matrix is the likely cause of the MemoryError below.
T.train(X, y, X, y)

MemoryError: 

In [None]:
# Training cost per optimizer iteration. The notebook imports matplotlib as
# `plt` (no %pylab), so the bare plot/grid/xlabel/ylabel names are undefined.
plt.plot(T.J)
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('Cost')

In [None]:
# Show the analytic gradients (dJdW1, dJdW2) for the training data.
NN.costFunctionPrime(X,y)

In [None]:
# Show the network's predictions for the training rows (compare with y below).
NN.forward(X)

In [None]:
# Show the target labels.
y

In [None]:
#Plot cost during training:
# Uses the `plt` namespace because the bare pylab names are not imported.
plt.plot(T.J, label='train')
plt.plot(T.testJ, label='test')
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.legend()

In [None]:
#Make sure our gradients are correct after making changes:
def computeNumericalGradient(net, X, y, e=1e-4):
    """Central-difference estimate of the cost gradient over all weights.

    net -- object with getParams / setParams / costFunction.
    e   -- perturbation added to and subtracted from one weight at a time.

    Returns a vector laid out like net.getParams(); the network's weights are
    restored before returning. Costs two cost evaluations per weight.
    """
    params_initial = net.getParams()
    numgrad = np.zeros(params_initial.shape)
    perturb = np.zeros(params_initial.shape)
    for p in range(len(params_initial)):
        perturb[p] = e
        net.setParams(params_initial + perturb)
        loss_up = net.costFunction(X, y)
        net.setParams(params_initial - perturb)
        loss_down = net.costFunction(X, y)
        # np.sum collapses a length-1 cost array to a scalar.
        numgrad[p] = np.sum(loss_up - loss_down) / (2 * e)
        perturb[p] = 0
    net.setParams(params_initial)
    return numgrad

numgrad = computeNumericalGradient(NN, X, y)
grad = NN.computeGradients(X,y)

In [None]:
#Should be less than 1e-8:
# `norm` is not imported on its own in this notebook; use numpy's.
np.linalg.norm(grad-numgrad)/np.linalg.norm(grad+numgrad)

In [4]:


"""

# test set is now different
# someone clean up test_tense.txt.....
test_simple_past = read_examples("test_simple_past.txt")
test_tense = read_examples("test_tense.txt")

test_simple_present = read_examples("test_simple_present.txt")
test_simple_past = read_examples("test_simple_past.txt")


# stage 2. predicting tense (regularized, less correct)

http://www.myenglishteacher.net/irregular_verbs.html
given these few examples, basically 1 example for every rule,
generate the past tense for a test word
test_tense.txt
frequency isn't relevant here because all verbs have frequency ~1

n = len(simple_past)
d = 10000
mu, var = 0, .015
noise = np.random.normal(mu, sqrt(var), (n,d))

# need the reference for the past tense part of every word argh
for i in range(n):
    word_hypervec(simple_present[i], alphabet, d)
    word_hypervec(simple_past[i], alphabet, d)
# model W = weight vectors that represent the part of the word that makes the tense pattern

# stage 3. predicting tense, usually correct

pg 8 McClelland
feature vectors by one hot encoding.
how to determine if a verb is irregular or regular. it seems like a specific mapping for irregular. 
other than -ed and tense endings, what kind of patterns are we using to estimate a verb to a tense?



are we running linear regression where the elements of weight vector (every type of tense) determined by gaussians?
what would an activation/threshold be?
pg 9

pg 11: sounds like we're gradient descending

One hot encoding means can only store N linearly independent sets of patterns. can introduce noise from a gaussian 
hyperparameters mu = 0 and variance to have each set represent a general rule so can store more patterns. 
rule of 78 wat.
pg 14

enforce logarithmic growth for adding patterns of regular verbs
pg 15

A scheme which meets the first criterion, but not the second, is the scheme proposed by Wickelgren (1969).
He suggested that words
should be represented as sequences of context-sensitive phoneme units
which represent each phone in a word as a triple, consisting of the phone itself, its predecessor, and its successor.
Notationally, we write each Wickelphone as a triple of phonemes, consisting of the central phoneme,
subscripted on the left by its predecessor and on the right by its successor. 
A phoneme occurring at the beginning of a word is preceded by a special symbol (#) standing for the word boundary; 
pg 18

Though the Wickelphones in a word are not strictly position specific , 
it turns out that (a) few words contain more than one occurrence of any given Wickelphone, 
and (b) there are no two words we know of that consist of the same sequence of Wickelphones. 
For example, /slit/ and /silt/ contain no Wickelphones in common.
One nice property of Wickelphones is that they capture enough of the context in which a phoneme 
occurs to provide a sufficient basis for differentiating between the different cases of the 
past-tense rule and for characterizing the contextual variables that determine the subregularities 
among the irregular past-tense verbs. For example, it is the word-final phoneme that 
determines whether we should add /d/, /t/ or /^d/ in forming the regular past. 
And it is the sequence iN # which transformed to aN # in the ing ang pattern found in words like sing.
The trouble with the Wickelphone solution is that there are too many of them, and they are too specific
pg 19

for neural net:
activation function: sigmoid, wickelphone, relu

- That the model captures the basic three-stage pattern of acquisition.
- That the model captures most aspects of differences in performance on different types of regular and irregular verbs.
- That the model is capable of responding appropriately to verbs it has never seen before, as well as to regular and irregular verbs actually experienced during training.
The more frequent a verb is, the more likely it is to be a regular verb. 
pg 25
Divide irregular verbs into 9 classes:

Verbs that do not change at all to form the past tense.

Verbs that change a final /d/ to /t/ to form the past tense.

Verbs that undergo an internal vowel change and also add a final /t/ or /d/.

Verbs that undergo an internal vowel change, delete a final consonant, and add a final /t/ or /d/.

Verbs that undergo an internal vowel change whose stems end in a dental.

Verbs that undergo an internal vowel change of /i/ or /a/ to /^/.

Verbs that undergo an internal vowel change of /i/ to /a/.

All other verbs that undergo an internal vowel change.

All verbs that undergo a vowel change and that end in a diphthongal sequence.

Divide regular verbs into 3 categories: 
those ending in a vowel or voiced consonant, which take a /d/ to form the past tense
those ending in a voiceless consonant, which take a /t/
those ending in /t/ or /d/, which take a final /^d/ to form the past tense

The results show clearly the same patterns evident in the Bybee and Slobin data. Verbs ending in t/d always show a stronger no-change response and a weaker regularized response than those not ending in t/d. During the very early stages of learning, however, the regularized response is stronger than the no-change response, even if the verb does end with t/d. This suggests that the generalization that the past tense of t/d verbs is formed by adding /^d/ is stronger than the generalization that verbs ending in t/d should not have an ending added. However, as learning proceeds, this secondary generalization is made (though for only a subset of the t/d verbs, as we shall see), and the simulation shows the same interaction that Bybee and Slobin (1982) found in their preschoolers.

 pg 35

Erroneous no-change responses are clearly stronger for both regular

and irregular t/d verbs. that the erroneous no-change responses are stronger for the t/d verbs than for the other types of irregular verbs

 pg 36



Type 1. to have the least errors of irregular verbs

Probability of regularization: (base+ed + past+ed) / (base+ed + past+ed + correct)



Model should be sensitive to word frequency. guessing that we’re doing some lda shit haha. Their model was always given the present and past tenses together.

pg 41



pg 42 : Intuition on verb patterns. too much to write lmao

pg 43

pg 45: response as in what kind of pattern will the past tense version of the verb take. response strength as in frequency of this specific transformation (kind of pattern)

pg 46: Examine model’s performance by

Overall degree of transfer: how accurately the model generates the correct features of the new verbs.

 Unconstrained responses: model should not try out a certain set of past tenses for every verb, but should actually estimate the correct tense from a set of all tense patterns (so word patterns not necessarily tenses/ tenses of that word) ???












"""



['arose', 'awoke', 'was', 'bore', 'beat', 'became', 'began', 'bent', 'bet', 'bit', 'bled', 'blew', 'broke', 'brought', 'built', 'burned', 'burst', 'bought', 'caught', 'chose', 'clung', 'came', 'cost', 'crept', 'cut', 'dealt', 'dug', 'dived', 'did', 'drew', 'dreamed', 'drank', 'drove', 'ate', 'fell', 'fed', 'felt', 'fought', 'found', 'fitted', 'fled', 'flung', 'flew', 'forbade', 'forgot', 'forgave', 'forwent', 'froze', 'got', 'gave', 'went', 'ground', 'grew', 'hanged', 'had', 'heard', 'hid', 'hit', 'held', 'hurt', 'kept', 'kneeled', 'knitted', 'knew', 'laid', 'led', 'leaped', 'left', 'lent', 'let', 'lay', 'lighted', 'lost', 'made', 'meant', 'met', 'paid', 'proved', 'put', 'quit', 'read', 'rode', 'rang', 'rose', 'ran', 'sawed', 'said', 'saw', 'sought', 'sold', 'sent', 'set', 'sewed', 'shook', 'shaved', 'sheared', 'shined', 'shot', 'showed', 'shrank', 'shut', 'sang', 'sank', 'sat', 'slew', 'slept', 'slid', 'sneaked', 'spoke', 'sped', 'spent', 'spilled', 'spun', 'spat', 'split', 'spread', 

ValueError: could not broadcast input array from shape (10000) into shape (3)