In [None]:
import numpy as np
import theano
import theano.tensor as T
from theano import shared 
from collections import OrderedDict

# Float type used for every array and shared variable below; taken from Theano's floatX setting.
dtype=T.config.floatX

In [2]:
def sample_weights(sizeX, sizeY):
    """Create a Theano shared weight matrix of shape (sizeX, sizeY).

    Entries are drawn uniformly from [-1, 1) and the matrix is then divided
    by its first (largest) singular value before being cast to ``dtype``.
    """
    raw = np.random.uniform(low=-1., high=1., size=(sizeX, sizeY))
    singular_values = np.linalg.svd(raw)[1]
    scaled = np.asarray(raw / singular_values[0], dtype=dtype)
    return shared(scaled, borrow=True)

class LSTM:
    """Single-layer LSTM with cell-to-gate connections and a sigmoid read-out.

    Building an instance creates every parameter as a Theano shared variable,
    runs the recurrence over one input sequence with ``theano.scan`` and
    compiles two callables:

    * ``train(X, Y)`` -- returns the mean binary cross-entropy between the
      per-step outputs and ``Y`` and applies one gradient-descent update
      (learning rate 0.1) to all parameters.
    * ``pred(X)`` -- returns the per-step outputs for ``X``.

    ``X`` and ``Y`` are matrices with one row per time step (``n_in`` and
    ``n_out`` columns respectively); a single sequence is processed per call.
    """

    def __init__(self, n_in, n_lstm, n_out):
        """Create the parameters, build the symbolic graph and compile train/pred.

        n_in   -- size of each input vector
        n_lstm -- number of LSTM units (size of hidden and cell state)
        n_out  -- size of each output vector
        """
        self.n_in = n_in
        self.n_lstm = n_lstm
        self.n_out = n_out

        # Input gate: input, hidden-state and cell-state weights plus bias.
        self.W_xi = sample_weights(n_in, n_lstm)
        self.W_hi = sample_weights(n_lstm, n_lstm)
        self.W_ci = sample_weights(n_lstm, n_lstm)
        self.b_i = shared(np.asarray(np.random.uniform(-0.5, .5, size=n_lstm), dtype=dtype))
        # Forget gate; its bias is drawn from [0, 1).
        self.W_xf = sample_weights(n_in, n_lstm)
        self.W_hf = sample_weights(n_lstm, n_lstm)
        self.W_cf = sample_weights(n_lstm, n_lstm)
        self.b_f = shared(np.asarray(np.random.uniform(0, 1., size=n_lstm), dtype=dtype))
        # Candidate cell update (no cell-state weights).
        self.W_xc = sample_weights(n_in, n_lstm)
        self.W_hc = sample_weights(n_lstm, n_lstm)
        self.b_c = shared(np.zeros(n_lstm, dtype=dtype))
        # Output gate.
        self.W_xo = sample_weights(n_in, n_lstm)
        self.W_ho = sample_weights(n_lstm, n_lstm)
        self.W_co = sample_weights(n_lstm, n_lstm)
        self.b_o = shared(np.asarray(np.random.uniform(-0.5, .5, size=n_lstm), dtype=dtype))
        # Read-out layer.
        self.W_hy = sample_weights(n_lstm, n_out)
        self.b_y = shared(np.zeros(n_out, dtype=dtype))

        # Every trainable parameter exactly once.  W_xo must be listed here,
        # otherwise the output gate's input weights never receive an update.
        self.params = [self.W_xi, self.W_hi, self.W_ci, self.b_i,
                       self.W_xf, self.W_hf, self.W_cf, self.b_f,
                       self.W_xc, self.W_hc, self.b_c,
                       self.W_xo, self.W_ho, self.W_co, self.b_o,
                       self.W_hy, self.b_y]

        # Symbolic inputs are kept on the instance so that create_train /
        # create_pred can compile functions against the same graph.
        self.X = T.matrix()  # input sequence, one row per time step
        self.Y = T.matrix()  # target sequence, one row per time step
        h0 = shared(np.zeros(self.n_lstm, dtype=dtype))  # initial hidden state
        c0 = shared(np.zeros(self.n_lstm, dtype=dtype))  # initial cell state

        # h and c are fed back into the next step; y is only collected (None).
        [_h_vals, _c_vals, y_vals], _ = theano.scan(fn=self._step_lstm,
                                                    sequences=self.X,
                                                    outputs_info=[h0, c0, None])

        self.output = y_vals

        # Mean binary cross-entropy over all time steps and output units.
        self.cost = -T.mean(self.Y * T.log(y_vals) + (1. - self.Y) * T.log(1. - y_vals))
        lr = shared(np.asarray(0.1, dtype=dtype))

        self.train = self.create_train(self.cost, lr)
        self.pred = self.create_pred()

    def _step_lstm(self, x_t, h_tm1, c_tm1):
        """One recurrence step, called by ``theano.scan``.

        x_t is the current input row, h_tm1 / c_tm1 the previous hidden and
        cell state.  Returns ``[h_t, c_t, y_t]``.
        """
        i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci) + self.b_i)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        # The output gate looks at the freshly updated cell state c_t.
        o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_t, self.W_co) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = T.nnet.sigmoid(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    def create_train(self, cost, lr):
        """Compile a gradient-descent training function.

        cost -- symbolic scalar built from ``self.X`` / ``self.Y``
                (for instance ``self.cost``)
        lr   -- learning rate (number or shared variable)

        The returned function takes ``(X, Y)``, returns ``cost`` and updates
        every parameter in ``self.params`` by ``param - lr * gradient``.
        """
        gparams = T.grad(cost, self.params)
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * lr
        return theano.function(inputs=[self.X, self.Y], outputs=cost, updates=updates)

    def create_pred(self):
        """Compile a function mapping an input sequence ``X`` to the per-step outputs."""
        return theano.function(inputs=[self.X], outputs=self.output)


In [None]:
# Reber-grammar model: 7 input and 7 output units (one per symbol in 'BTSXPVE'), 50 LSTM units.
model = LSTM(7, 50, 7)

In [61]:
# NOTE(review): this cell's input looks truncated -- `mode` is not defined anywhere in the
# visible notebook, while the output below is a 7-column array. Presumably it was a
# `model.pred(...)` call; confirm and restore it.
mode

array([[ 0.44404912,  0.54889768,  0.50595653,  0.48602951,  0.52215582,
         0.41718704,  0.4979817 ],
       [ 0.44585481,  0.56249017,  0.48906904,  0.47611621,  0.55306828,
         0.43845311,  0.50738233],
       [ 0.44442102,  0.56151319,  0.49318475,  0.47775781,  0.55126017,
         0.43592644,  0.50856906],
       [ 0.44428554,  0.56243432,  0.49460772,  0.47738355,  0.55350947,
         0.43670017,  0.50743496],
       [ 0.4445456 ,  0.56140858,  0.49135998,  0.47786784,  0.5519433 ,
         0.43466848,  0.50850576],
       [ 0.4448041 ,  0.56337154,  0.49178609,  0.47771156,  0.55309385,
         0.43593186,  0.50804907],
       [ 0.44455197,  0.56388026,  0.49297667,  0.47771427,  0.55152595,
         0.43546441,  0.5080983 ],
       [ 0.44503286,  0.56241965,  0.49162626,  0.47813872,  0.55250764,
         0.43882495,  0.5069074 ],
       [ 0.44490701,  0.5625419 ,  0.49516577,  0.47700968,  0.55021876,
         0.43521693,  0.50866044],
       [ 0.44395417,  0.5613

In [25]:

# Random inputs of shape (100, 15, 10) and one-hot labels of shape (100, 5).
X = np.random.uniform(low=-0.1, high=0.1, size=(100,15,10)).astype(dtype=dtype)
indices = np.random.randint(5, size=(100))
Y = np.zeros(shape=(100,5)).astype(dtype=dtype)
# Set a single 1 per row, at the column drawn for that row.
Y[np.arange(Y.shape[0]), indices] = 1.

In [26]:
# NOTE: this call fails (see the TypeError below). The model's input is a T.matrix, i.e. one
# 2-D sequence per call, whereas X here is a 3-D array of shape (100, 15, 10).
model.train(X,Y)

TypeError: ('Bad input argument to theano function with name "<ipython-input-22-d65de749fb1e>:40"  at index 0(0-based)', 'Wrong number of dimensions: expected 2, got 3 with shape (100, 15, 10).')

In [32]:
nb_epochs = 100
# Plain SGD: every epoch draws len(train_data) examples uniformly at random
# (with replacement) and performs one parameter update per example.
for x in range(nb_epochs):
    error = 0.
    for j in range(len(train_data)):
        index = np.random.randint(0, len(train_data))
        i, o = train_data[index]
        train_cost = model.train(i, o)
        error += train_cost
    if x%10==0:
        # Summed training cost of this epoch, reported every 10 epochs.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("epoch "+str(x)+ " error: "+str(error))

epoch 0 error: 506.78519845
epoch 10 error: 21.6646672608
epoch 20 error: 4.38932591164
epoch 30 error: 1.8954052534
epoch 40 error: 1.23063871096
epoch 50 error: 1.38613807154
epoch 60 error: 0.705777432944
epoch 70 error: 0.568835470825
epoch 80 error: 0.47584372235
epoch 90 error: 0.409921698127


In [29]:
import numpy as np

# Alphabet of the Reber grammar; a symbol's position is its one-hot index.
chars = 'BTSXPVE'

# Transition table: graph[node] = [successor nodes, symbols emitted on those edges].
# Node 6 is the terminal state and has no row of its own.
graph = [
    [(1, 5), ('T', 'P')],
    [(1, 2), ('S', 'X')],
    [(3, 5), ('S', 'X')],
    [(6,), ('E')],  # note: ('E') is the plain string 'E', not a 1-tuple
    [(3, 2), ('V', 'P')],
    [(4, 5), ('V', 'T')],
]


def in_grammar(word):
    """Return True if ``word`` is a complete string of the Reber grammar.

    The word must start with 'B', follow the edges of ``graph`` symbol by
    symbol and finish in the terminal state (the state reached by 'E').
    Empty words, incomplete prefixes and words with symbols after the final
    'E' are rejected.
    """
    if not word or word[0] != 'B':
        return False
    terminal = len(graph)  # the terminal node has no row in the table
    node = 0
    for c in word[1:]:
        if node == terminal:
            # Nothing may follow the final 'E' (graph[terminal] does not exist).
            return False
        successors, symbols = graph[node]
        try:
            node = successors[symbols.index(c)]
        except ValueError:  # c is not a legal symbol in this state
            return False
    return node == terminal
      
def sequenceToWord(sequence):
    """
    Decode a sequence of one-hot vectors into the Reber string it spells.

    For every vector the first position equal to 1 selects the symbol in ``chars``.
    """
    return ''.join(chars[np.where(vec == 1.)[0][0]] for vec in sequence)
    
def generateSequences(minLength):
    """Random-walk the Reber graph until a long enough string is produced.

    Returns ``(inchars, outchars)``: ``inchars`` is the generated string as a
    list of symbols starting with 'B'; ``outchars[k]`` holds the symbols that
    were allowed at step ``k`` (the possible successors of ``inchars[k]``).
    Walks are repeated until ``len(inchars) > minLength``.
    """
    while True:
        state = 0
        symbols = ['B']
        candidates = []
        while state != 6:
            successors, labels = graph[state]
            pick = np.random.randint(0, len(successors))
            symbols.append(labels[pick])
            candidates.append(labels)
            state = successors[pick]
        if len(symbols) > minLength:
            return symbols, candidates


def get_one_example(minLength):
    """Generate one Reber string and encode it as (inputs, targets).

    Both results are lists of 7-dim vectors of equal length: ``inseq[k]`` is
    the one-hot code of the k-th symbol and ``outseq[k]`` marks every symbol
    that may follow it.  Because the two lists are zipped, the final symbol
    of the string does not appear in ``inseq``.
    """
    def mark(symbols):
        # Vector with a 1 at the index of each given symbol.
        vec = np.zeros(7)
        for sym in symbols:
            vec[chars.find(sym)] = 1.
        return vec

    inchars, outchars = generateSequences(minLength)
    inseq, outseq = [], []
    for current, allowed in zip(inchars, outchars):
        inseq.append(mark(current))
        outseq.append(mark(allowed))
    return inseq, outseq


def get_char_one_hot(char):
    """Return ``[vec]`` where ``vec`` is a 7-dim vector with a 1 for every symbol in ``char``.

    ``char`` may be a string or a tuple of symbols.  A symbol that is not in
    ``chars`` sets the last position, because ``str.find`` returns -1 for it.
    """
    positions = np.asarray([chars.find(sym) for sym in char], dtype=int)
    vec = np.zeros(7)
    vec[positions] = 1.
    return [vec]
    
def get_n_examples(n, minLength=10):
    """Return a list of ``n`` examples, each produced by ``get_one_example(minLength)``."""
    # range() instead of xrange(): identical result, and also runs on Python 3.
    return [get_one_example(minLength) for _ in range(n)]

# Symbols that can wrap an inner Reber string in the embedded task.
emb_chars = "TP"


def get_one_embedded_example(minLength=10):
    """Build one embedded-Reber example as (inputs, targets).

    An inner example from ``get_one_example`` is wrapped as
    ``B <emb> ...inner... E <emb>`` where ``<emb>`` is drawn at random from
    ``emb_chars``.  The targets are ``{T,P} B ...inner targets... <emb> E``,
    so the second-to-last target is the wrapping symbol itself.
    """
    inner_in, inner_out = get_one_example(minLength)
    emb_char = emb_chars[np.random.randint(0, len(emb_chars))]

    new_in = (get_char_one_hot(('B',))
              + get_char_one_hot((emb_char,))
              + inner_in
              + get_char_one_hot(('E',))
              + get_char_one_hot((emb_char,)))
    new_out = (get_char_one_hot(emb_chars)
               + get_char_one_hot('B',)
               + inner_out
               + get_char_one_hot((emb_char, ))
               + get_char_one_hot(('E',)))
    return new_in, new_out
    
def get_n_embedded_examples(n, minLength=10):
    """Return a list of ``n`` examples, each produced by ``get_one_embedded_example(minLength)``."""
    # range() instead of xrange(): identical result, and also runs on Python 3.
    return [get_one_embedded_example(minLength) for _ in range(n)]

In [30]:
# 1000 embedded-Reber (inputs, targets) pairs; the training loop samples from this list.
train_data = get_n_embedded_examples(1000)

In [33]:
test_data = get_n_embedded_examples(10)

def print_out(test_data):
    """Print target and prediction for the second-to-last step of every example.

    For each ``(inputs, targets)`` pair the global ``model`` is run on the
    inputs; the target vector at position -2 is printed, followed by a
    one-hot vector marking the arg-max of the model output at that position.
    """
    for i,o in test_data:
        p = model.pred(i)
        print(o[-2]) # target
        best = np.argmax(p[-2])  # computed once rather than once per element
        print(np.asarray([0. if x!=best else 1. for x in range(7)])) # prediction
        # print("") emits an empty line on both Python 2 and 3.
        print("")
print_out(test_data)

[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  0.  0.  0.  1.  0.  0.]

[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  0.  0.  0.  1.  0.  0.]

[ 0.  1.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  0.  0.  0.  0.]

[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  0.  0.  0.  1.  0.  0.]

[ 0.  1.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  0.  0.  0.  0.]

[ 0.  1.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  0.  0.  0.  0.]

[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  0.  0.  0.  1.  0.  0.]

[ 0.  1.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  0.  0.  0.  0.]

[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  0.  0.  0.  1.  0.  0.]

[ 0.  1.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  0.  0.  0.  0.]



In [67]:
# Number of time steps in the input sequence of each test example.
[len(x[0]) for x in test_data]

[14, 14, 16, 14, 20, 14, 14, 14, 19, 14]

In [17]:
# word prediction
import re
import random
import numpy as np
from gensim import corpora


def process(x):
    """Lower-case ``x`` and split it into word tokens.

    Every run of non-word characters (anything other than letters, digits
    and underscore) is treated as a separator.
    """
    # Raw string: '\W' is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning on recent Python versions.
    return re.sub(r'\W+', ' ', x).lower().split()


class Corpus:
    """Vocabulary plus tokenised sequences for the word-prediction experiment.

    Attributes:
        idx2word / word2idx -- the vocabulary in both directions; it always
                               contains the marker 'END'
        vocsize             -- number of vocabulary entries
        seq_x               -- one list of in-vocabulary tokens per input line
        matrix              -- result of the last ``to_numpy`` call
    """

    def __init__(self, seq_x=None, dic=None):
        """Build or adopt a vocabulary and tokenise ``seq_x``.

        seq_x -- optional iterable of text lines; each is tokenised with
                 ``process`` and reduced to the words present in the vocabulary
        dic   -- optional index -> word mapping.  When omitted, a gensim
                 dictionary is built from the notebook-level ``TextList`` and
                 ``TitleList`` (words seen in fewer than 10 documents are dropped).
        """
        self.seq_x = []
        self.seq_y = []
        self.matrix = []
        if dic is None:
            dictionary = corpora.Dictionary(process(line) for line in TextList + TitleList)
            dictionary.filter_extremes(no_below=10,no_above=1.0, keep_n=100000)
            dictionary.compactify()
            self.idx2word = {k:v for (k,v) in dictionary.items()}
            del dictionary
        else:
            # Work on a copy so the caller's mapping is never modified.
            self.idx2word = dict(dic)
        # Add the END marker once; a mapping taken from another Corpus already
        # has it, and a second entry would make word2idx['END'] ambiguous.
        if 'END' not in self.idx2word.values():
            self.idx2word[len(self.idx2word)] = 'END'
        self.word2idx = {v:k for (k,v) in self.idx2word.items()}
        self.vocsize = len(self.idx2word)

        if seq_x is not None:
            for line in seq_x:
                # A real list (not a lazy filter object) so len() and slicing
                # work on Python 3 as well.
                words = [w for w in process(line) if w in self.word2idx]
                self.seq_x.append(words)

    def to_numpy(self, seq_len=100):
        """Return an int32 matrix holding the first ``seq_len`` word indices of every long-enough sequence.

        Sequences with fewer than ``seq_len`` tokens are left out, so the
        matrix has one row per remaining sequence.  The result is also stored
        in ``self.matrix``.
        """
        correct_seqs = [seq for seq in self.seq_x if len(seq) >= seq_len]
        self.matrix = np.zeros(shape=(len(correct_seqs), seq_len), dtype='int32')
        for idx, seq in enumerate(correct_seqs):
            self.matrix[idx,:] = [self.word2idx[w] for w in seq[:seq_len]]
        return self.matrix

    def one_hot(self, x):
        """Return a (1, 1, vocsize) array that is 1.0 at vocabulary index ``x`` and 0 elsewhere."""
        vec = np.zeros(shape=(1,1,self.vocsize), dtype=dtype)
        vec[0,0,x] = 1.0
        return vec

def make_dataset(matrix, pad, start=3, min_len=10, max_len=20):
    """Cut a random-length window out of every row and pair it with the next token.

    matrix  -- 2-D integer array, one sequence of word indices per row
    pad     -- value used to fill the unused tail of each input row
    start   -- column at which every window begins
    min_len, max_len -- inclusive bounds of the random window length

    Returns ``[dataset_x, dataset_y]``: ``dataset_x[r]`` holds
    ``matrix[r, start:start+length]`` followed by ``pad``; ``dataset_y[r]``
    is the token right after that window, ``matrix[r, start+length]``.
    """
    # Guarantees that start+length (the target column) is always in range.
    assert(start+max_len<matrix.shape[1])
    n_rows = matrix.shape[0]
    dataset_x = np.full((n_rows, max_len), pad, dtype='int32')
    dataset_y = np.zeros(shape=(n_rows,), dtype='int32')
    for idx in range(n_rows):
        length = random.randint(min_len, max_len)
        dataset_x[idx,0:length] = matrix[idx,start:start+length]
        # The target is the token that follows the window; the offset has to
        # include `start`, otherwise it points inside the input itself.
        dataset_y[idx] = matrix[idx,start+length]
    return [dataset_x, dataset_y]
        #voc = [k for (k,v) in dic_freq.items() if v>=min_freq]
        #print "loaded "+ len(dic_freq) + "words, kept " + len(voc) + "words"
        #self.idx_voc = {v:k for (k,v) in self.voc_idx.items()}
        
    #todo save / load
            

In [4]:
import json
# NOTE(review): hard-coded absolute path; point this at your own copy of the articles dump.
filename = "/media/charles/data/articles"

TitleList = []
TextList = []
IndexList = []
count = 0

# Lines that start with '[' hold a JSON array of article records; every other
# line is skipped.  Each line is parsed as it is read, so the raw text is never
# held in memory as a whole, and the `with` block closes the file.
with open(filename) as h:
    for line in h:
        if not line.startswith('['):
            continue
        # json.loads accepts the trailing newline, so no slicing is needed
        # (slicing would cut a real character off a final line without one).
        u = json.loads(line)
        for item in u:
            fields = item['fields']
            TitleList.append(fields['title'])
            TextList.append(fields['text'])
            IndexList.append(item['rowKey'])
            count+=1
            if count%10000==0:
                print("done: "+str(count))


done: 10000
done: 20000
done: 30000
done: 40000
done: 50000
done: 60000
done: 70000
done: 80000
done: 90000
done: 100000


In [73]:
# Character vocabulary (45 entries; note that '0' is listed twice, so the second '0'
# is never returned by voc.find).  Index 0 ('%') is the symbol that get_one_hot_mat
# and encode put in the extra final row.
voc = '%*abcdefghijklmnopqrstuvwxyz01234567890 ,.!?\''
# NOTE: this redefines the 7-symbol get_char_one_hot from the Reber-grammar cell.
def get_char_one_hot(char):
    """Return ``[vec]`` where ``vec`` has length ``len(voc)`` and a 1 for every character in ``char``.

    A character that is not in ``voc`` sets the last position, because
    ``str.find`` returns -1 for it.
    """
    positions = np.asarray([voc.find(ch) for ch in char], dtype=int)
    vec = np.zeros(len(voc))
    vec[positions] = 1.
    return [vec]

def get_one_hot_mat(index, mat, lens):
    """One-hot encode row ``index`` of ``mat`` and append a final row marking position 0.

    ``mat[index]`` is read as a row of vocabulary indices, of which the first
    ``lens[index]`` are used.  The result has ``lens[index] + 1`` rows and
    ``len(voc)`` columns; the extra last row has its 1 in column 0.
    """
    length = lens[index]
    codes = mat[index]
    res_mat = np.zeros(shape=(length+1,len(voc)), dtype=dtype)
    for pos in range(length):
        res_mat[pos, codes[pos]] = 1
    res_mat[length, 0] = 1
    return res_mat
    
def encode(string):
    """One-hot encode ``string`` over ``voc`` and append a final row marking position 0.

    Returns a float32 matrix with ``len(string) + 1`` rows and ``len(voc)``
    columns.  A character that is not in ``voc`` sets the last column,
    because ``str.find`` returns -1 for it.
    """
    n_chars = len(string)
    res_mat = np.zeros(shape=(n_chars+1,len(voc)), dtype='float32')
    for row, ch in enumerate(string):
        res_mat[row, voc.find(ch)] = 1.
    res_mat[n_chars, 0] = 1.
    return res_mat

def pred(model, inputvec):
    """Run ``model.pred`` on ``inputvec`` and decode each output row to the ``voc`` character at its arg-max."""
    return "".join(voc[np.argmax(row)] for row in model.pred(inputvec))

In [36]:
# Pre-computed arrays loaded from the working directory; get_one_hot_mat reads mat[index]
# as a row of vocabulary indices and lens[index] as the number of entries to use from it.
mat = np.load("matrix.npy")
lens = np.load("lens.npy")

In [None]:
# Character-level model: one input and one output unit per vocabulary symbol, 50 LSTM units.
# NOTE: rebinding `model` replaces the Reber-grammar model created earlier.
model = LSTM(len(voc), 50, len(voc))

In [67]:
len_train = 10000

nb_epochs = 5
# Plain SGD over the first len_train rows of `mat`: every epoch draws
# len_train row indices uniformly at random (with replacement).
# NOTE(review): the same matrix is passed as input and as target, so the model
# is trained to reproduce its input rather than to predict the next character
# -- confirm that this is intended.
for x in range(nb_epochs):
    error = 0.
    for j in range(len_train):
        index = np.random.randint(0, len_train)
        train_example = get_one_hot_mat(index, mat, lens)
        train_cost = model.train(train_example, train_example)
        error += train_cost
    # Summed training cost of the epoch; single-argument print(...) behaves
    # the same on Python 2 and 3.
    print("epoch "+str(x)+ " error: "+str(error))


epoch 0 error: 1178.73945314
epoch 1 error: 828.035959486
epoch 2 error: 651.734214276
epoch 3 error: 507.594759025
epoch 4 error: 407.227064786


In [18]:
# NOTE(review): `matrix` and `corpus` are not defined in any visible cell -- presumably a
# Corpus instance and its to_numpy() result from a cell that is no longer here; confirm.
[dx, dy] = make_dataset(matrix=matrix, pad=corpus.word2idx['END'])

In [6]:
# Second name for the corpus' index -> word mapping (the same dict object, not a copy).
# NOTE(review): `corpus` is not defined in any visible cell.
dic = corpus.idx2word

In [69]:
# Compare the shape of a training example with that of an encoded string: both must
# have len(voc) columns.
x = get_one_hot_mat(6, mat, lens)
# %-formatting keeps the output ("(rows, cols) (rows, cols)") identical on Python 2 and 3.
print("%s %s" % (x.shape, encode("i am very").shape))

(13, 45) (10, 45)


In [76]:
# Run the character model on a sample string and decode the arg-max of every output row.
pred(model, encode("where is the "))

'*here is the %'

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 