In [1]:
import numpy as np

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.objectives import to_categorical
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Embedding, OneHot, RepeatVector
from keras.layers.recurrent import LSTM, GRU

def one_hot(value, size=None):
    """Return a one-hot float32 vector with a 1 at position ``value``.

    value -- integer index of the single active entry.
    size  -- length of the returned vector. When omitted it falls back to
             ``len(voc)`` (the module-level vocabulary string), which is the
             original behaviour.
    """
    if size is None:
        # Resolved at call time, so ``voc`` only has to exist before the
        # first call that relies on the default.
        size = len(voc)
    oh = np.zeros(size, dtype='float32')
    oh[value] = 1.
    return oh

def get_seq_and_letter(mat, lens, size, split):
    """Split the first ``size`` rows of ``mat`` into train/test (prefix, target) pairs.

    For row ``i`` the input sequence is ``mat[i, 0:lens[i]-1]`` and the target
    is the element at position ``lens[i]-1``, one-hot encoded with ``one_hot``.
    Rows ``[0, int(split * size))`` go to the training lists, the remaining
    rows up to ``size`` go to the test lists.

    Returns ``(X_dat, Y_dat, X_test, Y_test)`` as four Python lists.
    """
    boundary = int(split * size)

    def collect(rows):
        # Walk the rows once, emitting the prefix and its one-hot target together.
        prefixes, targets = [], []
        for row in rows:
            prefixes.append(mat[row, 0:lens[row]-1])
            targets.append(one_hot(mat[row, lens[row]-1]))
        return prefixes, targets

    train_x, train_y = collect(range(boundary))
    held_x, held_y = collect(range(boundary, size))
    return train_x, train_y, held_x, held_y

In [2]:
# Character vocabulary: a character's position in this string is its integer
# code (tostr() decodes with voc[x], one_hot() sizes vectors with len(voc)).
# NOTE(review): '0' appears twice ('...67890'), giving len(voc) == 45.
# Presumably this must stay in sync with how matrix.npy was encoded - confirm
# before changing it.
voc = '%*abcdefghijklmnopqrstuvwxyz01234567890 ,.!?\''
# NOTE(review): absolute, machine-specific paths; the notebook cannot be
# re-run elsewhere without editing them.
mat = np.load("/media/charles/data/matrix.npy")
lens = np.load("/media/charles/data/lens.npy")

# max_features and emb_size are only referenced by the commented-out
# Embedding layers in the model cells.
max_features=len(voc)
emb_size = 32
maxlen = 30 # number of time steps passed to pad_sequences below (items are character codes, not words)
batch_size = 16

print "Loading data..."
# Use the first 50000 rows; the first 80% become training data, the rest test data.
X_train, y_train, X_test, y_test = get_seq_and_letter(mat, lens, 50000, 0.8)
print len(X_train), 'train sequences'
print len(X_test), 'test sequences'

#print X_train[0:10], y_train[0:10]

print "Pad sequences (samples x time)"
# Turn the lists of variable-length sequences into (samples, maxlen) arrays
# (see the shapes printed below).
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape


Loading data...
40000 train sequences
10000 test sequences
Pad sequences (samples x time)
X_train shape: (40000, 30)
X_test shape: (10000, 30)


In [None]:
# Next-character classifier: one-hot input -> LSTM -> Dense -> softmax over
# the len(voc) characters.
print 'Build model...'
model = Sequential()
#model.add(Embedding(max_features, emb_size))
model.add(OneHot(len(voc)))
# NOTE(review): the two positional arguments look like (input_dim, output_dim)
# in this Keras version - confirm against the installed API.
model.add(LSTM(len(voc), 128)) # try using a GRU instead, for fun
#model.add(Dropout(0.5))
model.add(Dense(128, len(voc)))
model.add(Activation('softmax'))
optimizer = Adagrad(lr=0.001)
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

print "Train..."
# y_train / y_test are Python lists of one-hot float32 vectors produced by
# get_seq_and_letter().
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10, verbose=1)
score = model.evaluate(X_test, y_test, batch_size=batch_size)
print 'Test score:', score

classes = model.predict_classes(X_test, batch_size=batch_size)
# NOTE(review): y_test holds one-hot vectors while `classes` presumably holds
# class indices - confirm np_utils.accuracy accepts this pairing.
acc = np_utils.accuracy(classes, y_test)
print 'Test accuracy:', acc



In [14]:
# Inspect the shape of the training matrix (displayed as the cell output).
X_train.shape

(40000, 30)

In [None]:
import theano
import theano.tensor as T
# Sequence-to-sequence model: an LSTM encodes the one-hot input, its output
# is repeated max_size times and fed to a second LSTM that returns a
# sequence with len(voc) values per step.
# Note: this rebinds `model`, replacing the classifier built in the earlier cell.
print 'Build model s2s...'
max_size = 30  # number of repetitions for RepeatVector; same value as maxlen
epsilon = 1.0e-15  # clipping bound used by categorical_loss() below

model = Sequential()
#model.add(Embedding(max_features, emb_size))
model.add(OneHot(len(voc)))
model.add(LSTM(len(voc), 128)) # try using a GRU instead, for fun
model.add(Dropout(0.2))
model.add(RepeatVector(max_size))
model.add(LSTM(128, len(voc), return_sequences=True))
# NOTE(review): this optimizer is never used - model.compile() further down
# in this cell is called with optimizer='rmsprop'.
optimizer = Adagrad(lr=0.001)

def categorical(y, num_dim):
    '''Look up one-hot rows for the class indices in ``y``.

    y       -- integer index (or index array/tensor) with values in
               ``range(num_dim)``.
    num_dim -- number of classes; the lookup table is the
               ``num_dim x num_dim`` identity matrix.

    Returns ``W[y]``, where ``W`` is a Theano shared variable holding the
    identity matrix in ``theano.config.floatX``.
    '''
    # Build the identity table and the shared variable exactly once. The
    # previous version re-created the shared variable on every loop iteration
    # and left ``W`` unbound when num_dim was 0.
    W = theano.shared(np.eye(num_dim, dtype=theano.config.floatX))
    return W[y]

def T_one_hot(t, r=None):
    """
    One-hot encode an integer tensor along a new trailing axis.

    t -- tensor of dimension d holding integer values from range(r).
    r -- number of classes; when None it is taken as max(t) + 1.

    Returns a tensor of dimension d + 1 with 0/1 values whose last axis is
    the one-hot representation of the corresponding entry of t.
    """
    num_classes = T.max(t) + 1 if r is None else r
    # Class ids broadcast along every leading axis of t ...
    class_ids = T.shape_padleft(T.arange(num_classes), t.ndim)
    # ... compared against t with a broadcastable trailing axis appended.
    expanded = T.shape_padright(t, 1)
    return T.eq(class_ids, expanded)

def categorical_loss(y_true, y_pred):
    '''Mean categorical cross-entropy between integer targets and predictions.

    y_true -- tensor of integer class indices (NOT a binary class matrix); it
              is expanded here to a one-hot tensor with ``len(voc)`` classes
              on a new trailing axis.
    y_pred -- tensor of predicted class scores whose last axis indexes the
              classes, matching the one-hot layout built from ``y_true``.

    Returns the mean of the element-wise cross-entropy as a scalar tensor.
    '''
    # Keep predictions away from 0 and 1 so the log in the cross-entropy stays finite.
    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    # Scale preds so that the class probas at each position sum to 1.
    # The class axis is the last one: for 2-D input this equals axis=1, and
    # for (batch, time, classes) input axis=1 would wrongly normalise over time.
    y_pred /= y_pred.sum(axis=-1, keepdims=True)
    return T.nnet.categorical_crossentropy(y_pred, T_one_hot(y_true, len(voc))).mean()

# try using different optimizers and different optimizer configs
# NOTE(review): the loss is selected by string name, so the categorical_loss()
# function defined above is not what is passed here, and the Adagrad
# `optimizer` object created earlier in this cell is unused.
# 'categorical_scalar_crossentropy' is presumably provided by the installed
# Keras version - confirm.
model.compile(loss='categorical_scalar_crossentropy', optimizer='rmsprop')



print "Train..."
# Auto-encoder style training: the padded integer sequences are both the
# input and the target.
model.fit(X_train, X_train, batch_size=batch_size, nb_epoch=10, verbose=1)
score = model.evaluate(X_test, X_test, batch_size=batch_size)
print 'Test score:', score

# NOTE(review): a later cell records this same predict_classes call raising
# "ValueError: shape mismatch" for this model.
classes = model.predict_classes(X_test, batch_size=batch_size)
acc = np_utils.accuracy(classes, X_test)
print 'Test accuracy:', acc

  nstreams = self.n_streams(size)
  from scan_perform.scan_perform import *


In [6]:
def tostr(u, alphabet=None):
    """Decode a sequence of integer codes into a string.

    u        -- iterable of integer indices.
    alphabet -- indexable collection of characters the codes refer to;
                defaults to the module-level ``voc`` string, which is the
                original behaviour.

    Returns the concatenation of ``alphabet[x]`` for every ``x`` in ``u``
    (the empty string for an empty ``u``).
    """
    if alphabet is None:
        alphabet = voc
    # join builds the result in one pass instead of repeated string concatenation.
    return "".join(alphabet[x] for x in u)

def see(X, Y):
    for x,y in zip (X,Y):
        print tostr(x), voc[y]

def seeModel(X, model):
    """Print each sequence of X next to the class ``model`` predicts for it."""
    predicted = model.predict_classes(X)
    see(X, predicted)

In [8]:
# Print the first 20 test sequences next to the character the current model predicts.
seeModel(X_test[0:20], model)

*ek though, ev%%%%%%%%%%%%%%%%  
*iments'he%%%%%%%%%%%%%%%%%%%%  
*'quot'negativ%%%%%%%%%%%%%%%%  
*erness.'lt%%%%%%%%%%%%%%%%%%%  
*and james%%%%%%%%%%%%%%%%%%%%  
*he next %%%%%%%%%%%%%%%%%%%%%  
*t the bottom%%%%%%%%%%%%%%%%%  
*, very c%%%%%%%%%%%%%%%%%%%%%  
*sometimes fresh%%%%%%%%%%%%%%  
' a question %%%%%%%%%%%%%%%%%  
*to the pan if%%%%%%%%%%%%%%%%  
*oked covered wit%%%%%%%%%%%%%  
* your fish, y%%%%%%%%%%%%%%%%  
*using lemon%%%%%%%%%%%%%%%%%%  
*ref''quo%%%%%%%%%%%%%%%%%%%%%  
* the brow%%%%%%%%%%%%%%%%%%%%  
* in your ici%%%%%%%%%%%%%%%%%  
*hay' is litera%%%%%%%%%%%%%%%  
*ore than%%%%%%%%%%%%%%%%%%%%%  


In [36]:
# Recorded as failing with "ValueError: shape mismatch" (see the output below).
# NOTE(review): presumably the model emits one distribution per time step,
# (batch, 30, 45), which this call does not expect - confirm.
classes = model.predict_classes(X_test, batch_size=batch_size)

ValueError: shape mismatch: value array of shape (16,30,45) could not be broadcast to indexing result of shape (16,30)

In [38]:
# Recorded as failing with the same shape-mismatch ValueError as predict_classes.
w = model.predict_proba(X_test, batch_size=batch_size)

ValueError: shape mismatch: value array of shape (16,30,45) could not be broadcast to indexing result of shape (16,30)

In [39]:
# Call the model's underscore-prefixed _predict on a single batch instead of
# the failing public helpers.
# NOTE(review): this relies on a private attribute of the model object.
w = model._predict(X_test[0:batch_size])

In [46]:
# Take the arg-max over the last axis of w and print each resulting row of
# codes decoded as text.
for x in w.argmax(axis=-1):
    print tostr(x)

xxxxxxxx8888888888888888888888
xxxx88888888888888888888888888
xxxxxxxx8888888888888888888888
xxxxx8888888888888888888888888
xxxx88888888888888888888888888
xxx888888888888888888888888888
xxxxxxx88888888888888888888888
xxx888888888888888888888888888
xxxxxxxxxx88888888888888888888
xxxxxxxxxxxx888888888888888888
xxxxxxx88888888888888888888888
xxxxxxxx8888888888888888888888
xxxxxxxxxxx8888888888888888888
xxxxxxxx8888888888888888888888
xxxxxx888888888888888888888888
xxx888888888888888888888888888


In [48]:
# Print the input sequences of the same batch, decoded as text, for
# comparison with the decoded predictions.
for x in X_test[0:batch_size]:
    print tostr(x)

*ek though, ev%%%%%%%%%%%%%%%%
*iments'he%%%%%%%%%%%%%%%%%%%%
*'quot'negativ%%%%%%%%%%%%%%%%
*erness.'lt%%%%%%%%%%%%%%%%%%%
*and james%%%%%%%%%%%%%%%%%%%%
*he next %%%%%%%%%%%%%%%%%%%%%
*t the bottom%%%%%%%%%%%%%%%%%
*, very c%%%%%%%%%%%%%%%%%%%%%
*sometimes fresh%%%%%%%%%%%%%%
' a question %%%%%%%%%%%%%%%%%
*to the pan if%%%%%%%%%%%%%%%%
*oked covered wit%%%%%%%%%%%%%
* your fish, y%%%%%%%%%%%%%%%%
*using lemon%%%%%%%%%%%%%%%%%%
*ref''quo%%%%%%%%%%%%%%%%%%%%%
