In [112]:
#!/usr/bin/env python3
import sys
import os
import numpy as np

import lasagne
from lasagne.layers import InputLayer, ReshapeLayer, GRULayer
from lasagne.layers import DenseLayer
import theano.tensor as T
import theano
from sklearn.cross_validation import train_test_split

In [70]:
EMBEDDINGS_FN = os.path.expanduser("~/data/glove.6B.50d.txt")
def get_embeddings(embeddings_fn=None):
    """Load word vectors from a whitespace-separated text file.

    Every usable line has the form ``word v1 v2 ... vn``. Lines with fewer
    than three fields (e.g. blank lines) are skipped.

    Args:
        embeddings_fn: path of the vector file. Defaults to ``EMBEDDINGS_FN``
            when omitted or ``None``.

    Returns:
        A tuple ``(embedding_dict, embedding_mat, unk)`` where
        ``embedding_dict`` maps each word to its row index in
        ``embedding_mat``, ``embedding_mat`` is a float32 array whose row 0 is
        an all-zero vector and whose remaining rows are the word vectors in
        file order, and ``unk`` is that all-zero vector.

    Raises:
        ValueError: if the file contains no usable embedding line.
    """
    if embeddings_fn is None:
        embeddings_fn = EMBEDDINGS_FN
    embedding_dict = dict()
    embedding_mat = []
    print("loading embeddings")
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so non-ASCII vocabulary entries load the same way on every machine.
    with open(embeddings_fn, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 3:
                continue
            word = parts[0]
            v_t = np.array(parts[1:], dtype=np.float32)
            embedding_mat.append(v_t)
            # Index is taken AFTER the append, i.e. it is 1-based with respect
            # to file order: row 0 is reserved for the zero vector added below.
            embedding_dict[word] = len(embedding_mat)
    if not embedding_mat:
        raise ValueError("no embeddings found in %s" % embeddings_fn)
    unk = np.zeros_like(embedding_mat[0])
    embedding_mat = [unk] + embedding_mat
    embedding_mat = np.array(embedding_mat, dtype=np.float32)
    print("done loading embeddings")
    return embedding_dict, embedding_mat, unk

In [71]:
# Load the vectors once: word -> row index, the embedding matrix (row 0 is the
# all-zero vector used for unknown words), and that zero vector itself.
embedding_dict, embedding_mat, unk = get_embeddings()
# Bare last expression so the notebook displays the (truncated) matrix.
embedding_mat

loading embeddings
done loading embeddings


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.013441  ,  0.23682   , -0.16899   , ..., -0.56656998,
         0.044691  ,  0.30392   ],
       ..., 
       [-0.51181   ,  0.058706  ,  1.09130001, ..., -0.25003001,
        -1.125     ,  1.58630002],
       [-0.75897998, -0.47426   ,  0.47369999, ...,  0.78953999,
        -0.014116  ,  0.64480001],
       [ 0.072617  , -0.51393002,  0.47279999, ..., -0.18907   ,
        -0.59021002,  0.55558997]], dtype=float32)

In [123]:
def embed_tweet(tweet, embedding_dict, embedding_mat):
    """Turn a tweet into the list of embedding rows of its tokens.

    The tweet is split on whitespace. Each token is looked up in
    ``embedding_dict``; tokens that are not present fall back to row 0 of
    ``embedding_mat``. A tweet without tokens yields an empty list.
    """
    rows = []
    # str.split() without a separator never produces empty or whitespace-only
    # tokens, so no additional filtering of the tokens is required.
    for word in tweet.split():
        row_index = embedding_dict.get(word, 0)
        rows.append(embedding_mat[row_index])
    return rows

In [124]:
DATA_DIR = os.path.expanduser("~/data/twitter-datasets")
def load_data(embedding_dict, embedding_mat, full_data=False, data_dir=None):
    """Load the positive and negative tweet files as embedded sequences.

    Args:
        embedding_dict: word -> row index mapping passed on to ``embed_tweet``.
        embedding_mat: embedding matrix passed on to ``embed_tweet``.
        full_data: when true, read ``train_pos_full.txt`` / ``train_neg_full.txt``
            instead of ``train_pos.txt`` / ``train_neg.txt``.
        data_dir: directory containing the files. Defaults to ``DATA_DIR``
            when omitted or ``None``.

    Returns:
        ``(X, Y)`` where ``X`` is a list with one embedded tweet per input line
        (all positive tweets first, then all negative ones) and ``Y`` is an
        int64 array holding 1 for each positive and 0 for each negative tweet.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    fns = ['train_pos', 'train_neg']
    if full_data:
        fns = [fn + '_full' for fn in fns]

    def load_from_file(fn):
        # Decode explicitly rather than with the platform default encoding.
        # NOTE(review): assumes the tweet files are UTF-8 - confirm. Bytes that
        # cannot be decoded are replaced instead of aborting the whole load.
        with open(fn, encoding='utf-8', errors='replace') as f:
            return [embed_tweet(tweet, embedding_dict, embedding_mat) for tweet in f]

    data_pos, data_neg = [load_from_file(os.path.join(data_dir, fn + '.txt')) for fn in fns]
    # Build a new list instead of extending data_pos in place, so the label
    # counts can be read directly off the two per-class lists.
    X = data_pos + data_neg
    Y = np.concatenate((np.ones(len(data_pos), dtype=np.int64), np.zeros(len(data_neg), dtype=np.int64)))
    return X, Y

In [156]:
# Seed for the train/validation split so the held-out set (and therefore the
# reported validation numbers) is the same on every run of the notebook.
SPLIT_SEED = 0

# load_data defaults to full_data=False, i.e. the smaller train_pos/train_neg files.
data_x_full, data_y_full = load_data(embedding_dict, embedding_mat)
# Hold out 1% of the tweets for validation.
# NOTE(review): sklearn.cross_validation (imported at the top) was replaced by
# sklearn.model_selection in newer scikit-learn releases - update the import
# when upgrading.
data_x, data_x_test, data_y, data_y_test = train_test_split(
    data_x_full, data_y_full, test_size=0.01, random_state=SPLIT_SEED)
print(len(data_x))

198000


In [151]:
def build_network(input_var, dimensions=50):
    """Stack the Lasagne layers of the classifier and return the output layer.

    The input layer is declared with shape ``(1, None, dimensions)``. It feeds
    a 128-unit GRU created with ``backwards=True``, followed by a second
    128-unit GRU that only returns its final output, followed by a dense
    2-unit softmax layer.
    """
    recurrent_units = 128
    inputs = InputLayer(shape=(1, None, dimensions), input_var=input_var)
    backward_gru = GRULayer(inputs, num_units=recurrent_units, backwards=True)
    final_state = GRULayer(backward_gru, num_units=recurrent_units, only_return_final=True)
    return DenseLayer(final_state, num_units=2, nonlinearity=lasagne.nonlinearities.softmax)

In [152]:
def build_model(learning_rate=0.1, momentum=0.9, l2=0.00001):
    """Build the network and compile its Theano training/validation functions.

    The objective is the mean categorical cross-entropy of the network output
    plus ``l2`` times the L2 penalty over the network parameters. Parameters
    are updated with Nesterov momentum.

    Args:
        learning_rate: step size passed to ``nesterov_momentum``.
        momentum: momentum coefficient passed to ``nesterov_momentum``.
        l2: weight of the L2 regularization term.

    Returns:
        ``(train_fn, val_fn)``. ``train_fn(inputs, targets)`` applies one
        update and returns the loss; ``val_fn(inputs, targets)`` returns
        ``[loss, accuracy]`` without updating any parameter.
    """
    seq_in = T.tensor3('input', dtype='float32')
    labels = T.ivector('target')

    net = build_network(seq_in)
    probs = lasagne.layers.get_output(net)

    # Data term plus weighted L2 penalty.
    data_loss = lasagne.objectives.categorical_crossentropy(probs, labels).mean()
    penalty = lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2)
    objective = data_loss + penalty * l2

    trainable = lasagne.layers.get_all_params(net, trainable=True)
    step = lasagne.updates.nesterov_momentum(
        objective, trainable, learning_rate=learning_rate, momentum=momentum)

    # Fraction of samples whose most probable class equals the target.
    accuracy = T.mean(T.eq(T.argmax(probs, axis=1), labels), dtype='float32')

    symbolic_inputs = [seq_in, labels]
    train_fn = theano.function(symbolic_inputs, objective, updates=step)
    val_fn = theano.function(symbolic_inputs, [objective, accuracy])

    return train_fn, val_fn

In [153]:
# Compile the training and validation functions with build_model's default
# hyperparameters (learning_rate=0.1, momentum=0.9, l2=0.00001).
train_fn, val_fn = build_model()

In [None]:
# Stochastic training: one randomly drawn tweet per iteration (batch size 1),
# with a pass over the held-out set every 10000 iterations.
for iteration in range(100000):
    i = np.random.choice(len(data_x))
    x, y = data_x[i], data_y[i]
    # A tweet without tokens embeds to an empty list; [x] would then be a 2-D
    # array, which does not fit the 3-D input tensor, so skip the update.
    if len(x) > 0:
        loss = train_fn([x], [y])
    if iteration % 10000 == 0:
        print('performing validation')
        loss, acc, n_evaluated = 0., 0., 0
        for dxt, dyt in zip(data_x_test, data_y_test):
            # Same guard as for training: empty sequences cannot be fed in.
            if len(dxt) == 0:
                continue
            loss_i, acc_i = val_fn([dxt], [dyt])
            loss += loss_i
            acc += acc_i
            n_evaluated += 1
        # Average over the samples actually evaluated; max() avoids a division
        # by zero when nothing could be evaluated.
        print('validation loss:', loss / max(n_evaluated, 1))
        print('validation accuracy:', acc / max(n_evaluated, 1))