# Text classifier

This LSTM classifier operates over tweets to classify Twitter account users as Overweight or Not Overweight.

In [1]:
import gzip
import numpy as np
import pickle as pkl

In [2]:
def cap_words(x, nb_words, oov=2):
    """Limit the vocabulary of every sequence in `x` to `nb_words` indices.

    Any word index that is greater than or equal to `nb_words` is swapped
    for `oov`; all other indices are kept. A new list of lists is returned
    and `x` itself is not modified.
    """
    capped = []
    for sentence in x:
        kept = []
        for word in sentence:
            kept.append(oov if word >= nb_words else word)
        capped.append(kept)
    return capped

def skip_n(x, n, oov=2):
    """Mask the `n` lowest word indices in every sequence of `x`.

    Any word index strictly below `n` is swapped for `oov`; all other
    indices are kept. A new list of lists is returned and `x` itself is
    not modified.
    """
    def mask(word):
        # Indices below the cut-off are the ones being skipped.
        return oov if word < n else word

    return [list(map(mask, sentence)) for sentence in x]

def cap_length(x, maxlen):
    """Truncate every row of `x` to at most its first `maxlen` items.

    Rows shorter than `maxlen` are returned as (sliced) copies of
    themselves. A new outer list is returned.
    """
    truncated = []
    for row in x:
        truncated.append(row[:maxlen])
    return truncated

def push_indices(x, start, index_from):
    """Shift word indices up by `index_from` and optionally prepend `start`.

    # Arguments
        x: list of sequences (lists) of integer word indices.
        start: index to prepend to every sequence, or None for no marker.
        index_from: amount added to every word index. A falsy value
            (0 or None) means "no shift".

    # Returns
        A new list of lists, or `x` itself (same object) when there is
        neither a start marker nor a shift to apply.
    """
    # Normalise None to 0 so that asking for a start marker without a shift
    # no longer fails on `w + None`.
    offset = index_from or 0

    if start is None and not offset:
        return x

    prefix = [] if start is None else [start]
    return [prefix + [w + offset for w in row] for row in x]

def load_data(path='ow.pkl', nb_words=None, skip_top=0,
              maxlen=None, seed=113,
              start=1, oov=2, index_from=3):
    '''
    Load the pickled train/test splits and convert them to index sequences.

    # Arguments
        path: where the data is stored (in '.'). The file must contain two
            consecutive pickles: `(train_X, train_y)` then `(test_X, test_y)`.
            Paths ending in '.gz' are opened with gzip.
        nb_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: truncate sequences after this length (applied before the
            start marker is prepended).
        seed: random seed for sample shuffling.
        start: The start of a sequence will be marked with this index.
            Set to 1 because 0 is usually the padding character.
        oov: words that were cut out because of the `nb_words`
            or `skip_top` limit will be replaced with this index.
        index_from: index actual words with this index and higher.

    # Returns
        `(train_X, train_y), (test_X, test_y)`, each converted with
        `np.array`.

    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `nb_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.

    Side effect: reseeds numpy's global random generator.
    '''

    opener = gzip.open if path.endswith(".gz") else open

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point `path` at files you trust.
    # The context manager guarantees the handle is closed even if unpickling fails.
    with opener(path, 'rb') as f:
        (train_X, train_y) = pkl.load(f)
        (test_X, test_y) = pkl.load(f)

    # randomize datum order; reseeding with the same value before shuffling
    # the inputs and the labels applies the same permutation to both
    # (this relies on X and y having the same length).
    np.random.seed(seed)
    np.random.shuffle(train_X)
    np.random.seed(seed)
    np.random.shuffle(train_y)

    np.random.seed(seed * 2)
    np.random.shuffle(test_X)
    np.random.seed(seed * 2)
    np.random.shuffle(test_y)

    # keep maxlen words of each row
    if maxlen is not None:
        train_X = cap_length(train_X, maxlen)
        test_X = cap_length(test_X, maxlen)

    # Capping/skipping happens on the raw indices, i.e. BEFORE push_indices
    # adds `index_from` to every entry. Writing `oov` directly at this stage
    # would turn it into `oov + index_from` afterwards and make it collide
    # with a real word. Use a placeholder that lands exactly on `oov` once
    # the shift has been applied.
    raw_oov = oov - (index_from or 0)

    # cut off infrequent words to vocab of size nb_words
    if nb_words is not None:
        train_X = cap_words(train_X, nb_words, raw_oov)
        test_X = cap_words(test_X, nb_words, raw_oov)

    # cut off most frequent skip_top words
    if skip_top > 0:
        train_X = skip_n(train_X, skip_top, raw_oov)
        test_X = skip_n(test_X, skip_top, raw_oov)

    # prepend each sequence with start and raise indices by index_from
    train_X = push_indices(train_X, start, index_from)
    test_X = push_indices(test_X, start, index_from)

    train_X = np.array(train_X)
    train_y = np.array(train_y)

    test_X = np.array(test_X)
    test_y = np.array(test_y)

    return (train_X, train_y), (test_X, test_y)

def load_embeddings(nb_words=None, index_from=3,
                    vocab_path='ow.dict.pkl',
                    w2v_path='/data/nlp/corpora/twitter4food/food_vectors_clean.txt',
                    embedding_dim=200):
    '''
    Build an embedding matrix for the vocabulary from pretrained vectors.

    # Arguments
        nb_words: keep only words whose vocabulary index is below this
            value; None keeps the whole vocabulary.
        index_from: offset added to a word's vocabulary index to get its
            row in the returned matrix.
        vocab_path: pickle file holding a `{word: index}` dict.
        w2v_path: text file of vectors, one `word v1 v2 ...` entry per
            line. The first line is read and discarded (presumably a
            word2vec-style header -- confirm against the file).
        embedding_dim: number of components per vector in `w2v_path`.

    # Returns
        A float array of shape `(max_features + index_from, embedding_dim)`
        where `max_features = min(nb_words, len(vocabulary))`. Rows with no
        pretrained vector are left as zeros.

    Prints a progress dot every 1000 lines read and a summary count.
    '''
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point `vocab_path` at files you trust.
    with open(vocab_path, 'rb') as f:
        word_index = pkl.load(f)

    if nb_words is not None:
        max_features = min(nb_words, len(word_index))
    else:
        max_features = len(word_index)

    embeddings_index = {}
    with open(w2v_path, 'rb') as f:
        f.readline()  # discard the first line
        for line_no, line in enumerate(f, start=1):
            values = line.split()
            # Blank lines carry no vector; skip them instead of failing on
            # values[0]. They still count towards the progress display.
            if values:
                word = values[0].decode('UTF-8')
                idx = word_index.get(word)
                if idx is not None and idx < max_features:
                    embeddings_index[word] = np.asarray(values[1:], dtype='float32')

            if line_no % 1000 == 0:
                print(".", end="")
            if line_no % 100000 == 0:
                print("")

    print("")
    print('Found %s word vectors.' % len(embeddings_index))

    # words not found in embedding index will be all-zeros.
    embedding_matrix = np.zeros((max_features + index_from, embedding_dim))
    # embeddings_index only holds in-vocabulary words below the cap, so every
    # entry has a valid row.
    for word, vector in embeddings_index.items():
        embedding_matrix[word_index[word] + index_from] = vector

    return embedding_matrix

In [3]:
# Hyperparameters shared by the cells that follow.
max_features = 20000  # vocabulary cap, passed to load_data as nb_words
maxlen = 2000  # cut texts after this number of words (among top max_features most common words)
batch_size = 32  # mini-batch size used by fit / evaluate / predict_classes

# Load both splits with the vocabulary and length caps applied.
(X_train, y_train), (X_test, y_test) = load_data(nb_words=max_features, maxlen=maxlen)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

1082 train sequences
270 test sequences


In [4]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb

print('Pad sequences (samples x time)')
# Bring every sequence to a common length of maxlen so each split becomes a
# rectangular (samples, maxlen) integer array the model can consume.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Using TensorFlow backend.


Pad sequences (samples x time)
X_train shape: (1082, 2000)
X_test shape: (270, 2000)


In [5]:
# Build the pretrained embedding matrix with the same vocabulary cap that was
# used when loading the data, so row numbers line up with the word indices.
embeddings = load_embeddings(nb_words=max_features)
print('embeddings shape: ', embeddings.shape)

....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.................................................
Found 18441 word vectors.
embeddings shape:  (20003, 200)


In [11]:
print('Build model...')
model = Sequential()
# Size the embedding layer from the pretrained matrix itself rather than
# from `max_features + 3`: load_embeddings() returns fewer rows when the
# vocabulary is smaller than max_features, and the layer's input dimension
# must match the shape of the weights it is initialised with.
model.add(Embedding(embeddings.shape[0],
                    embeddings.shape[1],
                    weights=[embeddings],
                    dropout=0.2))
# Recurrent encoder over the embedded sequence. Alternative to try:
#model.add(GRU(128, dropout_W=0.2, dropout_U=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
# Single sigmoid unit for the binary decision.
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


In [None]:
# Quick interactive look at the shape of the test label array.
y_test.shape

In [12]:
print('Train...')
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers and the later "test" score are computed on the
# same examples -- consider holding out part of the training data instead.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=2,
          validation_data=(X_test, y_test))

Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1082 samples, validate on 270 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f40f0726cf8>

In [13]:
# Evaluate on the test split; with the compile() settings above this yields
# the binary cross-entropy loss followed by the accuracy metric.
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.685949642128
Test accuracy: 0.559259260584


In [27]:
# Predicted class label for every test account; compared against y_test by
# the bootstrap significance test.
preds = model.predict_classes(X_test,
                      batch_size=batch_size)



In [47]:
def bootstrap(gold, pred, reps=100000):
    """Bootstrap test of prediction accuracy against the majority baseline.

    The baseline is the relative frequency of the most common label in
    `gold`. The per-item agreement between `gold` and `pred` is resampled
    with replacement `reps` times; `p` is the fraction of resamples whose
    accuracy does NOT exceed the baseline. The baseline and `p` are printed,
    the latter followed by significance stars ('*' for p < 0.05, '**' for
    p < 0.01, '***' for p < 0.001).

    # Arguments
        gold: 1-D sequence of true labels.
        pred: 1-D sequence of predicted labels, same length as `gold`.
        reps: number of bootstrap resamples.

    # Raises
        ValueError: if `gold` is empty or the two inputs differ in shape.

    Draws from numpy's global random generator; seed it beforehand for
    reproducible output.
    """
    # Convert up front so that `gold == pred` is element-wise even when plain
    # Python lists are passed in.
    gold = np.asarray(gold)
    pred = np.asarray(pred)

    accts = len(gold)
    if accts == 0:
        raise ValueError('bootstrap() needs at least one gold label')
    if gold.shape != pred.shape:
        raise ValueError('gold and pred must have the same shape, got %s and %s'
                         % (gold.shape, pred.shape))

    # Majority-class baseline: share of the most frequent gold label.
    _, counts = np.unique(gold, return_counts=True)
    baseline = float(counts.max()) / float(accts)

    agr = np.array(gold == pred, dtype='int32')
    better = np.zeros(reps)
    for i in range(reps):
        sample = np.random.choice(agr, accts)
        if np.mean(sample) > baseline:
            better[i] = 1
    p = (1. - np.mean(better))

    # Test the strictest threshold first; checking p < 0.05 first would make
    # the '**' and '***' branches unreachable.
    if p < 0.001:
        stars = '***'
    elif p < 0.01:
        stars = '**'
    elif p < 0.05:
        stars = '*'
    else:
        stars = ''
    print('baseline:', baseline)
    print('p = %.4f%s' % (p, stars))

# Flatten predictions and labels to 1-D so they are compared element-wise.
p = preds.flatten()
y = y_test.flatten()

# Is the model's test accuracy significantly above the majority baseline?
bootstrap(y, p)


baseline: 0.5
p = 0.0292*
