In [1]:
"""
This script builds the pickled IMDB dataset and trains a first word-level model.

1) You need to download this file and put it in the same directory as this file.
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.

3) Then run this script.
"""

# Root directory of the extracted IMDB sentiment dataset. It must contain the
# train/ and test/ folders, each with pos/ and neg/ sub-folders of .txt reviews.
# Set the IMDB_DATASET_PATH environment variable to point at your own copy;
# the default below is the original author's location.
# os.path.join(..., '') guarantees a trailing path separator whatever the
# user supplies.
import os
dataset_path = os.path.join(
    os.environ.get('IMDB_DATASET_PATH',
                   '/home/jorge/proyectos/tesis/RNN/sentiment/'),
    '')

import numpy
import cPickle as pkl

from collections import OrderedDict

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
# Argument list used to launch the tokenizer as a subprocess. The script path
# is relative, so it is resolved against the working directory at call time.
# Presumably '-l en' selects English and '-q' silences the banner (per the
# Moses usage text) -- confirm against your copy of the script.
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']



In [2]:
#Separate words and punctuation signs by spaces
def tokenize(sentences):

    print 'Tokenizing..',
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    print 'Done'

    return toks

#create the dictionary to conver words to numbres. Order it with most frequent words first
def build_dict(path):
    sentences = []
    currdir = os.getcwd()
    os.chdir('%s/pos/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir('%s/neg/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)

    sentences = tokenize(sentences)

    print 'Building dictionary..',
    wordcount = dict()
    for ss in sentences:
        words = ss.strip().lower().split()
        for w in words:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    counts = wordcount.values()
    keys = wordcount.keys()
    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
    print numpy.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict


#Read the original corpus
def grab_data(path, dictionary):
    """Load a folder of text files as sequences of word indices.

    Reads the first line of every ``*.txt`` file in ``path``, tokenizes the
    lines with ``tokenize`` and converts each lower-cased token to its index
    in ``dictionary``.

    Parameters
    ----------
    path : str
        Directory holding the ``*.txt`` files.
    dictionary : dict
        Word -> integer index mapping.

    Returns
    -------
    list of list of int
        One index sequence per tokenized line; words missing from
        ``dictionary`` are encoded as 1.
    """
    sentences = []
    # Glob with the full path rather than os.chdir(): the working directory
    # is never modified, so it cannot be left changed if reading a file fails.
    for ff in glob.glob(os.path.join(path, '*.txt')):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    sentences = tokenize(sentences)

    unknown = 1  # index used for out-of-dictionary words
    seqs = [[dictionary.get(w, unknown) for w in ss.strip().lower().split()]
            for ss in sentences]

    return seqs

In [3]:
# Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/

#Create dictionary using the train data.
path = dataset_path
dictionary = build_dict(os.path.join(path, 'train'))

# Every folder is built with os.path.join so the result does not depend on
# dataset_path ending with a path separator.

#Read train sentences and generate target y (1 = positive, 0 = negative)
train_x_pos = grab_data(os.path.join(path, 'train', 'pos'), dictionary)
train_x_neg = grab_data(os.path.join(path, 'train', 'neg'), dictionary)
X_train = train_x_pos + train_x_neg
y_train = [1] * len(train_x_pos) + [0] * len(train_x_neg)

#Read test sentences and generate target y (1 = positive, 0 = negative)
test_x_pos = grab_data(os.path.join(path, 'test', 'pos'), dictionary)
test_x_neg = grab_data(os.path.join(path, 'test', 'neg'), dictionary)
X_test = test_x_pos + test_x_neg
y_test = [1] * len(test_x_pos) + [0] * len(test_x_neg)



Tokenizing.. Done
Building dictionary.. 7113725  total words  101758  unique words
Tokenizing.. Done
Tokenizing.. Done
Tokenizing.. Done
Tokenizing.. Done


In [4]:
#Save to use in other models
# imdb.pkl holds two consecutive pickles: (X_train, y_train) followed by
# (X_test, y_test); readers must call load() twice, in that order.
# Protocol -1 selects the highest pickle protocol available.
# `with` closes each file even if a dump raises.
with open('imdb.pkl', 'wb') as f:
    pkl.dump((X_train, y_train), f, -1)
    pkl.dump((X_test, y_test), f, -1)

with open('imdb.dict.pkl', 'wb') as f:
    pkl.dump(dictionary, f, -1)


In [5]:
'''Train a LSTM on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF+LogReg.
Notes:
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
GPU command:
    THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_lstm.py
'''

import numpy as np
# Seed NumPy's global RNG before the Keras imports, as in the original example.
np.random.seed(1337)  # for reproducibility

# NOTE(review): np_utils and imdb are not referenced by any cell shown in this
# notebook; the data comes from the preprocessing cells above instead.
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb

# Vocabulary size: word ids >= max_features are recoded by remove_features,
# and the Embedding layers are created with this many rows.
max_features = 20000
# Sequence length passed to pad_sequences.
# NOTE(review): pad_sequences is called with its default padding/truncating
# modes; in Keras these default to 'pre', which would keep the LAST maxlen
# words rather than the first -- confirm for the installed version.
maxlen = 100  # cut texts after this number of words (among top max_features most common words)



Using gpu device 0: GeForce GTX TITAN Black (CNMeM is disabled)


In [6]:
#Keep only word ids below max_features; every other id is recoded as 0
def remove_features(x):
    """Return a copy of ``x`` with rare word ids replaced by 0.

    ``x`` is a list of sentences, each a list of integer word ids. Ids greater
    than or equal to the module-level ``max_features`` become 0; all other ids
    are kept unchanged. The input lists are not modified.
    """
    filtered = []
    for sen in x:
        kept = []
        for w in sen:
            kept.append(0 if w >= max_features else w)
        filtered.append(kept)
    return filtered

# Re-running this cell is harmless: already-filtered ids are all < max_features.
X_train = remove_features(X_train)
X_test  = remove_features(X_test)


In [7]:
# Cut or complete the sentences to length = maxlen
print("Pad sequences (samples x time)")

# pad_sequences returns 2-D arrays with one row per sentence and maxlen columns.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Format the message explicitly: under the Python 2 print statement,
# print('label', value) prints a tuple repr instead of "label value".
print('X_train shape: %s' % (X_train.shape,))
print('X_test shape: %s' % (X_test.shape,))

Pad sequences (samples x time)
('X_train shape:', (25000, 100))
('X_test shape:', (25000, 100))


In [8]:
# Model 1: embedding -> single LSTM -> dropout -> one sigmoid output unit.
print('Build model 1...')
model = Sequential()
# One 128-dimensional vector per word id in [0, max_features); inputs are
# fixed-length sequences of maxlen ids.
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(LSTM(128))  # try using a GRU instead, for fun
model.add(Dropout(0.5))
# Single unit + sigmoid: the output is used as the positive-class score
# (labels are 1 = positive, 0 = negative).
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
# NOTE(review): class_mode is an argument of the old Keras API this notebook
# was written against -- confirm it exists in the installed version.
model.compile(loss='binary_crossentropy',
              optimizer='Adam',
              class_mode="binary")


Build model 1...


In [9]:
#Train model 1 for 3 epochs in mini-batches of 128 samples
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation figures are not an independent estimate.
print("Train...")
batch_size = 128

model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    nb_epoch=3,
    validation_data=(X_test, y_test),
    show_accuracy=True
)


Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4a20e08110>

In [10]:
#Evaluate accuracy in test set
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size,
                            show_accuracy=True)
print 'Test score: ', score
print 'Test accuracy: ', acc

from sklearn.metrics import roc_auc_score
print 'AUC: ', roc_auc_score(y_test, model.predict_proba(X_test))


Test score:  0.432894129109
Test accuracy:  0.82684
 0.9109590272


In [12]:
# Model 2: embedding -> two stacked 512-unit LSTMs (dropout after each)
# -> one sigmoid output unit.
print('Build model 2...')
model2 = Sequential()
# One 128-dimensional vector per word id in [0, max_features); inputs are
# fixed-length sequences of maxlen ids.
model2.add(Embedding(max_features, 128, input_length=maxlen))
# The first LSTM returns its output at every time step so the second LSTM
# receives a full sequence; the second returns only its final output.
model2.add(LSTM(512, return_sequences=True))
model2.add(Dropout(0.5))
model2.add(LSTM(512, return_sequences=False))
model2.add(Dropout(0.5))
# Single unit + sigmoid: the output is used as the positive-class score
# (labels are 1 = positive, 0 = negative).
model2.add(Dense(1))
model2.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
# NOTE(review): class_mode is an argument of the old Keras API this notebook
# was written against -- confirm it exists in the installed version.
model2.compile(loss='binary_crossentropy',
              optimizer='Adam',
              class_mode="binary")


Build model 2...


In [13]:
#Train model 2 for 10 epochs in mini-batches of 128 samples
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation figures are not an independent estimate.
print("Train...")
batch_size = 128

model2.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    nb_epoch=10,
    validation_data=(X_test, y_test),
    show_accuracy=True
)


Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4a03dc66d0>

In [14]:
#Evaluate accuracy in test set
score, acc = model2.evaluate(X_test, y_test,
                             batch_size=batch_size,
                             show_accuracy=True)
print 'Test score: ', score
print 'Test accuracy: ', acc

from sklearn.metrics import roc_auc_score
print 'AUC: ', roc_auc_score(y_test, model2.predict_proba(X_test))


Test score:  0.677513689604
Test accuracy:  0.81308
 0.8855979264
