In [2]:
from __future__ import print_function, division
import wget
import tarfile

In [1]:
# Location of the IMDB review archive (gzip-compressed tar) that the following cells download and unpack.
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

In [3]:
# Fetch the archive; the next cell expects to find it at ./aclImdb_v1.tar.gz.
# NOTE(review): this runs on every execution of the cell - consider skipping
# the download when the file is already present.
file_get = wget.download(url)

In [4]:
# Open the downloaded archive for reading.
# NOTE(review): the handle is never closed; opening and extracting inside one
# `with tarfile.open(...) as file_tar:` block would release it deterministically.
file_tar = tarfile.open('aclImdb_v1.tar.gz')

In [5]:
# Unpack every member into the current working directory; later cells read
# from the resulting aclImdb/ directory.
# NOTE(review): extractall() trusts the paths stored in the archive. On Python
# versions that support extraction filters, consider passing filter='data'.
file_tar.extractall()

In [7]:
# Vocabulary file: one word per line, most frequent word first.
with open('aclImdb/imdb.vocab', encoding="UTF-8") as f:
    # Only the 5000 most frequent of the ~90000 words are kept: this saves
    # memory, and the long tail occurs too rarely for the model to learn
    # anything from it anyway.
    vocab = list(map(str.rstrip, f))[:5000]
print('%d words in vocabulary' % (len(vocab),))

5000 words in vocabulary


In [8]:
import re

def text_tokens(text):
    """Split raw text into lowercase word tokens.

    The text is lowercased, every whitespace character is turned into a
    plain space, and every character other than ASCII letters, apostrophes
    and spaces is deleted (so "5th" becomes "th" and "#BetterBuses" becomes
    "betterbuses"). What remains is split on runs of spaces.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings; an empty list when the text
        contains no letters or apostrophes.
    """
    text = text.lower()
    # Map tabs/newlines to spaces *before* stripping symbols, otherwise words
    # separated only by a newline would be glued together by the next step.
    text = re.sub(r"\s", " ", text)
    text = re.sub(r"[^a-zA-Z' ]", "", text)
    # split() without an argument collapses consecutive spaces. split(' ')
    # would emit an empty-string token for every extra space left behind by
    # removed punctuation, and [''] for an empty text.
    return text.split()

In [16]:
import os

def load_dataset(dirname):
    """Load tokenized reviews and binary labels from one dataset split.

    ``dirname`` must contain the sub-directories ``neg`` and ``pos``. Every
    entry in them is opened as a UTF-8 text file and tokenized with
    ``text_tokens``.

    Args:
        dirname: Path of the split to load, e.g. ``'aclImdb/train/'``.

    Returns:
        A tuple ``(X, y)`` of equal-length lists: ``X[i]`` is the token list
        of one review and ``y[i]`` its label (0 for 'neg', 1 for 'pos').
        All 'neg' reviews come first; within a class the reviews follow
        sorted filename order.
    """
    X, y = [], []
    # Review files: neg/0_3.txt neg/10000_4.txt neg/10001_4.txt ...
    for y_val, y_label in enumerate(['neg', 'pos']):
        y_dir = os.path.join(dirname, y_label)
        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order; sorting makes the sample order identical across runs and
        # machines, which keeps training reproducible.
        for fname in sorted(os.listdir(y_dir)):
            fpath = os.path.join(y_dir, fname)
            # print('\r' + fpath + '   ', end='')
            with open(fpath, encoding="UTF-8") as f:
                tokens = text_tokens(f.read())
            X.append(tokens)
            y.append(y_val)  # 0 for 'neg', 1 for 'pos'
    # Ends the '\r' progress line when the progress print above is enabled;
    # with it disabled this just emits one blank line.
    print()
    return X, y

In [17]:
# Token lists and 0/1 labels of the training split.
X_train, y_train = load_dataset('aclImdb/train/')

# We are cheating here - this is the test set, not a validation set.
# This is just to make results quickly comparable to outside results
# during the tutorial, but you should normally never use the test set
# during training, of course!
X_val, y_val = load_dataset('aclImdb/test/')





In [18]:
def _vocab_positions():
    """Return a word -> first-index dict for the current global ``vocab``.

    The dict is cached on this function object and rebuilt whenever ``vocab``
    is rebound to another list or changes length. An in-place edit that keeps
    the length unchanged is not detected.
    """
    cache = _vocab_positions.__dict__
    if cache.get('source') is not vocab or cache.get('size') != len(vocab):
        positions = {}
        for position, word in enumerate(vocab):
            # First occurrence wins, matching what list.index() would return.
            positions.setdefault(word, position)
        cache.update(source=vocab, size=len(vocab), positions=positions)
    return cache['positions']


def bow_onehot_vector(tokens):
    """Encode a token list as a binary bag-of-words vector over ``vocab``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of ``len(vocab)`` ints; element ``i`` is 1 when ``vocab[i]``
        occurs at least once in ``tokens`` and 0 otherwise. Tokens that are
        not in the vocabulary are ignored.
    """
    # A dict lookup replaces vocab.index(), which scans the whole vocabulary
    # for every token. It also removes the need for a bare `except:` that
    # would hide unrelated errors (even KeyboardInterrupt).
    positions = _vocab_positions()
    vector = [0] * len(vocab)
    for t in tokens:
        position = positions.get(t)
        if position is not None:
            vector[position] = 1
    return vector

In [19]:
from tqdm import tqdm

# Encode every review of both splits as a bag-of-words vector; each tqdm
# wrapper shows one progress bar.
X_bow_train = list(map(bow_onehot_vector, tqdm(X_train)))
X_bow_val = list(map(bow_onehot_vector, tqdm(X_val)))

100%|██████████| 25000/25000 [01:14<00:00, 333.94it/s]
100%|██████████| 25000/25000 [01:15<00:00, 331.91it/s]


In [20]:
def best_train_history(history):
    """Print train and validation accuracy of the best validation epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists, such as the value returned by
            ``Model.fit``. The accuracy metric may be logged under
            ``'acc'``/``'val_acc'`` or ``'accuracy'``/``'val_accuracy'``.

    Raises:
        KeyError: If neither accuracy naming is present in the log.
    """
    # Imported locally so the function does not depend on a later cell having
    # brought `np` into the notebook namespace.
    import numpy as np

    logs = history.history
    # Keras versions differ in how metrics=['accuracy'] is logged: some use
    # 'acc', others 'accuracy'. Accept whichever one is present.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    val_key = 'val_' + acc_key
    best_epoch = np.argmax(logs[val_key])  # first epoch with the top score
    print('Accuracy (epoch %d): %.4f train, %.4f val' %
          (best_epoch + 1, logs[acc_key][best_epoch], logs[val_key][best_epoch]))
# (Note that sentiment.model is the state after the last epoch rather than the best epoch!
# Use the ModelCheckpoint callback to keep and restore the best epoch.)

In [21]:
from keras.layers import Activation, Dense, Input
from keras.models import Model
import numpy as np

class BOWSentimentModel(object):
    """Single-unit sentiment classifier over bag-of-words vectors.

    The network is one ``Dense(1)`` unit followed by a sigmoid, compiled
    with the Adam optimizer and binary cross-entropy loss.
    """

    def __init__(self, input_dim=None):
        """Build, summarize and compile the Keras model.

        Args:
            input_dim: Length of the input vectors. Defaults to
                ``len(vocab)``, the size of the notebook's global vocabulary.
        """
        if input_dim is None:
            input_dim = len(vocab)
        bow = Input(shape=(input_dim,), name='bow_input')
        # weights of all inputs
        sentiment = Dense(1)(bow)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        self.model = Model(inputs=[bow], outputs=[sentiment])
        self.model.summary()
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    @staticmethod
    def _features(X):
        """Convert a sequence of 0/1 vectors to a float32 matrix.

        Converting directly to float32 (Keras' default float type) avoids
        first materializing the nested Python lists as a wider default
        integer array.
        """
        return np.asarray(X, dtype='float32')

    def train(self, X, y, X_val, y_val, epochs=10, verbose=1):
        """Fit the model and return what ``Model.fit`` returns.

        Args:
            X: Training vectors, one per sample.
            y: Training labels, one per sample.
            X_val: Validation vectors.
            y_val: Validation labels.
            epochs: Number of passes over the training data (default 10).
            verbose: Verbosity level forwarded to ``Model.fit`` (default 1).
        """
        print('Fitting...')
        return self.model.fit(
            self._features(X), np.array(y),
            validation_data=(self._features(X_val), np.array(y_val)),
            epochs=epochs, verbose=verbose)

    def predict(self, X):
        """Return the model output for each input vector in ``X``."""
        return self.model.predict(self._features(X))
    
# Build the model (the constructor also prints the layer summary), then fit it
# on the bag-of-words vectors. The test split is passed as validation data.
# `history` keeps the value returned by train() for later inspection.
sentiment = BOWSentimentModel()
history = sentiment.train(X_bow_train, y_train, X_bow_val, y_val)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bow_input (InputLayer)      [(None, 5000)]            0         
                                                                 
 dense (Dense)               (None, 1)                 5001      
                                                                 
 activation (Activation)     (None, 1)                 0         
                                                                 
Total params: 5,001
Trainable params: 5,001
Non-trainable params: 0
_________________________________________________________________
Fitting...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Run the trained model on two sample texts: each text is printed, followed
# by the model output for its bag-of-words vector.
for test_text in (
    'No, it won’t do ANY of those things. It will just make life more miserable for Bronx residents. Maybe fix a highway, raise the speed limit or get rid of the bike lanes. THAT might actually help residents and businesses in The Bronx.',
    'any updates on the 5th Avenue Bus way #BetterBuses ?',
):
    test_tokens = text_tokens(test_text)
    print(test_text)
    print(sentiment.predict([bow_onehot_vector(test_tokens)])[0])

No, it won’t do ANY of those things. It will just make life more miserable for Bronx residents. Maybe fix a highway, raise the speed limit or get rid of the bike lanes. THAT might actually help residents and businesses in The Bronx.
[0.36745435]
any updates on the 5th Avenue Bus way #BetterBuses ?
[0.562199]


In [33]:
# Run the trained model on three more sample texts: each text is printed,
# followed by the model output for its bag-of-words vector.
for test_text in (
    'Yes! People like pedestrian plazas, dont let all the suburbanites that dont live here (just drive in for the day) tell you otherwise. Focus on the ppl that live in NYC, not the ones that commute to it.',
    '@ThunderGawdKen did they inform the community about this?',
    'Im gonna take a walk down there to see if theres any signage. This was touched upon at the last meeting for the Transportation Committee for CB2: DOT is moving without any input from CB2, and tends to move in secrecy for the past few years, especially in this neighborhood 🤦🏿‍♂️',
):
    test_tokens = text_tokens(test_text)
    print(test_text)
    print(sentiment.predict([bow_onehot_vector(test_tokens)])[0])

Yes! People like pedestrian plazas, dont let all the suburbanites that dont live here (just drive in for the day) tell you otherwise. Focus on the ppl that live in NYC, not the ones that commute to it.
[0.38242462]
@ThunderGawdKen did they inform the community about this?
[0.52011335]
Im gonna take a walk down there to see if theres any signage. This was touched upon at the last meeting for the Transportation Committee for CB2: DOT is moving without any input from CB2, and tends to move in secrecy for the past few years, especially in this neighborhood 🤦🏿‍♂️
[0.6647847]


In [31]:
# Write the trained Keras model to saved_model/model.h5 in HDF5 format.
# NOTE(review): this cell's execution count (31) is lower than the cells above
# it; confirm the notebook still works with Restart Kernel -> Run All.
sentiment.model.save('saved_model/model.h5', save_format='h5')