# IMDB Sentiment Classifier

In [1]:
import os
import sys

# Location of the sentiment-classifier checkout that provides the
# `sentiment_classifier` package. Set the SENTIMENT_CLASSIFIER_ROOT environment
# variable to run this notebook on another machine; the default keeps the
# original hard-coded location.
PROJECT_ROOT = os.environ.get(
    "SENTIMENT_CLASSIFIER_ROOT",
    "/Users/eric/Code/sentiment-classifier"
)

# Only append once so that re-running this cell does not grow sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [40]:
import logging
import tensorflow as tf

from sentiment_classifier.nlp.reader import IMDBReader
from sentiment_classifier.nlp.preprocessing import clean_text
from sentiment_classifier.nlp.models import Model
from sentiment_classifier.nlp.tokenizer import KerasTokenizer
from sentiment_classifier.nlp.utils import load_word_vectors

# Configure root logging at INFO level so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)

## Preparing Data

### Preprocessing

In [3]:
# Text cleaning function
# Run clean_text on one sample that contains punctuation and an HTML <br> tag.
input_sequence = "Hi there, I loved this movie! <br>"
cleaned_sequence = clean_text(input_sequence)

# Per the recorded output, punctuation ends up separated from the words by
# spaces and the <br> tag is removed.
print(cleaned_sequence)

Hi there ,  I loved this movie !


### Loading the dataset

In [18]:
# Dataset reader
# The aclImdb directory is given relative to the notebook's working directory.
imdb = IMDBReader("../../data/aclImdb/")

# Load the dataset, passing clean_text as the preprocessing function.
# NOTE(review): with limit=5000 the recorded shapes are 10000 rows per split,
# so the limit presumably applies per class -- confirm in IMDBReader.
imdb.load_dataset(
    preprocessing_function=clean_text,
    limit=5000
)

In [19]:
# Sanity check: show the shapes of the loaded train and test splits.
print(imdb.train_data.shape, imdb.test_data.shape)

(10000, 2) (10000, 2)


### Tokenizing the dataset

In [6]:
# Getting a Tokenizer
# A short pad_max_len keeps the toy example below readable.
# NOTE(review): lower=True presumably lower-cases the text (the vocabulary
# printed by the toy example is all lower-case) -- confirm in KerasTokenizer.
tokenizer = KerasTokenizer(
    pad_max_len=20,
    lower=True
)

In [7]:
# Toy example
# Fit the tokenizer on the single cleaned sentence from the preprocessing cell.
tokenizer.fit([cleaned_sequence])

# Transform the same sentence with the fitted tokenizer.
tokenized_sequences = tokenizer.transform([cleaned_sequence])

# Per the recorded output: the ids are left-padded with 0 up to length 20
# (the pad_max_len set above) and word_index maps each token to an id
# starting at 1.
print(tokenized_sequences)
print(len(tokenized_sequences[0]))
print(tokenizer.tokenizer.word_index)

[[0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 7 8]]
20
{'hi': 1, 'there': 2, ',': 3, 'i': 4, 'loved': 5, 'this': 6, 'movie': 7, '!': 8}


## Building a model

In [44]:
class ExampleModel(Model):
    """Binary sentiment classifier built on frozen pre-trained word vectors.

    Token ids are embedded with pre-trained vectors (kept non-trainable),
    averaged over the sequence with ``GlobalAveragePooling1D`` and passed
    through three ReLU dense layers (128, 64, 16 units) to a single sigmoid
    output unit.

    Args:
        pad_max_len: Value passed to ``KerasTokenizer`` as ``pad_max_len``.
        word_vectors_path: Path of the word-vector file handed to
            ``load_word_vectors``.
        vector_size: Dimensionality of the word vectors in that file.

    All arguments default to the values that were previously hard-coded, so
    ``ExampleModel()`` behaves as before.
    """

    def __init__(self,
                 pad_max_len=512,
                 word_vectors_path="../../data/wiki-news-300d-1M.vec",
                 vector_size=300):
        # Zero-argument super() also keeps working when this cell is re-run
        # and the name ``ExampleModel`` is rebound to a new class object.
        super().__init__()

        self.word_vectors_path = word_vectors_path
        self.vector_size = vector_size

        self.tokenizer = KerasTokenizer(
            pad_max_len=pad_max_len,
            lower=True
        )

    def build_model(self, input_shape):
        """Create the (uncompiled) Keras model.

        Args:
            input_shape: Accepted for interface compatibility; it is not used
                here, and the embedding layer is built without a fixed
                sequence length.

        Returns:
            A ``tf.keras.Sequential`` model ending in one sigmoid unit.
        """
        # The word index is read from the tokenizer at call time.
        # NOTE(review): this presumably requires the tokenizer to be fitted
        # before build_model runs (train() calls _make_training_data first)
        # -- confirm against Model._make_training_data.
        word_vectors = load_word_vectors(
            filepath=self.word_vectors_path,
            word_index=self.tokenizer.tokenizer.word_index,
            vector_size=self.vector_size
        )

        model = tf.keras.Sequential([
            # Embedding matrix dimensions come from the loaded vectors; the
            # weights are frozen so only the dense layers are trained.
            tf.keras.layers.Embedding(
                word_vectors.shape[0],
                word_vectors.shape[1],
                weights=[word_vectors],
                trainable=False
            ),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(128, activation=tf.nn.relu),
            tf.keras.layers.Dense(64, activation=tf.nn.relu),
            tf.keras.layers.Dense(16, activation=tf.nn.relu),
            tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
        ])

        return model

    def train(self, reader, filepath, epochs=5, validation_split=0.1):
        """Build, fit, evaluate and save the model.

        Args:
            reader: Dataset reader passed to ``self._make_training_data``.
            filepath: Destination passed to ``self.save``.
            epochs: Number of training epochs (default 5, as before).
            validation_split: Fraction of the training data that Keras holds
                out for validation (default 0.1, as before).
        """
        x_train, x_test, y_train, y_test = self._make_training_data(reader)

        self.model = self.build_model(input_shape=x_train.shape[1])

        self.model.compile(loss="binary_crossentropy",
                           optimizer="adam",
                           metrics=["accuracy"])

        self.model.summary()

        print("\nTraining")

        # NOTE(review): Keras takes the validation split from the *last*
        # samples of x/y before shuffling. If _make_training_data returns
        # rows ordered by label, the validation set would be skewed towards
        # one class -- confirm the data is shuffled upstream.
        self.model.fit(x=x_train,
                       y=y_train,
                       validation_split=validation_split,
                       epochs=epochs)

        print("\nEvaluate on test data")

        self.model.evaluate(x_test, y_test)

        self.save(filepath)

In [45]:
# Instantiate the classifier; its constructor creates its own KerasTokenizer,
# separate from the toy `tokenizer` used earlier in the notebook.
model = ExampleModel()

In [46]:
# Train on the IMDB reader loaded above; train() also prints the model
# summary, evaluates on the test data and passes "/tmp/" to the model's save().
model.train(reader=imdb, filepath="/tmp/")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 300)         18293700  
_________________________________________________________________
global_average_pooling1d_10  (None, 300)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 128)               38528     
_________________________________________________________________
dense_29 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_30 (Dense)             (None, 16)                1040      
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 17        
Total params: 18,341,541
Trainable params: 47,841
Non-trainable params: 18,293,700
___________________________________________________________