In [1]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping

import pandas as pd
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the preprocessed tweets and keep only the first rows (DataFrame.head()),
# so the rest of the notebook runs on a tiny smoke-test subset.
df = pd.read_csv('full_preprocessed_data.csv').head()

In [7]:
# Settings shared with build_model_lstm (it reads both names as globals).
vocabulary_size = 100000  # passed to the Tokenizer as num_words
max_length = 32           # every sequence is padded/truncated to this length

# Column holding the text to tokenise.
tweets = df['tweets']

# Build the word index from the tweets.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(tweets)

# Features: integer-encoded tweets brought to a fixed length of max_length.
sequences = tokenizer.texts_to_sequences(tweets)
X = pad_sequences(sequences, maxlen=max_length)

# Targets: the sentiment column as a plain Python list.
y = list(df['sentiment'])

In [9]:
def build_model_lstm(X,
                     y,
                     filepath="LSTM_best_weights.hdf5",
                     callbacks_list=None,
                     Embedding_size=200,
                     batch_size=16384,
                     validation_split=0.04,
                     epochs=100):
    """
    Build, compile and train a Long Short-Term Memory network.

    The architecture is Embedding -> LSTM -> Dense(1, sigmoid), compiled with
    binary cross-entropy loss, the Adam optimizer and the accuracy metric.

    NOTE: the embedding layer is sized from the module-level names
    `vocabulary_size` and `max_length`; both must be defined before this
    function is called.

    INPUT:
        X : Multidimensional list - The training features (padded integer
                                    sequences; presumably max_length wide,
                                    to match the embedding input_length)
        y : list                  - The training targets, one per row of X
        filepath                  - Where the default ModelCheckpoint saves
                                    the best model. Ignored when an explicit
                                    callbacks_list is supplied.
        callbacks_list            - The callbacks passed to fit(). None (the
                                    default) builds a fresh ModelCheckpoint
                                    writing to `filepath` plus an
                                    EarlyStopping with patience 3, both
                                    monitoring 'val_acc'. Pass [] to train
                                    without any callbacks.
        Embedding_size            - The size of the embedding vectors; also
                                    used as the number of LSTM units
        batch_size                - The batch size used during training
        validation_split          - The fraction of the data held out for
                                    validation
        epochs                    - The maximum number of epochs

    OUTPUT:
        Returns a tuple (model, history): the trained model and the History
        object returned by fit().
    """
    # Create the default callbacks per call. Building them in the signature
    # would evaluate them once at definition time and share the same
    # instances (and whatever state they accumulate) across every call.
    if callbacks_list is None:
        callbacks_list = [
            ModelCheckpoint(
                filepath,
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='max'),
            EarlyStopping(monitor='val_acc', patience=3, mode='max'),
        ]

    model_lstm = Sequential()
    model_lstm.add(
        Embedding(vocabulary_size, Embedding_size, input_length=max_length))
    model_lstm.add(LSTM(Embedding_size, dropout=0.2, recurrent_dropout=0.2))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.compile(
        loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model_lstm.fit(
        X,
        # Hand Keras an array rather than a Python list of scalars, which
        # not every Keras version accepts as a target.
        np.asarray(y),
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks_list)

    return model_lstm, history

In [10]:
# Bind the results so the trained model and its training history stay
# available to later cells instead of only living in the cell's Out[] value.
model_lstm, history = build_model_lstm(X, y)

Train on 4 samples, validate on 1 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 1.00000, saving model to LSTM_best_weights.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 1.00000
Epoch 3/100

Epoch 00003: val_acc did not improve from 1.00000
Epoch 4/100

Epoch 00004: val_acc did not improve from 1.00000


(<keras.engine.sequential.Sequential at 0xb2def6d68>,
 <keras.callbacks.History at 0xb33123e80>)