In [None]:
import os
import pickle

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Bidirectional, Embedding
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from PIL import Image
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# The tweets used for model building have already been preprocessed.
# Read the first 500 rows, discard every row that contains a missing value,
# and expose the columns under the names the rest of the notebook uses.
tweets_df = (
    pd.read_csv("data/dataset.csv", encoding="UTF-8", nrows=500)
    .dropna()
    .rename(columns={"selected_text": "text", "sentiment": "label"})
)

In [None]:
# Data processing
## Tokenize tweets and encode labels
MAX_WORDS = 25000   # the tokenizer only emits word indices below this value
MAX_LEN = 200       # every encoded tweet is padded/truncated to this length
NUM_CLASSES = 3     # width of the one-hot label vectors
TOKENIZER_PATH = "./model/tokenizer.pickle"

tweet = tweets_df.text.values
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(tweet)

# Save tokenizer
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ./model does not exist yet.
os.makedirs(os.path.dirname(TOKENIZER_PATH), exist_ok=True)
with open(TOKENIZER_PATH, "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# word_index lists every word seen during fitting, but texts_to_sequences
# drops indices >= num_words. Cap the vocabulary size accordingly so the
# embedding table is not larger than the indices that can actually occur.
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)
encoded_docs = tokenizer.texts_to_sequences(tweet)
tweets = pad_sequences(encoded_docs, maxlen=MAX_LEN)

## Labels
# NOTE(review): assumes the "label" column already holds integer class ids
# in [0, NUM_CLASSES) - confirm against the preprocessing that produced the CSV.
labels = to_categorical(tweets_df["label"].values, NUM_CLASSES, dtype="float32")

In [None]:
# Split dataset
# 30 % of the samples are held out; the fixed random_state makes the
# partition repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.3,
    random_state=0,
)
# Sizes are printed in the order: X_train, X_test, y_train, y_test.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Build model
# Architecture: Embedding -> bidirectional LSTM -> dropout -> softmax classifier.
model = Sequential()
# Read the sequence length and the number of classes from the prepared arrays
# so this cell cannot drift out of sync with the preprocessing step.
model.add(Embedding(vocab_size, 32, input_length=X_train.shape[1]))
model.add(Bidirectional(LSTM(50, dropout=0.2)))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Checkpoint the full model (architecture + weights) whenever validation
# accuracy improves, so the best epoch is not lost as training continues.
# The deprecated `period=1` argument is omitted: the default
# save_freq="epoch" already checks after every epoch.
checkpoint = ModelCheckpoint(
    "./model/model.hdf5",
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    mode="auto",
    save_weights_only=False,
)

# NOTE(review): the held-out test set is also used as validation data, so the
# checkpoint is selected on the same samples that are evaluated afterwards;
# the reported accuracy is therefore optimistic. Consider a separate
# validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=70,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint],
)

# summary() prints the table itself and returns None, so it is called
# directly instead of being wrapped in print().
model.summary()

In [None]:
# Load the best model obtained during training, i.e. the checkpoint file
# written to ./model/model.hdf5. `model` is rebound to the restored model.
model = load_model("./model/model.hdf5")

# Evaluate the restored model on the held-out data. The evaluation must go
# through `model`: no variable named `best_model` is ever defined, so the
# previous call raised NameError.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

In [None]:
# Print the layer-by-layer summary of `model`, which the previous cell
# rebinds to the checkpoint loaded from ./model/model.hdf5.
model.summary()