<a href="https://colab.research.google.com/github/daini10421/Mini-Project-51-Sentiment-Analysis-Using-LSTM-RNN-and-Bidirectional-LSTM/blob/main/Mini_Project_51_Sentiment_Analysis_Using_LSTM%2C_RNN_and_Bidirectional_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Utilities
import re
import pickle
import numpy as np
import pandas as pd

# Plot libraries
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. Iterating over the detected devices (rather than indexing [0]) keeps
# this cell from raising IndexError on a CPU-only runtime and applies the same
# setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
# Load the reviews CSV into a DataFrame (later cells read its `text` and `label` columns).
# NOTE(review): this relative path only resolves where the input directory is
# mounted at ../input — confirm before running in another environment.
dataset = pd.read_csv("../input/imdb-movie-ratings-sentiment-analysis/movie.csv")

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Bar chart of the per-label row counts.
ax = (
    dataset
    .groupby('label')
    .count()
    .plot(kind='bar', legend=False, title='Distribution of data')
)
# Relabel the two ticks; assigning the return value keeps the list of tick
# labels out of the cell output.
# NOTE(review): assumes the groups sort as 0 = negative, 1 = positive — confirm.
ax = ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

In [None]:
# Build a lower-cased {contraction: meaning} lookup from the contractions CSV.
# The 'Contraction' column becomes the index, so the dict keys are contractions.
contractions = pd.read_csv('../input/contractions/contractions.csv', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Defining regex patterns.
# linebreaks:        the literal doubled HTML line break, replaced by a space.
# alphaPattern:      any character other than a-z, 0-9, '<' or '>'.
# sequencePattern:   any character repeated three or more times in a row;
# seqReplacePattern: rewrites such a run to exactly two repetitions.
linebreaks        = "<br /><br />"
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for text emoticons. Each pattern is: an "eyes" character
# (8 : = ;), an optional "nose" (' ` -), then the mouth characters.
# They are applied to lower-cased text, hence 'd', 'l' and 'p' in lower case.
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_reviews(review):
    """Normalise one raw review string ahead of tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace the ``linebreaks`` marker with a space;
      3. squeeze any character repeated three or more times down to two;
      4. replace emoticons with placeholder tokens (``<heart>``, ``<smile>``,
         ``<sadface>``, ``<neutralface>``, ``<lolface>``);
      5. expand contractions using ``contractions_dict`` (plain substring
         replacement);
      6. replace every character outside ``a-z0-9<>`` with a space.

    Args:
        review: Raw review text as a ``str``.

    Returns:
        The cleaned text. Runs of spaces are not collapsed; callers split on
        whitespace.
    """
    review = review.lower()

    review = re.sub(linebreaks, " ", review)
    # Replace 3 or more consecutive identical characters by 2.
    review = re.sub(sequencePattern, seqReplacePattern, review)

    # Replace emoticons with placeholder tokens. '<' and '>' survive the final
    # clean-up because alphaPattern deliberately keeps them.
    review = re.sub(r'<3', '<heart>', review)
    review = re.sub(smileemoji, '<smile>', review)
    review = re.sub(sademoji, '<sadface>', review)
    review = re.sub(neutralemoji, '<neutralface>', review)
    review = re.sub(lolemoji, '<lolface>', review)

    # Must run before the clean-up below, which strips the apostrophes that
    # the contraction keys rely on.
    for contraction, replacement in contractions_dict.items():
        review = review.replace(contraction, replacement)

    # Remove non-alphanumeric characters and symbols. This also turns every
    # '/' into a space, so no separate '/' handling is needed afterwards.
    review = re.sub(alphaPattern, ' ', review)
    return review

In [None]:
%%time
# Run preprocess_reviews over every row of the `text` column and store the result.
dataset['cleaned_review'] = dataset.text.apply(preprocess_reviews)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on *before* their first use:
# 'stopwords' backs stopwords.words(), while 'wordnet' and 'omw-1.4' back
# WordNetLemmatizer. nltk.download() skips resources that are already up to
# date, so re-running this cell is cheap.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Show the cleaned text of the row labelled 0 as a sanity check.
dataset["cleaned_review"][0]

In [None]:
def _drop_stopwords_and_lemmatize(text):
    """Return `text` with stop words removed and the remaining tokens lemmatized.

    Tokens are obtained by whitespace splitting; the stop-word check is made
    on the token as it appears, before lemmatization.
    """
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )


# Assign the whole column at once. Writing through chained indexing
# (dataset["cleaned_review"][i] = ...) raises SettingWithCopyWarning, is not
# guaranteed to update the frame, and assumes a 0..n-1 index.
dataset["cleaned_review"] = dataset["cleaned_review"].apply(_drop_stopwords_and_lemmatize)

In [None]:
# Spot-check one fully processed review (the row labelled 69).
print(dataset["cleaned_review"][69])

In [None]:
# Cleaned reviews split by label value: 1 -> data_pos, 0 -> data_neg.
data_pos = dataset.loc[dataset["label"] == 1, "cleaned_review"]
data_neg = dataset.loc[dataset["label"] == 0, "cleaned_review"]

In [None]:
# Word cloud built from all label-1 reviews joined into one string.
wc = WordCloud(collocations=False, max_words=1000, width=1600, height=800)
wc.generate(" ".join(data_pos))  # fills the cloud in place
plt.figure(figsize=(20, 20))
plt.imshow(wc)

In [None]:
# Word cloud built from all label-0 reviews joined into one string.
wc = WordCloud(collocations=False, max_words=1000, width=1600, height=800)
wc.generate(" ".join(data_neg))  # fills the cloud in place
plt.figure(figsize=(20, 20))
plt.imshow(wc)

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Features are the cleaned review strings; targets are the labels.
X_data = np.array(dataset['cleaned_review'])
y_data = np.array(dataset['label'])

# Hold out 5% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.05, random_state=0
)
print('Data Split done.')


In [None]:
from gensim.models import Word2Vec

# Size of each word vector; also used for the embedding matrix and Embedding layers.
Embedding_dimensions = 100

# Word2Vec training corpus: one whitespace-split token list per training review.
Word2vec_train_data = [review.split() for review in X_train]

In [None]:
# Defining the model and training it.
word2vec_model = Word2Vec(Word2vec_train_data,
                 vector_size=Embedding_dimensions,
                 workers=8,
                 min_count=5)

print("Vocabulary Length:", len(word2vec_model.wv.key_to_index))

In [None]:
# Fixed sequence length: used as `maxlen` when padding and as the Embedding layers' input_length.
input_length = 750

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Cap on the token ids emitted by texts_to_sequences; also the embedding row count.
vocab_length = 35000

# No character filtering or lower-casing: the text was already cleaned upstream.
# NOTE(review): the tokenizer is fitted on X_data, i.e. the full corpus
# including the held-out test rows — confirm this is intended.
tokenizer = Tokenizer(
    num_words=vocab_length,
    filters="",
    lower=False,
    oov_token="<oov>",
)
tokenizer.fit_on_texts(X_data)
print("Tokenizer vocab length:", vocab_length)

In [None]:
# Map each review to its token-id sequence ...
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# ... then pad/truncate every sequence to input_length.
X_train = pad_sequences(train_sequences, maxlen=input_length)
X_test = pad_sequences(test_sequences, maxlen=input_length)

print("X_train.shape:", X_train.shape)
print("X_test.shape :", X_test.shape)

In [None]:
# Row i holds the Word2Vec vector of the word with tokenizer id i; rows for
# words without a vector stay all-zero.
embedding_matrix = np.zeros((vocab_length, Embedding_dimensions))

for word, token in tokenizer.word_index.items():
    # word_index is not capped by num_words, so ids can run past the last
    # matrix row. Skip them to avoid an IndexError.
    if token >= vocab_length:
        continue
    if word in word2vec_model.wv:
        embedding_matrix[token] = word2vec_model.wv[word]

print("Embedding Matrix Shape:", embedding_matrix.shape)

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import SimpleRNN,Dense,Activation,Bidirectional,GlobalMaxPool1D
from keras.utils.vis_utils import plot_model

In [None]:
def getModel():
    """Build the (uncompiled) SimpleRNN sentiment classifier.

    Layers: a frozen Embedding initialised from ``embedding_matrix`` ->
    SimpleRNN(100) returning only its final output -> Dense(32, relu) ->
    Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model named ``"Sentiment_Model"``.
    """
    embedding_layer = Embedding(input_dim = vocab_length,
                                output_dim = Embedding_dimensions,
                                weights=[embedding_matrix],
                                input_length=input_length,
                                trainable=False)

    model = Sequential([
        embedding_layer,
        # No input_shape here: the recurrent layer takes its input shape from
        # the Embedding output. The previous
        # input_shape=(vocab_length, input_length) did not describe that input.
        SimpleRNN(100, return_sequences=False, activation="LeakyReLU"),
        Dense(32,activation="relu"),
        Dense(1,activation="sigmoid"),
    ],
     name="Sentiment_Model")
    return model

In [None]:
# Instantiate the SimpleRNN model and print its layer summary.
training_model = getModel()
training_model.summary()

In [None]:
# Write the model architecture diagram (with tensor shapes) to RNN.png.
plot_model(training_model, "RNN.png", show_shapes=True)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Reduce the learning rate when val_loss has not improved for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
# Stop training when val_accuracy has not improved by at least 1e-4 for 5 epochs.
early_stop = EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)

# The same list is passed to every fit() call in this notebook.
callbacks = [reduce_lr, early_stop]

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for the callbacks and plots.
training_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the SimpleRNN model. validation_split=0.1 sets aside part of the
# training arrays for the val_* metrics that the callbacks monitor.
history = training_model.fit(
    X_train, y_train,
    batch_size=1024,
    epochs=20,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

### LSTM


In [None]:
from tensorflow.keras.layers import LSTM, Dropout

In [None]:
def getModel2():
    """Build the (uncompiled) stacked-LSTM sentiment classifier.

    Layers: frozen Embedding initialised from ``embedding_matrix`` ->
    LSTM(64, full sequence) -> Dropout(0.2) -> LSTM(32) -> Dropout(0.2) ->
    Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model named ``"Sentiment_Model_LSTM"``.
    """
    model = Sequential(name="Sentiment_Model_LSTM")

    # Pre-trained vectors, kept fixed during training.
    model.add(Embedding(
        input_dim=vocab_length,
        output_dim=Embedding_dimensions,
        weights=[embedding_matrix],
        input_length=input_length,
        trainable=False,
    ))

    # The first LSTM emits the whole sequence so the second one can consume it.
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(32))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))
    return model

In [None]:
# Instantiate the stacked-LSTM model and print its layer summary.
training_model2 = getModel2()
training_model2.summary()

In [None]:
# Write the model architecture diagram (with tensor shapes) to LSTM.png.
plot_model(training_model2, "LSTM.png", show_shapes=True)

In [None]:
# Same loss, optimizer and metric as the SimpleRNN model.
training_model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the stacked-LSTM model with the same settings and callbacks as the SimpleRNN run.
history2 = training_model2.fit(
    X_train, y_train,
    batch_size=1024,
    epochs=20,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

### Bidirectional LSTM

In [None]:
def getModel3():
    """Build the (uncompiled) bidirectional-LSTM sentiment classifier.

    Layers: frozen Embedding initialised from ``embedding_matrix`` ->
    Bidirectional(LSTM(35, full sequence)) -> GlobalMaxPool1D ->
    Dense(40, relu) -> Dropout(0.5) -> Dense(20, relu) -> Dropout(0.5) ->
    Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model named ``"Sentiment_Model_LSTM_Bidirectional"``.
    """
    model = Sequential(name="Sentiment_Model_LSTM_Bidirectional")

    # Pre-trained vectors, kept fixed during training.
    model.add(Embedding(
        input_dim=vocab_length,
        output_dim=Embedding_dimensions,
        weights=[embedding_matrix],
        input_length=input_length,
        trainable=False,
    ))

    # The full output sequence is kept so that it can be max-pooled over time.
    model.add(Bidirectional(LSTM(35, return_sequences=True)))
    model.add(GlobalMaxPool1D())

    # Dense classifier head with dropout after each hidden layer.
    model.add(Dense(40, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))
    return model

In [None]:
# Instantiate the bidirectional-LSTM model and print its layer summary.
training_model3 = getModel3()
training_model3.summary()

In [None]:
# Write the model architecture diagram (with tensor shapes) to LSTM_bidirectional.png.
plot_model(training_model3, "LSTM_bidirectional.png", show_shapes=True)

In [None]:
# Same loss, optimizer and metric as the other two models.
training_model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the bidirectional-LSTM model with the same settings and callbacks as the other runs.
history3 = training_model3.fit(
    X_train, y_train,
    batch_size=1024,
    epochs=20,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

In [None]:

# One column per model, left to right: (fit history, accuracy title, loss title).
# Row 0 shows accuracy and row 1 shows loss.
history_panels = [
    (history2,
     'Training and validation accuracy - LSTM',
     'Training and validation loss - LSTM'),
    (history,
     'Training and validation accuracy - RNN',
     'Training and validation loss -  RNN'),
    (history3,
     'Training and validation accuracy - LSTM(Bidirectional)',
     'Training and validation loss - LSTM(Bidirectional)'),
]

figure, axis = plt.subplots(2, 3, figsize=(15, 8))

for col, (run, acc_title, loss_title) in enumerate(history_panels):
    metrics = run.history
    # Each model gets its own x-range: early stopping can end runs at different epochs.
    epoch_range = range(len(metrics['accuracy']))

    axis[0, col].plot(epoch_range, metrics['accuracy'], 'b', label='Training acc')
    axis[0, col].plot(epoch_range, metrics['val_accuracy'], 'r', label='Validation acc')
    axis[0, col].set_title(acc_title)
    axis[0, col].legend()

    axis[1, col].plot(epoch_range, metrics['loss'], 'b', label='Training loss')
    axis[1, col].plot(epoch_range, metrics['val_loss'], 'r', label='Validation loss')
    axis[1, col].set_title(loss_title)
    axis[1, col].legend()

figure.tight_layout()
plt.subplots_adjust(wspace=0.4,
                    hspace=0.4)

plt.show()


In [None]:
from sklearn.metrics import classification_report,ConfusionMatrixDisplay , confusion_matrix

# def ConfusionMatrix(y_pred, y_test):
#     # Compute and plot the Confusion matrix
#     cf_matrix = confusion_matrix(y_test, y_pred)



#     sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
#                 xticklabels = categories, yticklabels = categories)

#     plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
#     plt.ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
def get_labels(y_pred,y_test):
    """Build 2x2 annotation labels for a binary confusion matrix.

    Args:
        y_pred: Predicted class labels (0 or 1).
        y_test: True class labels (0 or 1).

    Returns:
        A 2x2 NumPy array of strings; each cell holds the cell name
        ('True Neg', 'False Pos', 'False Neg', 'True Pos') and, on a second
        line, that cell's share of all samples formatted as a percentage.
    """
    # Pin both classes so the matrix is always 2x2, even when only one class
    # occurs in the inputs; otherwise the reshape below would fail.
    cf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    return labels

# Predicting on the Test dataset.
y_pred = training_model.predict(X_test)
y_pred2 = training_model2.predict(X_test)
y_pred3 = training_model3.predict(X_test)

# Converting prediction to reflect the sentiment predicted.
y_pred = np.where(y_pred>=0.5, 1, 0)
y_pred2 = np.where(y_pred2>=0.5, 1, 0)
y_pred3 = np.where(y_pred3>=0.5, 1, 0)

l1 = get_labels(y_pred,y_test)
l2 = get_labels(y_pred2,y_test)
l3 = get_labels(y_pred3,y_test)

# Printing out the Evaluation metrics.
fig, ax = plt.subplots(1,3,figsize=(15,8))
ax[0].set_title("RNN")
ax[1].set_title("LSTM")
ax[2].set_title("LSTM Bidirectional")

ConfusionMatrixDisplay.from_predictions(y_test,y_pred,cmap="Blues" ,ax = ax[0])
ConfusionMatrixDisplay.from_predictions(y_test,y_pred2,cmap="Blues",ax = ax[1])
ConfusionMatrixDisplay.from_predictions(y_test,y_pred3,cmap="Blues" ,ax = ax[2])

plt.show()




In [None]:
# Print the evaluation metrics for the dataset.
# Print the evaluation metrics on the test set for each model, in the order
# RNN, LSTM, bidirectional LSTM.
for model_predictions in (y_pred, y_pred2, y_pred3):
    print(classification_report(y_test, model_predictions))

In [None]:
statement = ["worst day of my life"]
print(statement)

# Apply the same pipeline the training text went through: clean, drop stop
# words, then lemmatize. The lemmatization step was missing here, so the
# tokens did not match the ones the models were trained on.
statement = [
    " ".join(
        lemmatizer.lemmatize(word)
        for word in preprocess_reviews(text).split()
        if word not in stop_words
    )
    for text in statement
]
print(statement)

In [None]:
# Convert the prepared statement(s) to padded token-id sequences.
statement = pad_sequences(tokenizer.texts_to_sequences(statement) , maxlen=input_length)

# Threshold each model's *own* sigmoid output at 0.5. Previously pred2 and
# pred3 were derived from `pred`, so all three results echoed the RNN.
pred = np.where(training_model.predict(statement) >= 0.5, 1, 0)
pred2 = np.where(training_model2.predict(statement) >= 0.5, 1, 0)
pred3 = np.where(training_model3.predict(statement) >= 0.5, 1, 0)

In [None]:
# Show each model's thresholded prediction for the statement (1 or 0).
print("RNN :" , pred)
print("LSTM :" , pred2)
print("LSTM Bidirectional :" , pred3)