# **Import Necessary Libraries**

In [1]:
# Import necessary libraries
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import f1_score


# **Load the IMDb Dataset From a CSV File**

In [2]:
# Load the IMDb dataset from a CSV file
file_path = '/content/drive/MyDrive/Colab Notebooks/IMDB_Dataset.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', error_bad_lines=False)




  data = pd.read_csv(file_path, encoding='ISO-8859-1', error_bad_lines=False)


# **Preprocess the Dataset**

In [3]:
def preprocess_text(text):
    # 1. Remove HTML tags
    clean_text = re.sub('<[^>]*>', '', text)

    # 2. Remove special characters, URLs, and email addresses
    clean_text = re.sub('[^\w\s]', ' ', clean_text)
    clean_text = re.sub('\S*@\S*\s?', '', clean_text)
    clean_text = re.sub(r'http\S+', '', clean_text)

    # 3. Remove numbers
    clean_text = re.sub('\d+', '', clean_text)

    # 4. Remove extra white space
    clean_text = re.sub('\s+', ' ', clean_text)

    # 5. Remove punctuation and perform tokenization and Convert text to lowercase 
    tokenized_text = simple_preprocess(clean_text, deacc=True)

    return tokenized_text

In [4]:
# Preprocess the dataset
reviews = data['review']
sentiments = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Preprocess and tokenize the reviews
tokenized_reviews = [preprocess_text(review) for review in reviews]

# Preprocess and tokenize the reviews
# tokenized_reviews = [simple_preprocess(review) for review in reviews]

# Create a tokenizer and fit it on the reviews
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(reviews)

# Convert the tokenized reviews to sequences
sequences = tokenizer.texts_to_sequences(reviews)



# **Train the Word2Vec Model**

In [5]:
# Train the Word2Vec model with hyperparameters
embedding_dim = 128
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=embedding_dim, window=7, min_count=2, workers=4, sg=1, epochs=10)


# **Create an Embedding Matrix for the Embedding Layer**

In [6]:
# Create an embedding matrix for the Embedding layer
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, index in tokenizer.word_index.items():
    if index < max_words:
        if word in word2vec_model.wv:
            embedding_matrix[index] = word2vec_model.wv[word]



# **Pad Sequences to Have the Same Length**

In [7]:
# Pad sequences to have the same length
maxlen = 500
x = pad_sequences(sequences, maxlen=maxlen)
y = np.array(sentiments)


# **Split the Data into Training and Testing Sets**

In [8]:
# Split the data into training and testing sets (80% training, 20% testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# **Define the Mbi-GRUMConv Model**

In [9]:
# Define the Mbi-GRUMConv model with parameters
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Dropout(0.3))
model.add(Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.001))))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))






# **Define the LSTM Model**

In [14]:
# Define the Bi-LSTM-Conv model with the updated parameters
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.001))))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))





# **Use Adam optimizer**

In [15]:
# Use Adam optimizer with a learning rate scheduler
optimizer = Adam(learning_rate=0.001)

# Compile the model with binary_crossentropy loss and accuracy metric
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# **Train the model**

In [16]:
# Train the model with early stopping and learning rate reduction
batch_size = 64
epochs = 30
validation_split = 0.1

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6, verbose=1)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 dropout_1 (Dropout)         (None, 500, 128)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 256)         263168    
 nal)                                                            
                                                                 
 batch_normalization_1 (Batc  (None, 500, 256)         1024      
 hNormalization)                                                 
                                                                 
 conv1d_1 (Conv1D)           (None, 498, 128)          98432     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)             

# **Fit the model**

In [18]:
# Fit the model using the training data, with validation, and the specified callbacks
checkpoint = ModelCheckpoint('M_model_weights_{epoch:02d}.h5', save_weights_only=False, save_freq='epoch')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=validation_split, callbacks=[early_stopping, reduce_lr,checkpoint])



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 14: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 19: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 26: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 27/30


<keras.callbacks.History at 0x7f9cf9b670d0>

# **Evaluate the model**

In [19]:
# Evaluate the model on the test set
scores = model.evaluate(x_test, y_test, batch_size=batch_size)
# Print the test accuracy
print(f"Test accuracy: {scores[1]}")

# Get predictions for the test set
y_pred = model.predict(x_test)

# Convert the predictions to binary by setting a threshold
y_pred_binary = np.where(y_pred >= 0.5, 1, 0)

# Compute and print the F1 score
f1 = f1_score(y_test, y_pred_binary)
print(f"F1 score: {f1}")


Test accuracy: 0.9171000123023987
F1 score: 0.9182848693937901
