## Mounting Google Drive and switching to the main project directory

In [1]:
# Attach Google Drive to the Colab VM so the project folder under MyDrive is reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the project folder the working directory so the local imports
# (sentiment_analysis, utils) and relative paths such as "data/..." resolve from here.
%cd /content/drive/MyDrive/2024-ML-AI-Projects/aviroc-projects/imdb-sentiment-analysis-Transformers

/content/drive/MyDrive/2024-ML-AI-Projects/aviroc-projects/imdb-sentiment-analysis-Transformers


In [3]:
# Confirm the working directory is the project folder before importing local modules.
!pwd

/content/drive/MyDrive/2024-ML-AI-Projects/aviroc-projects/imdb-sentiment-analysis-Transformers


## Importing Necessary Libraries

In [4]:
import numpy as np
import tensorflow as tf

from sentiment_analysis import DataIngestion, DataPreprocessing, FeatureEngineering, custom_model, ModelSaveLoad, MovieSentimentPredictor
from utils import (
    VOCAB_SIZE,
    MAX_LEN,
    EMBED_DIM,
    NUM_HEADS,
    FF_DIM,
    MODEL_NAME,
    MODEL_PATH,
    OPTIMIZER,
    LOSS,
    METRICS,
    BATCH_SIZE,
    EPOCHS,
    DATA_PATH,
    TEXT_VECTORIZER_PATH
)

## Data Ingestion and Preprocessing

In [None]:
# Load the dataset located at DATA_PATH through the project's DataIngestion helper.
# The result is kept as `df` and is the input to every later preprocessing step.
data_ingestion = DataIngestion(DATA_PATH)
df = data_ingestion.load_data()

In [None]:
# Preview five rows; the fixed random_state keeps the preview the same across re-runs.
df.sample(n=5, random_state=42)

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,positive
9427,Not many television shows appeal to quite as m...,positive
199,The film quickly gets to a major chase scene w...,negative
12447,Jane Austen would definitely approve of this o...,positive
39489,Expectations were somewhat high for me when I ...,negative


In [None]:
# Split the full frame into train / validation / test frames, then encode the
# 'sentiment' column of each split as a label.
data_preprocessing = DataPreprocessing(df)

train_df, val_df, test_df = data_preprocessing.split_data()

# Note the order used here: (train, test, val) goes in and the same order comes back out.
train_df, test_df, val_df = data_preprocessing.encode_labels(
    train_df,
    test_df,
    val_df,
    target_feature='sentiment',
)

In [None]:
# Spot-check two encoded training rows (fixed random_state for a repeatable preview).
train_df.sample(n=2, random_state=42)

Unnamed: 0,review,label
12611,I rented this DVD for two reasons. A cast of g...,0
6857,We expected something great when we went to se...,0


In [None]:
# Extract the 'review' inputs and 'label' targets of each split (presumably as NumPy
# arrays, going by the helper's name -- its implementation is not visible here).
# NOTE(review): the frames are passed in (train, val, test) order while the results are
# unpacked in (train, test, val) order. Confirm against convert_series_to_nparray's
# signature that the validation and test splits are not being swapped by this call.
train_sentences, test_sentences, val_sentences, train_labels, test_labels, val_labels = data_preprocessing.convert_series_to_nparray(train_df, val_df, test_df, input_feature='review', target_feature='label')

In [None]:
# Sanity check: every split must carry exactly one label per sentence. The assertions
# make a misaligned split fail loudly instead of relying on a visual comparison, and
# they also cover the test split, which the displayed tuple does not include.
assert len(train_sentences) == len(train_labels), "train sentences/labels are misaligned"
assert len(val_sentences) == len(val_labels), "validation sentences/labels are misaligned"
assert len(test_sentences) == len(test_labels), "test sentences/labels are misaligned"

# Still display the sizes for the reader.
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(34999, 34999, 7501, 7501)

### Feature Engineering

In [None]:
# Display the configured text-vectorizer path (a constant imported from utils).
TEXT_VECTORIZER_PATH

'models/text_vectorizer.pkl'

In [None]:
# Build the text vectorizer from the project's FeatureEngineering helper, configured
# with the vocabulary size and maximum sequence length from utils.
feature_engineering = FeatureEngineering(VOCAB_SIZE, MAX_LEN)
text_vectorizer = feature_engineering.create_text_vectorizer()
# Adapt on the training sentences only, so validation/test text does not influence
# what the vectorizer learns.
text_vectorizer.adapt(train_sentences)

In [None]:
# Persist the raw training sentences to disk. np.save appends the ".npy" suffix, so
# the file written is data/train_sentences.npy.
# NOTE(review): presumably the prediction pipeline reloads this file to re-adapt its
# text vectorizer -- confirm against MovieSentimentPredictor.
# numpy is imported in the notebook's imports cell rather than here, so that all
# dependencies are declared in one place.
np.save("data/train_sentences", train_sentences)

In [None]:
def _vectorize_and_pad(sentences):
    """Run `sentences` through the adapted text vectorizer, then pad to MAX_LEN.

    Uses the notebook-level `text_vectorizer` and Keras `pad_sequences` with
    `maxlen=MAX_LEN`, exactly as each split was processed before.
    """
    token_ids = text_vectorizer(sentences)
    return tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=MAX_LEN)


# The same transformation is applied to every split.
x_train_v2 = _vectorize_and_pad(train_sentences)
x_test_v2 = _vectorize_and_pad(test_sentences)
x_val_v2 = _vectorize_and_pad(val_sentences)

## Model Building, Training and Evaluation

In [None]:
# Configure training for the imported model; optimizer, loss and metric all come
# from the constants defined in utils.
custom_model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=[METRICS],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the compiled model.
custom_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddi  (None, 200, 32)           646400    
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_block (Transfo  (None, 200, 32)           10656     
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 20)               

In [None]:
# Train on the vectorized training split and track performance on the validation
# split after each epoch; the returned history object is kept for later inspection.
history = custom_model.fit(
    x_train_v2,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val_v2, val_labels),
)

Epoch 1/2
Epoch 2/2


In [None]:
# Wrap the trained model in the project's ModelSaveLoad helper
model_saver = ModelSaveLoad(custom_model)

# Write the model to MODEL_PATH so it can be reloaded without retraining
model_saver.save_model(MODEL_PATH)

In [None]:
# Reload the model from MODEL_PATH. load_model is called on the class itself (no
# instance needed here), and the reloaded copy is what gets evaluated below.
loaded_model = ModelSaveLoad.load_model(MODEL_PATH)

In [None]:
# Score the reloaded model on the test split, then report each metric by name
# rounded to three decimals.
results = loaded_model.evaluate(x_test_v2, test_labels, verbose=2)

for metric_name, metric_value in zip(loaded_model.metrics_names, results):
    print(f"{metric_name}: {metric_value:.3f}")

235/235 - 11s - loss: 0.2903 - accuracy: 0.8815 - 11s/epoch - 47ms/step
loss: 0.290
accuracy: 0.881


### Prediction Pipeline

In [5]:
# Build the inference helper from the vocabulary size, maximum sequence length and
# saved-model path, then classify a clearly positive review.
predictor = MovieSentimentPredictor(VOCAB_SIZE, MAX_LEN, MODEL_PATH)

sentence = "This movie was fantastic! The acting was brilliant and the story was engaging."
sentiment = predictor.predict_sentiment(sentence)

# One print call, one line per item.
print(f"Sentence: {sentence}", f"Predicted sentiment: {sentiment}", sep="\n")

Model loaded successfully.
Text vectorizer loaded and adapted.
Sentence: This movie was fantastic! The acting was brilliant and the story was engaging.
Predicted sentiment: Movie is Positive - Score 1


In [6]:
# Classify a clearly negative review with the predictor built in the previous cell.
# Adjacent string literals are concatenated, so the text is one unbroken sentence.
sentence = (
    "This movie was an absolute disaster. "
    "The acting was terrible, the plot was nonsensical, and the special effects were laughable. "
    "It was a complete waste of time and money. "
    "I can't believe I sat through the entire thing. "
    "Avoid this movie at all costs!"
)

sentiment = predictor.predict_sentiment(sentence)

# One print call, one line per item.
print(f"Sentence: {sentence}", f"Predicted sentiment: {sentiment}", sep="\n")

Sentence: This movie was an absolute disaster. The acting was terrible, the plot was nonsensical, and the special effects were laughable. It was a complete waste of time and money. I can't believe I sat through the entire thing. Avoid this movie at all costs!
Predicted sentiment: Movie is Negative - Score 0
