In [21]:
!pip install memory-profiler
import pandas as pd
import numpy as np
import time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from memory_profiler import memory_usage
import os
import psutil
nltk.download('punkt')
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting memory-profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.61.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
#define function for computational cost calculation
def peak_memory_usage():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    # return mem_info.peak_wset  # For Windows
    return mem_info.peak_rss  # For Unix-based systems

In [18]:
#create empty dictinaory to store computational factors
computational_factor={}

# Load the dataset
data = pd.read_csv('/content/NLP_Project/dataset/IMDB_clean.csv')
data.head()

Unnamed: 0,review,sentiment,sentiment_boolean
0,one reviewers mentioned watching 1 oz episode ...,positive,1
1,wonderful little production the filming techni...,positive,1
2,thought wonderful way spend time hot summer we...,positive,1
3,basically theres family little boy jake thinks...,negative,0
4,petter matteis love time money visually stunni...,positive,1


In [9]:
# function to preprocess the data (tokenize, remove stopwords, and stem words):
def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return ' '.join(stemmed_tokens)

In [10]:
#Preprocess the 'review' column in the dataset:
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['sentiment_boolean'], test_size=0.2, random_state=42)

# Create a Bag of Words model using CountVectorizer
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [47]:
def NB():
    # Train the Naive Bayes model using the training data:
    start_time = time.time()
    global naive_bayes_classifier
    naive_bayes_classifier = MultinomialNB()
    naive_bayes_classifier.fit(X_train_bow, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    computational_factor['NB']=(training_time,0)
    time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(NB, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

# memory_usage = memory_after_training - memory_before_training
training_time,b=computational_factor['NB']
computational_factor['NB']=(training_time,mem_usage)
print(computational_factor)

Peak memory usage (MB): 1295.53515625
{'NB': (0.06569957733154297, 1295.53515625), 'RF': (426.1929829120636, 772.953125), 'SVM': (12.723561525344849, 864.42578125), 'RNN': (384.2572820186615, 1287.5859375), 'E1': (0.03712582588195801, 1289.7421875)}


In [24]:
# Predict the sentiment of the test data and evaluate the model:
y_pred_naiveBayes = naive_bayes_classifier.predict(X_test_bow)

accuracy = accuracy_score(y_test, y_pred_naiveBayes)
conf_matrix = confusion_matrix(y_test, y_pred_naiveBayes)
class_report = classification_report(y_test, y_pred_naiveBayes)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")


Accuracy: 0.8564
Confusion Matrix:
[[4311  650]
 [ 786 4253]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [25]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Train the Random Forest model using the training data
def RF():
  start_time = time.time()
  global random_forest_classifier
  random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
  random_forest_classifier.fit(X_train_bow, y_train)
  random_forest_classifier=random_forest_classifier
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['RF']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay


mem_usage = memory_usage(RF, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

# memory_usage = memory_after_training - memory_before_training
training_time,b=computational_factor['RF']
computational_factor['RF']=(training_time,mem_usage)
print(computational_factor)

Peak memory usage (MB): 772.953125
{'NB': (0.06332778930664062, 0), 'RF': (426.1929829120636, 772.953125)}


In [32]:
# Predict the sentiment of the test data and evaluate the model 
y_pred_rf = random_forest_classifier.predict(X_test_bow)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Random Forest Confusion Matrix:\n{conf_matrix_rf}")
print(f"Random Forest Classification Report:\n{class_report_rf}")


Random Forest Accuracy: 0.8537
Random Forest Confusion Matrix:
[[4210  751]
 [ 712 4327]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      4961
           1       0.85      0.86      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [33]:
# Support Vector Machines
from sklearn.svm import LinearSVC

In [34]:
def SVM():
  start_time = time.time()
  # train the SVM model using the training data 
  global svm_classifier
  svm_classifier = LinearSVC(random_state=42)
  svm_classifier.fit(X_train_bow, y_train)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['SVM']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(SVM, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)


training_time,b=computational_factor['SVM']
computational_factor['SVM']=(training_time,mem_usage)
print(computational_factor['SVM'])




Peak memory usage (MB): 864.42578125
(12.723561525344849, 864.42578125)


In [35]:
# Predict the sentiment of the test data and evaluate the model
y_pred_svm = svm_classifier.predict(X_test_bow)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)

print(f"SVM Accuracy: {accuracy_svm}")
print(f"SVM Confusion Matrix:\n{conf_matrix_svm}")
print(f"SVM Classification Report:\n{class_report_svm}")


SVM Accuracy: 0.8648
SVM Confusion Matrix:
[[4239  722]
 [ 630 4409]]
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.86      0.87      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [36]:
#using RNN
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [37]:
# Set up parameters for tokenization and padding:
vocab_size = 10000
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_token = '<OOV>'

In [38]:
# Tokenize and pad the sequences:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [39]:
# Define and compile the RNN model:
embedding_dim = 16

model_rnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    SimpleRNN(32),
    Dense(1, activation='sigmoid')
])

model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [40]:
# Train the RNN model using the training data:
epochs = 10


def RNN():
  start_time = time.time()
  # train the SVM model using the training data 
  global history
  history = model_rnn.fit(X_train_padded, y_train, epochs=epochs, validation_data=(X_test_padded, y_test), verbose=2)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['RNN']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(RNN, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['RNN']
computational_factor['RNN']=(training_time,mem_usage)
print(computational_factor['RNN'])


Epoch 1/10
1250/1250 - 34s - loss: 0.6888 - accuracy: 0.5267 - val_loss: 0.6877 - val_accuracy: 0.5445 - 34s/epoch - 27ms/step
Epoch 2/10
1250/1250 - 30s - loss: 0.6306 - accuracy: 0.6566 - val_loss: 0.6281 - val_accuracy: 0.6641 - 30s/epoch - 24ms/step
Epoch 3/10
1250/1250 - 29s - loss: 0.5363 - accuracy: 0.7452 - val_loss: 0.7187 - val_accuracy: 0.5225 - 29s/epoch - 24ms/step
Epoch 4/10
1250/1250 - 30s - loss: 0.5436 - accuracy: 0.6780 - val_loss: 0.7709 - val_accuracy: 0.5432 - 30s/epoch - 24ms/step
Epoch 5/10
1250/1250 - 31s - loss: 0.4611 - accuracy: 0.7380 - val_loss: 0.8541 - val_accuracy: 0.5280 - 31s/epoch - 25ms/step
Epoch 6/10
1250/1250 - 29s - loss: 0.3873 - accuracy: 0.7960 - val_loss: 0.8985 - val_accuracy: 0.5976 - 29s/epoch - 23ms/step
Epoch 7/10
1250/1250 - 32s - loss: 0.4245 - accuracy: 0.7762 - val_loss: 1.0575 - val_accuracy: 0.5090 - 32s/epoch - 26ms/step
Epoch 8/10
1250/1250 - 31s - loss: 0.3569 - accuracy: 0.8139 - val_loss: 1.1621 - val_accuracy: 0.5177 - 31s/ep

In [41]:
# Evaluate the model's performance:
loss, accuracy_rnn = model_rnn.evaluate(X_test_padded, y_test)
print(f"RNN Accuracy: {accuracy_rnn}")

RNN Accuracy: 0.508899986743927


In [42]:
#Ensemble SVM,Naive Bayies , Random Forest -Majority Voting
# Combine the predictions of the three classifiers for each instance in the test data:
# predictions = list(zip(y_pred_naiveBayes, y_pred_rf, y_pred_svm))

# function that returns the majority sentiment
def majority_voting(predictions):
    return round(sum(predictions) / len(predictions))


In [43]:
#  applying the majority voting function to each instance:
# y_pred_ensemble = [majority_voting(pred_tuple) for pred_tuple in predictions]

def E1():
  start_time = time.time()
  global predictions
  predictions = list(zip(y_pred_naiveBayes, y_pred_rf, y_pred_svm))
  # train the SVM model using the training data 
  global y_pred_ensemble
  #  applying the majority voting function to each instance:
  y_pred_ensemble = [majority_voting(pred_tuple) for pred_tuple in predictions]
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E1']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E1, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E1']
computational_factor['E1']=(training_time,mem_usage)
print(computational_factor['E1'])


Peak memory usage (MB): 1289.7421875
(0.03712582588195801, 1289.7421875)


In [44]:
# Evaluate the ensemble model's performance:
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f"Ensemble Accuracy: {accuracy_ensemble}")
print(f"Ensemble Confusion Matrix:\n{conf_matrix_ensemble}")
print(f"Ensemble Classification Report:\n{class_report_ensemble}")


Ensemble Accuracy: 0.8813
Ensemble Confusion Matrix:
[[4359  602]
 [ 585 4454]]
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4961
           1       0.88      0.88      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [45]:
#Ensemble SVM,Naive Bayies , Random Forest-Stacking
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict


In [93]:
# Prepare the data for stacking by using the predictions of the three classifiers on the training dataset:
# y_train_pred_naiveBayes = cross_val_predict(naive_bayes_classifier, X_train_bow, y_train, cv=5)
# y_train_pred_rf = cross_val_predict(random_forest_classifier, X_train_bow, y_train, cv=5)
# y_train_pred_svm = cross_val_predict(svm_classifier, X_train_bow, y_train, cv=5)

# train_predictions = np.column_stack((y_train_pred_naiveBayes, y_train_pred_rf, y_train_pred_svm))


In [94]:
# # Train the meta-model (a logistic regression classifier) on the stacked training predictions:
# meta_model = LogisticRegression()
# meta_model.fit(train_predictions, y_train)

def E2():
  start_time = time.time()
  # Prepare the data for stacking by using the predictions of the three classifiers on the training dataset:
  y_train_pred_naiveBayes = cross_val_predict(naive_bayes_classifier, X_train_bow, y_train, cv=5)
  y_train_pred_rf = cross_val_predict(random_forest_classifier, X_train_bow, y_train, cv=5)
  y_train_pred_svm = cross_val_predict(svm_classifier, X_train_bow, y_train, cv=5)

  global train_predictions

  train_predictions = np.column_stack((y_train_pred_naiveBayes, y_train_pred_rf, y_train_pred_svm))
  # train the SVM model using the training data 
  global y_pred_ensemble
  #  applying the majority voting function to each instance:
  # Train the meta-model (a logistic regression classifier) on the stacked training predictions:
  global meta_model
  meta_model = LogisticRegression()
  meta_model.fit(train_predictions, y_train)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E2']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E2, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E2']
computational_factor['E2']=(training_time,mem_usage)
print(computational_factor['E2'])




Peak memory usage (MB): 3099.2421875
(1557.2816035747528, 3099.2421875)


In [96]:
# Prepare the data for stacking on the test dataset by using the predictions of the three classifiers:
test_predictions = np.column_stack((y_pred_naiveBayes, y_pred_rf, y_pred_svm))


In [97]:
# Calculate the ensemble predictions by applying the meta-model to the stacked test predictions:
y_pred_stacking = meta_model.predict(test_predictions)


In [98]:
# Evaluate the stacking ensemble model's performance:
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
class_report_stacking = classification_report(y_test, y_pred_stacking)

print(f"Stacking Accuracy: {accuracy_stacking}")
print(f"Stacking Confusion Matrix:\n{conf_matrix_stacking}")
print(f"Stacking Classification Report:\n{class_report_stacking}")


Stacking Accuracy: 0.8813
Stacking Confusion Matrix:
[[4359  602]
 [ 585 4454]]
Stacking Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4961
           1       0.88      0.88      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [49]:
#using CNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [50]:
# Set up parameters for tokenization and padding:
vocab_size = 10000
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_token = '<OOV>'
max_words = 10000

In [51]:
# Tokenize and pad the sequences:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

In [52]:
#Create the CNN model
embedding_dim = 100

model_cnn = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_cnn.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,065,429
Trainable params: 1,065,429
Non-trainable params: 0
____________________________________________

In [53]:
# Compile the model:
model_cnn.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [55]:
# Train the model
# history = model_cnn.fit(X_train_pad, y_train,
#                     epochs=10,
#                     batch_size=32,
#                     validation_split=0.1)

def CNN():
  start_time = time.time()
  # train the SVM model using the training data 
  global history
  history = model_cnn.fit(X_train_pad, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['CNN']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(CNN, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['CNN']
computational_factor['CNN']=(training_time,mem_usage)
print(computational_factor['CNN'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Peak memory usage (MB): 1616.9765625
(623.7481956481934, 1616.9765625)


In [56]:
# Evaluate the model
loss, accuracy = model_cnn.evaluate(X_test_pad, y_test)
print(f'Test accuracy: {accuracy}')

Test accuracy: 0.8792999982833862


In [57]:
#Ensemble RNN,CNN -Majority Voting
import numpy as np
from sklearn.metrics import accuracy_score

In [58]:
# #Make predictions with both models
# y_pred_rnn = (model_rnn.predict(X_test_pad) > 0.5).astype("int32")
# y_pred_cnn = (model_cnn.predict(X_test_pad) > 0.5).astype("int32")


In [59]:
#Combine the predictions using averaging
# majority voting approach is more applicable to a multi-class classification problem. 
# Since this is a binary classification, 
# we will use a simple averaging approach instead.
def combine_predictions(predictions_list):
    combined_proba = np.mean(predictions_list, axis=0)
    combined_pred = (combined_proba > 0.5).astype("int32")
    return combined_pred

# y_pred_combined = combine_predictions([y_pred_rnn, y_pred_cnn])

def E3():
  start_time = time.time()
  # train the SVM model using the training data 
  #Make predictions with both models
  global y_pred_rnn
  global y_pred_cnn
  y_pred_rnn = (model_rnn.predict(X_test_pad) > 0.5).astype("int32")
  y_pred_cnn = (model_cnn.predict(X_test_pad) > 0.5).astype("int32")
  global y_pred_combined
  y_pred_combined = combine_predictions([y_pred_rnn, y_pred_cnn])
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E3']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E3, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E3']
computational_factor['E3']=(training_time,mem_usage)
print(computational_factor['E3'])

Peak memory usage (MB): 1618.38671875
(8.127866744995117, 1618.38671875)


In [60]:
# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, y_pred_combined)
print(f'Ensemble accuracy: {ensemble_accuracy}')


Ensemble accuracy: 0.6828


In [101]:
#Ensemble RNN,CNN -Stacking
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [102]:
# Make predictions on the training set using the pretrained models
# y_pred_rnn_train = model_rnn.predict(X_train_pad)
# y_pred_cnn_train = model_cnn.predict(X_train_pad)

In [103]:
# Create a new dataset with these predictions as features
# X_train_stacked = np.column_stack((y_pred_rnn_train, y_pred_cnn_train))

In [104]:
# Train a meta-classifier(logistic regression) and train it on the new dataset with stacked features.
# meta_classifier = LogisticRegression()
# meta_classifier.fit(X_train_stacked, y_train)

def E4():
  start_time = time.time()
  # train the SVM model using the training data 
  #Make predictions with both models
  global y_pred_rnn_train
  global y_pred_cnn_train
  y_pred_rnn_train = model_rnn.predict(X_train_pad)
  y_pred_cnn_train = model_cnn.predict(X_train_pad)

  # Create a new dataset with these predictions as features
  X_train_stacked = np.column_stack((y_pred_rnn_train, y_pred_cnn_train))

  global meta_classifier
  # Train a meta-classifier(logistic regression) and train it on the new dataset with stacked features.
  meta_classifier = LogisticRegression()
  meta_classifier.fit(X_train_stacked, y_train)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E4']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E4, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E4']
computational_factor['E4']=(training_time,mem_usage)
print(computational_factor['E4'])

Peak memory usage (MB): 2918.13671875
(41.28328990936279, 2918.13671875)


In [105]:
# Make predictions using RNN and CNN models on the test data:
y_pred_rnn_test = model_rnn.predict(X_test_pad)
y_pred_cnn_test = model_cnn.predict(X_test_pad)




In [106]:
# Combine the test predictions using the trained meta-classifier:
X_test_stacked = np.column_stack((y_pred_rnn_test, y_pred_cnn_test))
y_pred_stacked = meta_classifier.predict(X_test_stacked)


In [107]:
# Evaluate the ensemble model:
stacked_accuracy = accuracy_score(y_test, y_pred_stacked)
print(f'Stacked ensemble accuracy: {stacked_accuracy}')


Stacked ensemble accuracy: 0.8794


In [64]:
#Ensemble SVM,CNN -Majority Voting
import numpy as np

In [None]:
# use the predicted labels from SVM and CNN
# svm_preds = y_pred_svm
# cnn_preds = y_pred_cnn

In [65]:
# combine the predicted labels using majority voting
# ensemble_preds = np.round((svm_preds + cnn_preds) / 2)

def E5():
  start_time = time.time()
  # train the SVM model using the training data 
  #Make predictions with both models
  global svm_preds
  global y_pred_cnn_binary
  # Convert the continuous CNN predictions to binary labels
  y_pred_cnn_binary = np.where(y_pred_cnn > 0.5, 1, 0)
  svm_preds = y_pred_svm
  cnn_preds = y_pred_cnn
  global y_pred_ensemble
  # Combine the predictions from SVM and CNN using majority voting
  y_pred_ensemble = np.round((y_pred_svm + y_pred_cnn_binary) / 2)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E5']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E5, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E5']
computational_factor['E5']=(training_time,mem_usage)
print(computational_factor['E5'])

Peak memory usage (MB): 3964.28125
(2.4740681648254395, 3964.28125)


In [71]:
# print("y_test shape:", y_test.shape)
# print("y_test type:", type(y_test))
# print("y_pred_ensemble shape:", y_pred_ensemble.shape)
# print("y_pred_ensemble type:", type(y_pred_ensemble))
# print("y_pred_svm shape:", y_pred_svm.shape)
# print("y_pred_cnn_binary shape:", y_pred_cnn_binary.shape)
# Flatten y_pred_cnn_binary to match the shape of y_pred_svm
y_pred_cnn_binary_flat = np.ravel(y_pred_cnn_binary)

# Stack the predictions from SVM and CNN vertically
predictions = np.vstack((y_pred_svm, y_pred_cnn_binary_flat))

# Calculate the mean along the first axis and round to get the final ensemble prediction
y_pred_ensemble = np.round(np.mean(predictions, axis=0))

# Calculate the accuracy of the ensemble model using the corrected y_pred_ensemble
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)

In [72]:
print("Ensemble model accuracy:", ensemble_accuracy)

Ensemble model accuracy: 0.8731


In [73]:
# SVM,CNN - Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [78]:
class CNNBinaryPredictions(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        y_pred_cnn = model_cnn.predict(X)
        y_pred_cnn_binary = np.where(y_pred_cnn > 0.5, 1, 0).ravel()
        return y_pred_cnn_binary.reshape(-1, 1)
        
    def predict(self, X):
        return self.transform(X)

In [79]:
# Create the stacking classifier(LogisticRegression)
stacked_classifier = StackingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('cnn', make_pipeline(CNNBinaryPredictions()))
    ],
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)

In [80]:
# Train the stacking classifier using the training data
# stacked_classifier.fit(X_train_padded, y_train)

def E6():
  start_time = time.time()
  # Train the stacking classifier using the training data
  stacked_classifier.fit(X_train_padded, y_train)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E6']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E6, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E6']
computational_factor['E6']=(training_time,mem_usage)
print(computational_factor['E6'])



Peak memory usage (MB): 2689.6796875
(160.9642424583435, 2689.6796875)


In [81]:
# Predict the sentiment labels using the test data
y_pred_stacked = stacked_classifier.predict(X_test_padded)



In [82]:
# Calculate the accuracy of the stacked classifier
stacked_accuracy = accuracy_score(y_test, y_pred_stacked)

print("Stacked model accuracy:", stacked_accuracy)

Stacked model accuracy: 0.5133


In [88]:
# SVM,RF,NB,CNN- Majority Voting:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

In [89]:
class CNNBinaryClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model_cnn):
        self.model_cnn = model_cnn

    def fit(self, X, y):
        return self

    def predict(self, X):
        y_pred_cnn = self.model_cnn.predict(X)
        y_pred_cnn_binary = np.where(y_pred_cnn > 0.5, 1, 0).ravel()
        return y_pred_cnn_binary

In [90]:
# Create the majority voting classifier
majority_voting_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('naive_bayes', naive_bayes_classifier),
        ('random_forest', random_forest_classifier),
        ('cnn', CNNBinaryClassifier(model_cnn)),
    ],
    voting='hard',
    n_jobs=-1,
)

In [91]:
# # Train the majority voting classifier using the training data
# majority_voting_classifier.fit(X_train_padded, y_train)
def E7():
  start_time = time.time()
  # Train the majority voting classifier using the training data
  majority_voting_classifier.fit(X_train_padded, y_train)
  end_time = time.time()
  training_time = end_time - start_time
  computational_factor['E7']=(training_time,0)
  time.sleep(2)  # Simulate training with a 2-second delay

mem_usage = memory_usage(E7, interval=0.1, max_usage=True)
print("Peak memory usage (MB):", mem_usage)

training_time,b=computational_factor['E7']
computational_factor['E7']=(training_time,mem_usage)
print(computational_factor['E7'])



Peak memory usage (MB): 2874.15625
(66.99269223213196, 2874.15625)


In [92]:
# Predict the sentiment labels using the test data
y_pred_majority_voting = majority_voting_classifier.predict(X_test_padded)

# Calculate the accuracy of the majority voting classifier
majority_voting_accuracy = accuracy_score(y_test, y_pred_majority_voting)

print("Majority voting model accuracy:", majority_voting_accuracy)

Majority voting model accuracy: 0.5094


In [111]:
for pair in computational_factor.items():
  print(pair)

('NB', (0.06569957733154297, 1295.53515625))
('RF', (426.1929829120636, 772.953125))
('SVM', (12.723561525344849, 864.42578125))
('RNN', (326.4002013206482, 0))
('E1', (0.03712582588195801, 1289.7421875))
('CNN', (623.7481956481934, 1616.9765625))
('E3', (8.127866744995117, 1618.38671875))
('E5', (2.4740681648254395, 3964.28125))
('E6', (160.9642424583435, 2689.6796875))
('E7', (66.99269223213196, 2874.15625))
('E2', (1557.2816035747528, 3099.2421875))
('E4', (41.28328990936279, 2918.13671875))
