In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, Dense
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense
# import warnings
# warnings.filterwarnings("ignore")


df = pd.read_csv("amazon_reviews.csv")

# Data Pre-processing
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        text = text.lower()  # Convert text to lowercase
        text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
        text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove special characters
        return text
    else:
        return ''  # Return empty string if NaN or non-string value is encountered

df['cleaned_review'] = df['cleaned_review'].apply(clean_text)

# Data Splitting
X_train, X_val, y_train, y_val = train_test_split(df['cleaned_review'], df['sentiments'], test_size=0.2, random_state=42)

# Word Embedding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text data to sequences of indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

vocab_size = len(tokenizer.word_index) + 1
max_length = 50
# max_length = 100
#apply sequence padding
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

#####################################RNN################################
model_rnn = Sequential()
model_rnn.add(Embedding(vocab_size, 100, input_length=max_length))
model_rnn.add(SimpleRNN(100))
model_rnn.add(Dense(3, activation='softmax'))

model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_padded, y_train_encoded, epochs=10, validation_data=(X_val_padded, y_val_encoded), verbose=2)

####################################LSTM#################################
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, 100, input_length=max_length))
model_lstm.add(LSTM(100))
model_lstm.add(Dense(3, activation='softmax'))

model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train_padded, y_train_encoded, epochs=10, validation_data=(X_val_padded, y_val_encoded), verbose=2)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
434/434 - 11s - 25ms/step - accuracy: 0.5795 - loss: 0.8762 - val_accuracy: 0.5966 - val_loss: 0.8821
Epoch 2/10
434/434 - 8s - 19ms/step - accuracy: 0.6879 - loss: 0.7579 - val_accuracy: 0.7166 - val_loss: 0.7052
Epoch 3/10
434/434 - 9s - 20ms/step - accuracy: 0.7657 - loss: 0.6197 - val_accuracy: 0.7315 - val_loss: 0.7141
Epoch 4/10
434/434 - 8s - 19ms/step - accuracy: 0.7463 - loss: 0.6595 - val_accuracy: 0.7393 - val_loss: 0.6866
Epoch 5/10
434/434 - 8s - 19ms/step - accuracy: 0.7864 - loss: 0.5708 - val_accuracy: 0.7673 - val_loss: 0.6242
Epoch 6/10
434/434 - 8s - 19ms/step - accuracy: 0.8348 - loss: 0.4577 - val_accuracy: 0.7858 - val_loss: 0.5878
Epoch 7/10
434/434 - 8s - 19ms/step - accuracy: 0.8594 - loss: 0.3999 - val_accuracy: 0.7788 - val_loss: 0.6112
Epoch 8/10
434/434 - 8s - 19ms/step - accuracy: 0.8751 - loss: 0.3664 - val_accuracy: 0.8042 - val_loss: 0.5685
Epoch 9/10
434/434 - 9s - 21ms/step - accuracy: 0.9032 - loss: 0.3032 - val_accuracy: 0.7806 - val_loss

<keras.src.callbacks.history.History at 0x18158d3b3e0>

In [5]:
# 1. BONUS: Allow the user to input a new review and predict the result.

def predict_review(review, model):
    # Clean
    cleaned_review = clean_text(review)
    review_seq = tokenizer.texts_to_sequences([cleaned_review])
    review_padded = pad_sequences(review_seq, maxlen=max_length, padding='post')
    
    # Predict the review sentiment
    prediction = model.predict(review_padded)
    predicted_label = np.argmax(prediction) # encoded label
    # Convert back to original label
    original_label = label_encoder.inverse_transform([predicted_label])[0]
    return original_label

# A new review
new_review = input("Enter a review: ")

# calling predict_review for both models
rnn_prediction = predict_review(new_review, model_rnn)
lstm_prediction = predict_review(new_review, model_lstm)

# print result of prediction
print(f"RNN Model Prediction: {rnn_prediction}")
print(f"LSTM Model Prediction: {lstm_prediction}")

Enter a review:  The movie was really good. It was interesting, I can recommend it to you.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
RNN Model Prediction: positive
LSTM Model Prediction: positive


In [6]:
# 2. BONUS: Show model summary of each model

# Using summary() function for both models

# Printing the RNN model summary
model_rnn.summary()

print("------------------------------------------------------------------------------")

# Printing the RNN model summary
model_lstm.summary()

------------------------------------------------------------------------------
