In [1]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the review dataset from a CSV next to the notebook into a DataFrame.
# NOTE(review): the file name suggests the text was lemmatized/cleaned upstream — confirm how it was produced.
data = pd.read_csv("./lemmatized_clean_dataset.csv")

Loading data: 50it [00:00, 120.75it/s]


In [3]:
# Sanity check: (rows, columns) of the loaded frame
data.shape

(50000, 2)

In [4]:
# Preview the first rows to confirm the `text` and `label` columns loaded as expected
data.head()

Unnamed: 0,text,label
0,rented curious yellow video store controversy ...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godard masculin féminin...,0
4,oh brother hearing ridiculous film umpteen yea...,0


In [5]:
# Preview the last rows as well (the file's end holds the other label value)
data.tail()

Unnamed: 0,text,label
49995,got around seeing monster man yesterday long w...,1
49996,got part competition prize watched really expe...,1
49997,got monster man box set three film mainly want...,1
49998,five minute started feel naff looking got comp...,1
49999,caught movie sci fi channel recently actually ...,1


In [6]:
# Class balance check: number of rows per label value
data["label"].value_counts()

label
0    25000
1    25000
Name: count, dtype: int64

In [7]:
# NOTE(review): repeats the data.head() preview already shown earlier in the notebook;
# `data` is not modified in between, so this cell is a candidate for removal.
data.head()

Unnamed: 0,text,label
0,rented curious yellow video store controversy ...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godard masculin féminin...,0
4,oh brother hearing ridiculous film umpteen yea...,0


In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["label"],
    random_state=42,
)

In [9]:
# Show the (rows, columns) of each split, one per line
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


**Data Preprocessing**

In [10]:
# Turn review text into fixed-length integer sequences.
# The vocabulary is learned from the training split only, so the test split
# is encoded with the same word index and never influences it.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["text"])

# Encode both splits the same way: words -> ids, then pad/truncate to 200 ids
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["text"]), maxlen=200)
    for split in (train_data, test_data)
)

In [11]:
# Inspect the encoded training matrix (one row of 200 token ids per review)
print(X_train)

[[   0    0    0 ...   20    8  218]
 [  36  408 1110 ...  127 2850  924]
 [   0    0    0 ... 1045  669   72]
 ...
 [   0    0    0 ...   20    2  983]
 [   0    0    0 ...  978   35 1076]
 [   0    0    0 ...  131 1638  299]]


In [12]:
# Inspect the encoded test matrix (built with the tokenizer fitted on the training split)
print(X_test)

[[   0    0    0 ... 4252 1113  703]
 [   0    0    0 ...  128  210  133]
 [   0    0    0 ...   35 3021  246]
 ...
 [   0    0    0 ... 2420  345 2787]
 [   0    0    0 ...  172  186 1142]
 [   0    0    0 ...  243  104    2]]


In [13]:
# Targets for each split: the `label` column, row-aligned with X_train / X_test
Y_train, Y_test = train_data["label"], test_data["label"]

In [14]:
# Inspect the training labels; the index keeps the original row ids from `data`
print(Y_train)

39087    1
30893    0
45278    1
16398    1
13653    1
        ..
11284    0
44732    1
38158    1
860      0
15795    1
Name: label, Length: 40000, dtype: int64


**LSTM - Long Short-Term Memory**

In [15]:
# build the model
#
# Architecture: token ids -> 128-d embeddings -> one LSTM layer -> sigmoid unit
# that outputs the probability used for the binary label.

model = Sequential()
# input_dim matches the tokenizer's num_words (5000).
# `input_length` is deprecated in Keras 3 (it only triggers a warning and is
# ignored), so the sequence length is supplied through build() below instead.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
# Build eagerly with (batch, 200) inputs so weights and output shapes exist
# before training and model.summary() can report them.
model.build(input_shape=(None, 200))



In [16]:
# Print the layer-by-layer overview of the network defined above
model.summary()

In [17]:
# Configure training: binary cross-entropy pairs with the single sigmoid
# output, Adam is the optimizer, and accuracy is reported alongside the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

**Training the Model**

In [18]:
# Train for 5 epochs, holding out 20% of the training rows for validation.
# The returned History object is kept so per-epoch metrics remain available
# via `history.history` instead of only appearing as the cell's repr.
# NOTE(review): in the recorded run val_loss stops improving after epoch 3
# while training accuracy keeps rising — consider early stopping.
history = model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 217ms/step - accuracy: 0.7666 - loss: 0.4756 - val_accuracy: 0.8616 - val_loss: 0.3326
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 261ms/step - accuracy: 0.8865 - loss: 0.2839 - val_accuracy: 0.8756 - val_loss: 0.3141
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 273ms/step - accuracy: 0.9092 - loss: 0.2388 - val_accuracy: 0.8758 - val_loss: 0.3031
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 267ms/step - accuracy: 0.9210 - loss: 0.2133 - val_accuracy: 0.8755 - val_loss: 0.3115
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 276ms/step - accuracy: 0.9313 - loss: 0.1826 - val_accuracy: 0.8756 - val_loss: 0.3397


<keras.src.callbacks.history.History at 0x30d5e4c50>

**Model Evaluation**

In [19]:
# Score the model on the held-out test split; evaluate() returns the loss
# followed by the metric configured in compile() (accuracy).
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 96ms/step - accuracy: 0.8773 - loss: 0.3602
Test Loss: 0.3599548041820526
Test Accuracy: 0.8741999864578247


**Building a Predictive System**

In [20]:
def predict_sentiment(review, threshold=0.5, maxlen=200):
  """Classify one review as "positive" or "negative".

  Relies on the notebook-level `tokenizer` (fitted on the training split) and
  `model`; whichever objects are bound to those names at call time are used.

  Args:
    review: Text of a single review.
    threshold: Score above which the review is labelled "positive".
      Defaults to 0.5.
    maxlen: Length the encoded review is padded/truncated to. Must match the
      sequence length the model was trained with (200 in this notebook).

  Returns:
    "positive" if the model's score is strictly greater than `threshold`,
    otherwise "negative".
  """
  # NOTE(review): the training text looks lemmatized/cleaned (see the dataset
  # file name and previews), while `review` is encoded as-is here — confirm
  # whether the same cleaning should be applied before tokenizing.
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=maxlen)
  # verbose=0: a one-row prediction does not need a progress bar in the output
  prediction = model.predict(padded_sequence, verbose=0)
  # predict() returns one row per input with a single sigmoid score in it
  score = prediction[0][0]
  return "positive" if score > threshold else "negative"

In [21]:
# Example: a clearly favourable review run through predict_sentiment
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
The sentiment of the review is: positive


In [22]:
# Example: an unfavourable review phrased with a negation ("not that good")
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
The sentiment of the review is: negative


In [23]:
# Example: a mixed review ("ok but not that good") to see which way the model leans
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
The sentiment of the review is: negative


In [27]:
# Persist the trained model: the full model as a .keras archive, plus the
# weights on their own as a .weights.h5 file.
# Create the target directory first so saving does not fail with
# FileNotFoundError on a fresh checkout where ./models does not exist yet.
Path("./models").mkdir(parents=True, exist_ok=True)
model.save('./models/test_model.keras')
model.save_weights('./models/test_model.weights.h5')

In [29]:
# Reload the model from the .keras archive written by the save cell.
# `load_model` is imported in the notebook's import cell at the top.
# Note: this rebinds `model`, so any later call (e.g. predict_sentiment)
# uses the reloaded copy rather than the in-memory trained instance.
model = load_model('./models/test_model.keras')

# Print the model summary to confirm the architecture round-tripped
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))
