**Importing the Dependencies**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

**Data Collection - Kaggle API**

**Loading the Dataset**

In [2]:
# Read the review dataset (per its file name, already cleaned and lemmatized)
# from the notebook's working directory.
dataset_path = "./lemmatized_clean_dataset.csv"
data = pd.read_csv(dataset_path)

In [3]:
# Show the (rows, columns) dimensions of the loaded dataset.
data.shape

(50000, 2)

In [4]:
# Preview the first five rows to check the column layout (text, label).
data.head()

Unnamed: 0,text,label
0,rented curious yellow video store controversy ...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godard masculin féminin...,0
4,oh brother hearing ridiculous film umpteen yea...,0


In [5]:
# Preview the last five rows as well.
data.tail()

Unnamed: 0,text,label
49995,got around seeing monster man yesterday long w...,1
49996,got part competition prize watched really expe...,1
49997,got monster man box set three film mainly want...,1
49998,five minute started feel naff looking got comp...,1
49999,caught movie sci fi channel recently actually ...,1


In [6]:
# Count the reviews per label to check the class balance.
data["label"].value_counts()

label
0    25000
1    25000
Name: count, dtype: int64

In [7]:
# NOTE(review): repeats the earlier `data.head()` preview; the frame has not
# been modified in between, so the output is the same.
data.head()

Unnamed: 0,text,label
0,rented curious yellow video store controversy ...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godard masculin féminin...,0
4,oh brother hearing ridiculous film umpteen yea...,0


In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split repeatable between runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_data, test_data = train_test_split(data, **split_options)

In [9]:
# Confirm the sizes of the two splits (one shape per line).
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


**Data Preprocessing**

In [10]:
# Build the vocabulary from the training split only, capped at 5000 words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["text"])

# Map every review to a list of word indices using that vocabulary.
train_sequences = tokenizer.texts_to_sequences(train_data["text"])
test_sequences = tokenizer.texts_to_sequences(test_data["text"])

# Bring every sequence to a fixed length of 200 so the reviews can be batched.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [11]:
# Inspect the padded training sequences.
print(X_train)

[[   0    0    0 ...   20    8  218]
 [  36  408 1110 ...  127 2850  924]
 [   0    0    0 ... 1045  669   72]
 ...
 [   0    0    0 ...   20    2  983]
 [   0    0    0 ...  978   35 1076]
 [   0    0    0 ...  131 1638  299]]


In [12]:
# Inspect the padded test sequences.
print(X_test)

[[   0    0    0 ... 4252 1113  703]
 [   0    0    0 ...  128  210  133]
 [   0    0    0 ...   35 3021  246]
 ...
 [   0    0    0 ... 2420  345 2787]
 [   0    0    0 ...  172  186 1142]
 [   0    0    0 ...  243  104    2]]


In [13]:
# The targets are the `label` column of each split.
Y_train, Y_test = train_data["label"], test_data["label"]

In [14]:
# Inspect the training labels (a pandas Series taken from train_data).
print(Y_train)

39087    1
30893    0
45278    1
16398    1
13653    1
        ..
11284    0
44732    1
38158    1
860      0
15795    1
Name: label, Length: 40000, dtype: int64


**LSTM - Long Short-Term Memory**

In [15]:
# Build the model.

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and dropout are repeatable after Restart & Run All.
set_random_seed(42)

# NOTE(review): per the tf.keras LSTM documentation, a non-zero
# `recurrent_dropout` makes the layer ineligible for the fused cuDNN kernel on
# GPU, which may explain slow training steps. It is left unchanged here because
# removing it would alter the model's regularisation; confirm before changing.
model = Sequential(
    [
        # An explicit input spec replaces Embedding's deprecated `input_length`
        # argument and builds the model up front, so `model.summary()` can
        # report output shapes and parameter counts. The sequence length is
        # taken from the padded training matrix instead of repeating it.
        Input(shape=(X_train.shape[1],)),
        # The vocabulary size is taken from the tokenizer's `num_words` cap so
        # the embedding table always covers every index the tokenizer emits.
        Embedding(input_dim=tokenizer.num_words, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        # Single sigmoid unit: one score in [0, 1] per review.
        Dense(1, activation="sigmoid"),
    ]
)



In [16]:
# Print the model's layer-by-layer summary.
model.summary()

In [17]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the Adam optimiser, and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

**Training the Model**

In [18]:
# Train for 5 epochs in batches of 64, with validation_split=0.2 setting aside
# a fifth of the training data for validation.
# The returned History is kept so the per-epoch loss/accuracy values can be
# inspected or plotted afterwards; assigning it also stops the cell from
# echoing the History object's repr as its output.
history = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
 61/500 ━━━━━━━━━━━━━━━━━━━━ 1:24 193ms/step - accuracy: 0.6057 - loss: 0.6833

KeyboardInterrupt: 

**Model Evaluation**

In [None]:
# Score the model on the held-out test split, then report both values.
evaluation = model.evaluate(X_test, Y_test)
loss, accuracy = evaluation
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.313868910074234
Test Accuracy: 0.8830000162124634


**Building a Predictive System**

In [None]:
def predict_sentiment(review, maxlen=200, threshold=0.5):
    """Classify a single review as "positive" or "negative".

    The review is converted to word indices with the notebook-level
    ``tokenizer``, padded to ``maxlen`` with ``pad_sequences`` and scored by
    the notebook-level ``model``.

    Args:
        review: Review text to classify.
        maxlen: Sequence length passed to ``pad_sequences``. It should equal
            the length used for the training sequences (200 in this notebook).
        threshold: Scores strictly greater than this value are labelled
            "positive"; everything else is "negative".

    Returns:
        The string "positive" or "negative".
    """
    # NOTE(review): the training CSV looks lower-cased and lemmatized, whereas
    # the text passed here is tokenized as-is. Confirm whether the same
    # cleaning should be applied to `review` first.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    # verbose=0 suppresses the progress bar for this one-row prediction.
    prediction = model.predict(padded_sequence, verbose=0)
    # The batch holds one review and the model has one output unit.
    score = float(prediction[0][0])
    return "positive" if score > threshold else "negative"

In [None]:
# Example usage: a review with clearly positive wording.
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: positive


In [None]:
# Example usage: a review whose negative meaning comes from a negation
# ("not that good").
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative


In [None]:
# Example usage: a mixed review ("ok but not that good").
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative
