In [3]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from sklearn.preprocessing import LabelEncoder

# Load the IMDB dataset from CSV
imdb_df = pd.read_csv("imdb.csv")

# Split the dataset into features (X) and labels (y)
X = imdb_df["review"].values
y = imdb_df["sentiment"].values

# Encode labels to numerical values (0 for negative, 1 for positive)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Pad sequences to a fixed length of 500
X = pad_sequences(X, maxlen=500)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(5000, 32, input_length=500))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)

# Evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy = scores[1] * 100
print(f"Accuracy: {accuracy:.2f}%")


Epoch 1/2
313/313 - 35s - loss: 0.4491 - accuracy: 0.7575 - val_loss: 0.2798 - val_accuracy: 0.8802 - 35s/epoch - 111ms/step
Epoch 2/2
313/313 - 32s - loss: 0.1789 - accuracy: 0.9323 - val_loss: 0.2905 - val_accuracy: 0.8830 - 32s/epoch - 103ms/step
Accuracy: 88.30%
