<a href="https://colab.research.google.com/github/fahadshakeel23/DataScience/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import TextVectorization

import tensorflow as tf
import tensorflow_datasets as tfds


In [51]:
# Fetch the IMDb movie-review dataset through TensorFlow Datasets.
# `as_supervised=True` yields (text, label) pairs; `with_info=True` also
# returns the dataset metadata object.
dataset, info = tfds.load(name='imdb_reviews', as_supervised=True, with_info=True)

# Keep handles to the two labelled splits used by the rest of the notebook.
train_data = dataset['train']
test_data = dataset['test']

# Show the dataset metadata (splits, features, citation, ...).
print(info)


tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    nondeterministic_order=False,
    splits={
        'test': <SplitInfo num_e

Step 4: Text Preprocessing and Tokenization

In [52]:

# Vocabulary cap and fixed number of token ids emitted per review
max_features = 10000
sequence_length = 250

# Layer that tokenizes raw review strings and maps each token to an integer
# id, padding/truncating every review to `sequence_length` ids
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

# Adapt the vectorize layer to the training text only.
# The text is read from `dataset['train']` (not `train_data`) so this cell can
# be re-run even after `train_data` has been rebound to the vectorized
# pipeline by a later cell.
train_text = dataset['train'].map(lambda text, label: text)

# Build the vocabulary from batches of reviews instead of one review per
# step; the learned vocabulary does not depend on the batch size.
vectorize_layer.adapt(train_text.batch(1024))


Step 5: Prepare Dataset for Training

In [53]:
def vectorize_text(text, label):
    """Map one raw (text, label) example to (token ids, label).

    Args:
        text: String tensor holding a single review.
        label: The example's sentiment label; passed through unchanged.

    Returns:
        A tuple ``(token_ids, label)`` where ``token_ids`` is the output of
        ``vectorize_layer`` for ``text``.
    """
    # Drop any size-1 dimensions so the layer receives a single string
    text = tf.squeeze(text)
    return vectorize_layer(text), label


# Batch size shared by the training and evaluation pipelines
batch_size = 32

# Build both pipelines from the raw splits in `dataset` rather than from
# `train_data` / `test_data` themselves. Rebinding a name to a transformed
# version of itself makes the cell non-idempotent: a second run would try to
# vectorize already-vectorized integer tensors.
train_data = (
    dataset['train']
    .map(vectorize_text)
    .shuffle(10000)              # reshuffle the training examples each epoch
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training
)
test_data = (
    dataset['test']
    .map(vectorize_text)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)


Step 6: Build and Compile a Simple Neural Network Model



In [54]:
# Simple averaged-embedding classifier for binary sentiment.
# An explicit Input layer replaces the `input_length` argument of Embedding,
# which is deprecated (and ignored) in Keras 3. Declaring the input shape up
# front also builds the model immediately, so `model.summary()` below can
# report output shapes and parameter counts.
model = Sequential([
    layers.Input(shape=(sequence_length,), dtype='int64'),  # Integer token ids from the vectorizer
    layers.Embedding(max_features + 1, 128),  # Embedding layer: token id -> 128-d vector
    layers.GlobalAveragePooling1D(),  # Pooling over the sequence dimension
    layers.Dense(64, activation='relu'),  # Hidden fully-connected layer
    layers.Dense(1, activation='sigmoid')  # Output sigmoid for binary classification
])

# Compile the model: the sigmoid output pairs with binary cross-entropy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()




Step 7: Train the Model



In [55]:
# Fit the classifier for a fixed number of passes over the training pipeline.
# NOTE(review): `test_data` is used as the validation set here, so the
# per-epoch val_* metrics are computed on the same data as the final test
# evaluation. Consider holding out part of the training split instead.
epochs = 5
history = model.fit(x=train_data, epochs=epochs, validation_data=test_data)

Epoch 1/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 32s 35ms/step - accuracy: 0.6504 - loss: 0.5882 - val_accuracy: 0.8606 - val_loss: 0.3340
Epoch 2/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 39s 33ms/step - accuracy: 0.8795 - loss: 0.2889 - val_accuracy: 0.8668 - val_loss: 0.3194
Epoch 3/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 28s 33ms/step - accuracy: 0.9067 - loss: 0.2315 - val_accuracy: 0.8563 - val_loss: 0.3428
Epoch 4/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 29s 34ms/step - accuracy: 0.9194 - loss: 0.2075 - val_accuracy: 0.8594 - val_loss: 0.3429
Epoch 5/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 32s 38ms/step - accuracy: 0.9261 - loss: 0.1873 - val_accuracy: 0.8529 - val_loss: 0.3793


Step 8: Evaluate the Model

In [56]:
# Score the trained model on the held-out test pipeline; `evaluate` returns
# the loss followed by the compiled metrics (here: accuracy).
eval_results = model.evaluate(x=test_data)
loss, accuracy = eval_results

print(f"Test Accuracy: {accuracy:.4f}")

782/782 ━━━━━━━━━━━━━━━━━━━━ 12s 15ms/step - accuracy: 0.8525 - loss: 0.3795
Test Accuracy: 0.8529
