In [13]:
pip install tensorflow




In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [23]:
# Import Libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# TensorFlow/Keras for Deep Learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the Dataset
data_path = r"C:\Users\HP\Downloads\archive (8)\LabeledText.xlsx"  # Replace with your file
data = pd.read_excel(data_path)

# Map LABEL values to numerical values
label_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}  # Add neutral if present
data['LABEL'] = data['LABEL'].map(label_mapping)

# Separate features (X) and target (y)
X = data['Caption']
y = data['LABEL']

# Updated Text Preprocessing Function (without NLTK)
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text.strip()  # Remove leading/trailing spaces

# Apply preprocessing to the text data
X = X.apply(preprocess_text)

# Tokenization and Padding
max_vocab_size = 5000
max_seq_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(X)

X_tokenized = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_tokenized, maxlen=max_seq_length)

# Convert labels to one-hot encoding
y = to_categorical(y)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.25, random_state=42)

# Build LSTM Model
embedding_dim = 128
model = Sequential([
    Embedding(max_vocab_size, embedding_dim, input_length=max_seq_length),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(32),
    Dropout(0.5),
    Dense(y.shape[1], activation='softmax')  # Output neurons match number of label classes
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the Model
batch_size = 32
epochs = 10  # Adjust epochs based on needs
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the Model
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_true, y_pred))



Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 90ms/step - accuracy: 0.3777 - loss: 1.0881 - val_accuracy: 0.4952 - val_loss: 1.0215
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 79ms/step - accuracy: 0.6048 - loss: 0.8925 - val_accuracy: 0.6785 - val_loss: 0.7384
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 101ms/step - accuracy: 0.8472 - loss: 0.4406 - val_accuracy: 0.6744 - val_loss: 0.7918
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 92ms/step - accuracy: 0.9272 - loss: 0.2350 - val_accuracy: 0.6867 - val_loss: 0.9247
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 102ms/step - accuracy: 0.9696 - loss: 0.1151 - val_accuracy: 0.6826 - val_loss: 1.1980
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 161ms/step - accuracy: 0.9832 - loss: 0.0680 - val_accuracy: 0.6758 - val_loss: 1.2998
Epoch 7/10
[1m92/92[0m [32