In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("comment.csv")

In [3]:
df = df.dropna(subset=['comment'])
df = df[df['comment'].str.strip().astype(bool)]  # remove empty strings

In [4]:
X = df['comment'].values
y = df['encoded_label'].values

In [5]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [6]:
max_length = 50
X_padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [8]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='tanh'))  # tanh for [-1, 1]



In [9]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 67ms/step - accuracy: 0.5953 - loss: 0.3142 - val_accuracy: 0.8433 - val_loss: 0.1582
Epoch 2/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.8475 - loss: 0.1416 - val_accuracy: 0.8684 - val_loss: 0.1314
Epoch 3/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 66ms/step - accuracy: 0.9003 - loss: 0.0808 - val_accuracy: 0.8627 - val_loss: 0.1330
Epoch 4/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.9070 - loss: 0.0612 - val_accuracy: 0.8614 - val_loss: 0.1258
Epoch 5/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.9142 - loss: 0.0506 - val_accuracy: 0.8622 - val_loss: 0.1222


<keras.src.callbacks.history.History at 0x7ad0c4383090>

In [10]:
loss, acc = model.evaluate(X_test, y_test)
print(f"\n🟢 Test Accuracy: {acc*100:.2f}%")

[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8499 - loss: 0.1273

🟢 Test Accuracy: 85.35%


In [11]:
y_pred = model.predict(X_test)
y_pred_label = np.round(y_pred).astype(int)

[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step


In [12]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_label))


Classification Report:
              precision    recall  f1-score   support

          -1       0.78      0.39      0.52       136
           0       0.87      0.87      0.87      1382
           1       0.87      0.91      0.89      1322

    accuracy                           0.87      2840
   macro avg       0.84      0.72      0.76      2840
weighted avg       0.87      0.87      0.86      2840



In [14]:
model.save("bilstm_sentiment_model.h5")



In [15]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)