## Bi-LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, SpatialDropout1D, Dropout, BatchNormalization
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load the pre-cleaned tweets (semicolon-separated CSV).
df_tweets = pd.read_csv("tweets-clean.csv", sep=";")

# Final text clean-up on the 'clean_content' column:
#   1. replace missing values with an empty string -- otherwise NaN survives
#      the .str chain and is later stringified to the literal token "nan"
#      when the column is cast with .astype('U') for the tokenizer;
#   2. drop the stand-alone words "temp" and "hyphen"
#      (presumably placeholder tokens left by an earlier cleaning step --
#      TODO confirm);
#   3. collapse whitespace runs to a single space and trim both ends.
df_tweets['clean_content'] = (
    df_tweets['clean_content']
    .fillna('')
    .str.replace(r'\b(temp|hyphen)\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print("Total data:", len(df_tweets))

# Vocabulary cap passed to the tokenizer and fixed length of every padded
# sequence.
max_words = 4000
max_len = 30

# Cast the text column to a unicode array once and reuse it for both fitting
# the tokenizer and converting the tweets to integer sequences.
corpus = df_tweets['clean_content'].values.astype('U')

tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(corpus)
X = pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=max_len)

# Polarity labels -> integer ids -> one-hot rows.
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(df_tweets['polarity']))

# 70/30 train/test split, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

!wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
!gunzip -f cc.id.300.vec.gz

# Parse the fastText text file into a {word: float32 vector} lookup.
embeddings_index = {}
with open('cc.id.300.vec', encoding='utf-8') as vec_file:
    # The first line is skipped (the .vec header, not a word vector).
    next(vec_file)
    for row in vec_file:
        # Each row is "<word> <v1> <v2> ..." separated by single spaces.
        token, *components = row.rstrip().split(' ')
        embeddings_index[token] = np.asarray(components, dtype='float32')

print(f"Found {len(embeddings_index)} word vectors.")

# Build the initial weights for the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i.
embedding_dim = 300
word_index = tokenizer.word_index
# One row per usable index, capped at the tokenizer's vocabulary limit.
num_words = min(max_words, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Model hyperparameters.
lstm_units = 128          # units per LSTM direction
dropout_rate = 0.45       # dropout applied after the dense layer
trainable_embed = False   # keep the pre-trained embedding weights frozen

# Bidirectional LSTM classifier on top of the pre-trained embedding matrix.
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 and only triggers a warning. The input shape is
# supplied through the explicit build() call below instead.
bi_model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=trainable_embed
    ),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(lstm_units, dropout=0.3, recurrent_dropout=0.3)),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(dropout_rate),
    # Three output units with softmax; the loss expects one-hot targets.
    Dense(3, activation='softmax')
])

bi_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Build with a (batch, max_len) input so summary() reports real output shapes
# and parameter counts instead of an unbuilt model.
bi_model.build(input_shape=(None, max_len))
bi_model.summary()

# Hold out a stratified validation slice of the *training* data.
# Early stopping, restore_best_weights and the checkpoint all select the model
# by val_loss; monitoring the test split here would tune the model on the very
# data used for the final report and inflate the reported test metrics.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.15, random_state=42
)

# Stop after 3 epochs without val_loss improvement and roll back to the best
# weights; also persist the best weights to disk.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint_callback = ModelCheckpoint('best_bilstm.weights.h5', save_weights_only=True, monitor='val_loss', save_best_only=True)

history = bi_model.fit(
    X_fit, y_fit,
    epochs=20,
    batch_size=8,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping_callback, model_checkpoint_callback],
    verbose=1
)

# Predict on the test split; each output row comes from the model's final
# softmax layer.
y_pred = bi_model.predict(X_test, batch_size=8, verbose=1)

# Reduce the predicted rows and the one-hot targets to class indices.
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true = np.argmax(y_test, axis=-1)

print("\n=== Evaluation Results ===")
test_accuracy = accuracy_score(y_true, y_pred_classes)
print("Accuracy:", round(test_accuracy, 4))
# Per-class precision/recall/F1, labelled with the original polarity names.
test_report = classification_report(
    y_true,
    y_pred_classes,
    target_names=label_encoder.classes_,
)
print(test_report)


Total data: 2726
Found 2000000 word vectors.




Epoch 1/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 213ms/step - accuracy: 0.3785 - loss: 2.7807 - val_accuracy: 0.4340 - val_loss: 2.0537
Epoch 2/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 212ms/step - accuracy: 0.4531 - loss: 1.9797 - val_accuracy: 0.5355 - val_loss: 1.5895
Epoch 3/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 208ms/step - accuracy: 0.4939 - loss: 1.5495 - val_accuracy: 0.5685 - val_loss: 1.3147
Epoch 4/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 209ms/step - accuracy: 0.5352 - loss: 1.3123 - val_accuracy: 0.5587 - val_loss: 1.1892
Epoch 5/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 218ms/step - accuracy: 0.5459 - loss: 1.1703 - val_accuracy: 0.5697 - val_loss: 1.0886
Epoch 6/20
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 208ms/step - accuracy: 0.5500 - loss: 1.0773 - val_accuracy: 0.5685 - val_loss: 1.0227
Epoch 7/20