In [1]:
import os, gzip, numpy as np, pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Layer, Input, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import pickle
import tensorflow as tf

nltk.download('punkt')
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Parameters
SEQ_LEN = 50
EMBEDDING_DIM = 300
BATCH_SIZE = 128
EPOCHS = 15
NUM_CLASSES = 3

FASTTEXT_EN_GZ = "cc.en.300.vec.gz"
FASTTEXT_HI_GZ = "cc.hi.300.vec.gz"
DATA_CSV = "final_cleaned_dataset.csv"
# Dataset
df = pd.read_csv(DATA_CSV)
print("Dataset shape:", df.shape)
print("Label distribution:\n", df['label'].value_counts())

texts = df['clean_tweet'].astype(str).str.lower().tolist()
labels = df['label'].astype(int).values

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
X = pad_sequences(seqs, maxlen=SEQ_LEN, padding="post")
y = labels
vocab_size = len(tokenizer.word_index) + 1
print("Vocab size:", vocab_size)

# stratified split
x_train, x_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_SEED)

# Load FastText embeddings (subset for vocab)
def load_fasttext_subset(vec_gz, vocab, dim):
    found = {}
    with gzip.open(vec_gz, "rt", encoding="utf-8", errors="ignore") as f:
        header = next(f)
        for line in tqdm(f, desc=f"Reading {os.path.basename(vec_gz)}"):
            parts = line.rstrip().split(" ")
            if len(parts) < dim+1: continue
            word = parts[0]
            if word in vocab:
                found[word] = np.asarray(parts[1:1+dim], dtype=np.float32)
            if len(found) >= len(vocab): break
    return found

vocab_tokens = set(tokenizer.word_index.keys())
vectors = {}
if os.path.exists(FASTTEXT_HI_GZ):
    vectors.update(load_fasttext_subset(FASTTEXT_HI_GZ, vocab_tokens, EMBEDDING_DIM))
if os.path.exists(FASTTEXT_EN_GZ):
    remain = vocab_tokens - set(vectors.keys())
    vectors.update(load_fasttext_subset(FASTTEXT_EN_GZ, remain, EMBEDDING_DIM))

rng = np.random.RandomState(RANDOM_SEED)
embedding_matrix = rng.normal(scale=0.01, size=(vocab_size, EMBEDDING_DIM)).astype(np.float32)
for word, idx in tokenizer.word_index.items():
    if word in vectors:
        embedding_matrix[idx] = vectors[word]
        
# Class weights
classes = np.unique(y_train)
cw = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {int(c): float(w) for c,w in zip(classes,cw)}
print("Class weights:", class_weight_dict)

# Custom Attention Layer
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

# Model
inp = Input(shape=(SEQ_LEN,))
emb = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix],
                trainable=True)(inp)
x = SpatialDropout1D(0.25)(emb)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.1))(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.1))(x)

att = AttentionLayer()(x)
maxp = GlobalMaxPooling1D()(x)
avgp = GlobalAveragePooling1D()(x)
x = Concatenate()([att, maxp, avgp])

x = Dense(128, activation="relu")(x)
x = Dropout(0.3)(x)
x = Dense(64, activation="relu")(x)
out = Dense(NUM_CLASSES, activation="softmax")(x)

model = Model(inputs=inp, outputs=out)
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=Adam(1e-4), metrics=["accuracy"])
model.summary()

# Training
es = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
rlp = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-7)
mc = ModelCheckpoint("best_model.keras", monitor="val_loss", save_best_only=True)

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight_dict,
    callbacks=[es, rlp, mc],
    verbose=1
)
# Evaluation
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print(f"\nTest Accuracy: {acc*100:.2f}%")

y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

print("\nClassification report (macro-F1 focus):")
print(classification_report(y_test, y_pred, target_names=["negative","neutral","positive"]))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

# Save
model.save("sentiment_model_final.keras")
with open("tokenizer.pkl","wb") as f: pickle.dump(tokenizer,f)
print("Final model and tokenizer saved.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset shape: (89926, 5)
Label distribution:
 label
2    39396
1    30182
0    20348
Name: count, dtype: int64
Vocab size: 79900
Class weights: {0: 1.473153950116722, 1: 0.9931662870159453, 2: 0.7608592188342799}



Epoch 1/15
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 578ms/step - accuracy: 0.6127 - loss: 0.8459 - val_accuracy: 0.7735 - val_loss: 0.6029 - learning_rate: 1.0000e-04
Epoch 2/15
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 552ms/step - accuracy: 0.8225 - loss: 0.4923 - val_accuracy: 0.8637 - val_loss: 0.4331 - learning_rate: 1.0000e-04
Epoch 3/15
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 559ms/step - accuracy: 0.8944 - loss: 0.3246 - val_accuracy: 0.8728 - val_loss: 0.4223 - learning_rate: 1.0000e-04
Epoch 4/15
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 561ms/step - accuracy: 0.9246 - loss: 0.2446 - val_accuracy: 0.8616 - val_loss: 0.4418 - learning_rate: 1.0000e-04
Epoch 5/15
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 565ms/step - accuracy: 0.9401 - loss: 0.1993 - val_accuracy: 0.8584 - val_loss: 0.4725 - learning_rate: 1.0000e-04
Epoch 6/15
[1m563/563[0m [3