### **Imports and Config**

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# Reproducibility: one call seeds Python's `random`, NumPy and TensorFlow.
# (Seeding only NumPy and TensorFlow leaves Python's `random` module unseeded.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Paths
DATA_DIR = Path("/kaggle/input/fake-news-splits/fake-news-splits")  # <-- change to your dataset slug
WORK_DIR = Path("/kaggle/working")

# Input/output
INPUT_COL = "text_clean"   # column whose text is tokenized
LABEL_COL = "label"        # column holding the class label

# Tokenizer / sequence
NUM_WORDS = 50000          # `num_words` given to the Tokenizer
MAX_LEN   = 256            # `maxlen` used when padding/truncating sequences
OOV_TOKEN = "<UNK>"        # placeholder token for out-of-vocabulary words

# Model settings
EMBED_DIM  = 128           # embedding output dimension
LSTM_UNITS = 128           # units of the LSTM wrapped by Bidirectional
DROPOUT    = 0.3           # rate used by both Dropout layers
LR         = 1e-3          # Adam learning rate
BATCH_SIZE = 64
EPOCHS     = 8

2025-09-29 09:19:34.861940: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759137575.208053      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759137575.307376      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### **Load Data and Labels**

In [2]:
def load_split(filename):
    """Read one CSV split from DATA_DIR and normalize its text and label columns.

    Rows whose text or label is missing are dropped *before* the string cast:
    ``astype(str)`` would otherwise turn a missing value into the literal
    "nan" / "NAN", which would then be tokenized like a real word and, for
    labels, silently encoded as class 0 by ``labels_to_int``.

    Parameters
    ----------
    filename : str
        File name relative to DATA_DIR, e.g. "train.csv".

    Returns
    -------
    pandas.DataFrame
        The split with upper-cased, stripped labels and stripped text,
        re-indexed from 0.
    """
    raw = pd.read_csv(DATA_DIR / filename)
    kept = raw.dropna(subset=[INPUT_COL, LABEL_COL])

    n_dropped = len(raw) - len(kept)
    if n_dropped:
        # Make the filtering visible instead of silently changing split sizes.
        print(f"{filename}: dropped {n_dropped} row(s) with missing text or label")

    normalized = kept.assign(**{
        LABEL_COL: kept[LABEL_COL].astype(str).str.upper().str.strip(),
        INPUT_COL: kept[INPUT_COL].astype(str).str.strip(),
    })
    return normalized.reset_index(drop=True)


df_train = load_split("train.csv")
df_val   = load_split("val.csv")
df_test  = load_split("test.csv")

# Binary encoding of the label column
def labels_to_int(series):
    """Return a NumPy array holding 1 where the label equals "TRUE" and 0 elsewhere."""
    is_true = series.eq("TRUE")
    return is_true.astype(int).values

# Encode the label column of each split (train, validation, test) as 0/1.
y_tr, y_va, y_te = (
    labels_to_int(split[LABEL_COL]) for split in (df_train, df_val, df_test)
)

### **Tokenize and Pad**

In [3]:
# The vocabulary is fitted on the training split only.
tok = Tokenizer(
    num_words=NUM_WORDS,
    lower=True,
    oov_token=OOV_TOKEN,
)
train_texts = df_train[INPUT_COL]
tok.fit_on_texts(train_texts)

def to_padded(texts, tokenizer, max_len=MAX_LEN):
    """Convert texts to integer sequences and pad/truncate them to `max_len`.

    Both padding and truncation are applied at the end of each sequence
    ("post"). Returns whatever `pad_sequences` produces for those sequences.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

# Tokenize and pad every split with the tokenizer fitted on the training data.
X_tr, X_va, X_te = (
    to_padded(split[INPUT_COL], tok) for split in (df_train, df_val, df_test)
)

### **Build Model**

In [4]:
def build_model(vocab_size, max_len, embed_dim=None, lstm_units=None,
                dense_units=128, dropout=None, learning_rate=None):
    """Build and compile the bidirectional-LSTM binary classifier.

    Architecture: Embedding (mask_zero=True) -> Bidirectional LSTM returning
    the full sequence -> global max pooling over time -> dropout -> ReLU dense
    layer -> dropout -> single sigmoid unit.

    Parameters
    ----------
    vocab_size : int
        ``input_dim`` of the embedding layer.
    max_len : int
        Length of each integer input sequence.
    embed_dim, lstm_units, dropout, learning_rate : optional
        Hyperparameters. When left as None they fall back to the
        notebook-level EMBED_DIM, LSTM_UNITS, DROPOUT and LR, read at call
        time, so ``build_model(vocab_size, max_len)`` behaves as before.
    dense_units : int, default 128
        Width of the hidden dense layer.

    Returns
    -------
    A compiled Keras model using Adam, binary cross-entropy loss, and the
    metrics AUC (named "auc") and accuracy.
    """
    # Resolve defaults lazily so later edits to the config constants are honoured.
    embed_dim = EMBED_DIM if embed_dim is None else embed_dim
    lstm_units = LSTM_UNITS if lstm_units is None else lstm_units
    dropout = DROPOUT if dropout is None else dropout
    learning_rate = LR if learning_rate is None else learning_rate

    token_ids = layers.Input(shape=(max_len,), dtype="int32")
    x = layers.Embedding(input_dim=vocab_size,
                         output_dim=embed_dim,
                         mask_zero=True)(token_ids)
    x = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(x)
    # NOTE(review): confirm how GlobalMaxPooling1D treats the padding mask
    # created by mask_zero=True; padded steps may still take part in the max.
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    prob = layers.Dense(1, activation="sigmoid")(x)

    model = models.Model(token_ids, prob)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="binary_crossentropy",
                  metrics=[tf.keras.metrics.AUC(name="auc"), "accuracy"])
    return model

# Embedding size: capped at NUM_WORDS, and otherwise the number of fitted
# words plus one extra slot.
fitted_vocab = len(tok.word_index) + 1
vocab_size = min(NUM_WORDS, fitted_vocab)

model = build_model(vocab_size, MAX_LEN)
model.summary()

I0000 00:00:1759137606.823764      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1759137606.824405      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


### **Train the Model**

In [5]:
# Early stopping watches validation AUC (higher is better) with a patience of
# 2 epochs and is configured with restore_best_weights=True.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=2,
    restore_best_weights=True,
)
callbacks = [early_stop]

history = model.fit(
    x=X_tr,
    y=y_tr,
    validation_data=(X_va, y_va),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/8


I0000 00:00:1759137613.838175     102 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.9378 - auc: 0.9740 - loss: 0.1518 - val_accuracy: 0.9976 - val_auc: 0.9995 - val_loss: 0.0103
Epoch 2/8
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 30ms/step - accuracy: 0.9980 - auc: 0.9993 - loss: 0.0097 - val_accuracy: 0.9969 - val_auc: 0.9996 - val_loss: 0.0115
Epoch 3/8
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 31ms/step - accuracy: 0.9993 - auc: 0.9998 - loss: 0.0038 - val_accuracy: 0.9991 - val_auc: 0.9996 - val_loss: 0.0049
Epoch 4/8
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 31ms/step - accuracy: 0.9999 - auc: 1.0000 - loss: 3.9462e-04 - val_accuracy: 0.9982 - val_auc: 0.9998 - val_loss: 0.0076
Epoch 5/8
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 31ms/step - accuracy: 0.9999 - auc: 1.0000 - loss: 4.0808e-04 - val_accuracy: 0.9993 - val_auc: 0.9995 - val_loss: 0.0075
Epoch 6/8
[1m562/562[0m 

### **Saving the Model and Tokenizer**

In [6]:
# Save the trained model
model_path = WORK_DIR / "best_lstm.keras"
model.save(model_path)

# Save the tokenizer. `tok.to_json()` is written to disk verbatim; the `json`
# module itself is not needed in this cell (it is imported in the first cell).
with open(WORK_DIR / "tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tok.to_json())

### **Metrics**

In [7]:
# Predicted probabilities on the test split, thresholded at 0.5 for hard labels.
y_prob = model.predict(X_te, batch_size=2 * BATCH_SIZE).ravel()
y_pred = (y_prob >= 0.5).astype(int)

prec, rec, f1, _ = precision_recall_fscore_support(y_te, y_pred, average="binary")
raw_scores = {
    "accuracy": accuracy_score(y_te, y_pred),
    "precision": prec,
    "recall": rec,
    "f1": f1,
    "roc_auc": roc_auc_score(y_te, y_prob),  # uses probabilities, not hard labels
}

# Cast to plain Python floats so the dict can be serialized as JSON.
metrics = {name: float(value) for name, value in raw_scores.items()}
metrics

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step


{'accuracy': 0.9988864142538976,
 'precision': 0.9995324918186068,
 'recall': 0.9981325863678805,
 'f1': 0.9988320485867788,
 'roc_auc': 0.9999578479887129}

### **Saving Metrics**

In [8]:
# Write the metrics dict to WORK_DIR as indented JSON.
metrics_path = WORK_DIR / "metrics.json"
with metrics_path.open("w") as fh:
    fh.write(json.dumps(metrics, indent=2))