In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


In [None]:
import os
from pathlib import Path

# Output directories used by this notebook; each one is created (together
# with any missing parents) when it is not already present.
folders_to_create = ['../models', '../results']
for folder in folders_to_create:
    Path(folder).mkdir(parents=True, exist_ok=True)
    print(f"Created/verified folder: {folder}")

# Report, for every expected CSV, whether it is present on disk.
data_files = ['../data/train.csv', '../data/val.csv', '../data/test.csv']
for file in data_files:
    if not os.path.exists(file):
        print(f" MISSING: {file} - Ask for these files!")
        continue
    print(f" Found: {file}")

In [None]:
# ==================== FILE CHECK ====================
print("=== Checking required files ===")

# Split name -> CSV path that the loading cell expects to find.
data_files = dict(
    train='../data/train.csv',
    val='../data/val.csv',
    test='../data/test.csv',
)

# Walk the splits in declaration order, collecting every path that is absent.
missing_files = []
for name, path in data_files.items():
    if not os.path.exists(path):
        print(f" MISSING: {path}")
        missing_files.append(path)
        continue
    print(f" Found {name} data: {path}")

# Final verdict: either everything is in place, or list what is missing.
if not missing_files:
    print("\n All files found! Starting LSTM training...")
else:
    print(f"\n ERROR: Missing {len(missing_files)} files!:")
    for file in missing_files:
        print(f"   - {file}")
    print("\nThe LSTM code cannot run without these files!")

In [None]:
# ==================== LOAD DATA ====================
# Read the three splits into separate frames.
df_train = pd.read_csv("../data/train.csv")
df_val = pd.read_csv("../data/val.csv")
df_test = pd.read_csv("../data/test.csv")

# Row counts per split, emitted as one multi-line report.
print(
    f"\nDataset sizes:",
    f"Training: {len(df_train)} samples",
    f"Validation: {len(df_val)} samples",
    f"Test: {len(df_test)} samples",
    sep="\n",
)

# Label frequencies of the training split, as a plain dict.
print(f"Class distribution: {df_train['label'].value_counts().to_dict()}")

In [None]:
 # ==================== DATA PREPARATION ====================
# Model input: the 'combined_text' column of each split, with missing
# values replaced by the empty string and every entry coerced to str.
X_train, X_val, X_test = (
    frame['combined_text'].fillna('').astype(str).values
    for frame in (df_train, df_val, df_test)
)

# Binary targets: rows labelled exactly 'FAKE' become 1, all other rows 0.
y_train, y_val, y_test = (
    frame['label'].eq('FAKE').astype(int).values
    for frame in (df_train, df_val, df_test)
)

# Class balance of the training targets.
print(f"Labels - FAKE: {y_train.sum()}, REAL: {len(y_train) - y_train.sum()}")


In [None]:
# ==================== TOKENIZATION  ====================
# Hyperparameters used by this cell (tokenizer / padding) and by the
# model-building cell that follows.
NUM_WORDS = 50000    # Tokenizer vocabulary size
MAX_LEN = 256        # Maximum sequence length
OOV_TOKEN = "<UNK>"  # Out-of-vocabulary token

EMBED_DIM = 128      # Embedding dimension
LSTM_UNITS = 128     # LSTM units
DROPOUT = 0.3        # Dropout rate
LR = 1e-3            # Learning rate
BATCH_SIZE = 64      # Batch size
EPOCHS = 8           # Number of epochs

# Fit the word index on the training texts only; validation and test texts
# are merely transformed with it below.
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(X_train)

# Convert each text to its sequence of integer token ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (at the end) or truncate (at the end) every sequence to MAX_LEN.
# NOTE: the first assignment previously carried a stray leading space, which
# made the whole cell fail with "IndentationError: unexpected indent".
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

print(f"\nData shapes :")
print(f"X_train_pad: {X_train_pad.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Max sequence length: {MAX_LEN}")


In [None]:
# ==================== BUILD LSTM MODEL  ====================
from tensorflow.keras.metrics import Precision, Recall
def create_lstm_model(vocab_size, embed_dim, lstm_units, dropout_rate, sequence_length,
                      learning_rate=None):
    """Build and compile a stacked bidirectional LSTM binary classifier.

    Layer stack, in order: Embedding -> SpatialDropout1D ->
    Bidirectional LSTM (``lstm_units``, returns the full sequence) ->
    Bidirectional LSTM (``lstm_units // 2``, returns the last state) ->
    Dense (``lstm_units // 2``, ReLU) -> Dropout -> Dense (1, sigmoid).

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embed_dim: ``output_dim`` of the embedding layer.
        lstm_units: Units of the first LSTM; the second LSTM and the hidden
            dense layer each use ``lstm_units // 2``.
        dropout_rate: Rate shared by the spatial dropout, the input dropout
            of both LSTMs and the dropout after the dense layer.
        sequence_length: ``input_length`` passed to the embedding layer.
        learning_rate: Learning rate for the Adam optimizer. ``None``
            (the default) falls back to the notebook-level ``LR`` constant,
            which is what this function always used before the parameter
            existed.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss;
        accuracy, precision and recall metrics).
    """
    if learning_rate is None:
        # Backward-compatible default: read the notebook-wide setting.
        learning_rate = LR

    model = Sequential([
        # Token ids -> dense vectors of size embed_dim.
        Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            input_length=sequence_length,
            name='embedding_layer'
        ),

        # Spatial dropout applied to the embedded sequence.
        SpatialDropout1D(dropout_rate, name='spatial_dropout'),

        # First bidirectional LSTM keeps the whole sequence so that a
        # second recurrent layer can be stacked on top of it.
        Bidirectional(
            LSTM(lstm_units, return_sequences=True, dropout=dropout_rate),
            name='bidirectional_lstm_1'
        ),

        # Second bidirectional LSTM, half the units, final state only.
        Bidirectional(
            LSTM(lstm_units // 2, dropout=dropout_rate),
            name='bidirectional_lstm_2'
        ),

        # Hidden dense layer followed by dropout.
        Dense(lstm_units // 2, activation='relu', name='dense_1'),
        Dropout(dropout_rate, name='dropout_1'),

        # Single sigmoid unit for the binary decision.
        Dense(1, activation='sigmoid', name='output_layer')
    ])

    # Explicit metric names keep the training-history keys stable
    # ('precision', 'recall') when this cell is re-run in the same kernel;
    # unnamed metric instances get an incrementing suffix on every rebuild.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
    )
    return model

    # Create model 
# Embedding input size: the fitted word index plus one (presumably for the
# reserved padding id 0 -- confirm), but never more than NUM_WORDS.
vocab_size = min(len(tokenizer.word_index) + 1, NUM_WORDS)

# Arguments are passed positionally in the function's declared order:
# vocab_size, embed_dim, lstm_units, dropout_rate, sequence_length.
lstm_model = create_lstm_model(vocab_size, EMBED_DIM, LSTM_UNITS, DROPOUT, MAX_LEN)

# Echo the configuration the model was built with, one item per line.
print(
    " Model created successfully!",
    f" Model Parameters:",
    f"   - Vocabulary size: {vocab_size}",
    f"   - Embedding dimension: {EMBED_DIM}",
    f"   - LSTM units: {LSTM_UNITS}",
    f"   - Dropout rate: {DROPOUT}",
    f"   - Learning rate: {LR}",
    f"   - Batch size: {BATCH_SIZE}",
    f"   - Epochs: {EPOCHS}",
    sep="\n",
)

lstm_model.summary()
