In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import os
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session. No category or
# module filter is given, so this also hides deprecation notices.
warnings.filterwarnings('ignore')


In [None]:
import os
from pathlib import Path

# Make sure the output directories exist (a no-op when they are already there).
folders_to_create = ['../models', '../results']

for folder in folders_to_create:
    os.makedirs(folder, exist_ok=True)
    print(f"Created/verified folder: {folder}")

# Report, for each expected dataset CSV, whether it is present on disk.
data_files = ['../data/train.csv', '../data/val.csv', '../data/test.csv']

for file in data_files:
    print(
        f" Found: {file}"
        if Path(file).exists()
        else f" MISSING: {file} - Ask for these files!"
    )

In [None]:
# ==================== FILE CHECK ====================
# Verify that every dataset split exists before loading. The load cell reads
# these exact paths with pd.read_csv, so a missing file is fatal: list every
# missing path first, then stop here with a clear error instead of letting the
# run continue and fail later with a less helpful traceback.
print("=== Checking required files ===")
data_files = {
    'train': '../data/train.csv',
    'val': '../data/val.csv',
    'test': '../data/test.csv'
}

# Collect all missing paths (rather than stopping at the first) so the user
# sees the complete list in one run.
missing_files = []
for name, path in data_files.items():
    if os.path.exists(path):
        print(f" Found {name} data: {path}")
    else:
        print(f" MISSING: {path}")
        missing_files.append(path)

if missing_files:
    print(f"\n ERROR: Missing {len(missing_files)} files!:")
    for file in missing_files:
        print(f"   - {file}")
    print("\nThe LSTM code cannot run without these files!")
    # Halt "Run All" at this cell; the following cells cannot succeed.
    raise FileNotFoundError(
        f"Missing {len(missing_files)} required data file(s): "
        + ", ".join(missing_files)
    )
else:
    print("\n All files found! Starting LSTM training...")

In [None]:
# ==================== LOAD DATA ====================
# Read the three splits from ../data, one DataFrame per split.
df_train = pd.read_csv("../data/train.csv")
df_val = pd.read_csv("../data/val.csv")
df_test = pd.read_csv("../data/test.csv")

# Summarise how many rows each split holds.
print("\nDataset sizes:")
for split_label, split_frame in (
    ("Training", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_label}: {len(split_frame)} samples")

# Number of training rows per distinct value of the 'label' column.
train_label_counts = df_train['label'].value_counts().to_dict()
print(f"Class distribution: {train_label_counts}")

In [None]:
 # ==================== DATA PREPARATION ====================
# Model inputs come from the 'combined_text' column of each split; targets
# come from the 'label' column.
def _extract_text(frame):
    """Return ``combined_text`` as an array of str, with missing values as ''."""
    return frame['combined_text'].fillna('').astype(str).values


def _encode_labels(frame):
    """Return an int array: 1 where ``label`` equals 'FAKE', 0 for anything else."""
    return (frame['label'] == 'FAKE').astype(int).values


X_train, X_val, X_test = (
    _extract_text(split) for split in (df_train, df_val, df_test)
)

# Binary targets: 'FAKE' -> 1, every other label value -> 0.
y_train, y_val, y_test = (
    _encode_labels(split) for split in (df_train, df_val, df_test)
)

# Training-set balance: positives are the 1s, the remainder is reported as REAL.
n_fake = y_train.sum()
n_real = len(y_train) - n_fake
print(f"Labels - FAKE: {n_fake}, REAL: {n_real}")
