In [1]:
import os
from pathlib import Path

# Create the required folders if they don't exist
folders_to_create = ['../models', '../results']

for folder in folders_to_create:
    Path(folder).mkdir(parents=True, exist_ok=True)
    print(f"Created/verified folder: {folder}")

# Check if the data files exist
data_files = ['../data/train.csv', '../data/val.csv', '../data/test.csv']

for file in data_files:
    if os.path.exists(file):
        print(f"✓ Found: {file}")
    else:
        print(f"✗ MISSING: {file} - Ask your leader for these files!")

Created/verified folder: ../models
Created/verified folder: ../results
✓ Found: ../data/train.csv
✓ Found: ../data/val.csv
✓ Found: ../data/test.csv


## LSTM Modeling - with files check

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print(f"TensorFlow version: {tf.__version__}")


ModuleNotFoundError: No module named 'pandas'

In [74]:
# SIMPLE & SAFE LSTM MODEL
def create_guaranteed_lstm(vocab_size, embedding_dim, sequence_length):
    """
    Ultra-simple LSTM that GUARANTEES no compatibility issues
    """
    model = Sequential()
    
    # Embedding layer
    model.add(Embedding(
        input_dim=vocab_size, 
        output_dim=embedding_dim, 
        input_length=sequence_length
    ))
    
    # SINGLE LSTM layer - no return_sequences, no bidirectional
    model.add(LSTM(64))  # This will definitely work
    
    # Simple dense layers
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 'precision', 'recall']
    )
    
    return model

In [75]:
# LOAD AND PREPARE DATA
# Check files
data_files = ['../data/train.csv', '../data/val.csv', '../data/test.csv']
all_files_exist = all(os.path.exists(f) for f in data_files)

if not all_files_exist:
    print(" Missing data files. Please check the file paths.")
else:
    print(" All files found!")
    
    # Load data
    df_train = pd.read_csv("../data/train.csv")
    df_val = pd.read_csv("../data/val.csv") 
    df_test = pd.read_csv("../data/test.csv")

    print(f"\n Dataset sizes: Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")

    # Prepare data
    X_train = df_train['combined_text'].fillna('').astype(str).values
    X_val = df_val['combined_text'].fillna('').astype(str).values
    X_test = df_test['combined_text'].fillna('').astype(str).values

    y_train = (df_train['label'] == 'FAKE').astype(int).values
    y_val = (df_val['label'] == 'FAKE').astype(int).values
    y_test = (df_test['label'] == 'FAKE').astype(int).values

 All files found!

 Dataset sizes: Train=35918, Val=4490, Test=4490


In [88]:
# Tokenization
MAX_SEQUENCE_LENGTH = 500
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(f" Data shape: {X_train_pad.shape}")
print(f" Vocabulary size: {len(tokenizer.word_index)}")

 Data shape: (35918, 500)
 Vocabulary size: 89319


In [99]:
# BUILD & TRAIN MODEL
vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)

# Ensure input data is correct type for LSTM
X_train_pad = X_train_pad.astype('int32')
X_val_pad = X_val_pad.astype('int32')
X_test_pad = X_test_pad.astype('int32')

print("X_train_pad shape:", X_train_pad.shape)
print("X_train_pad dtype:", X_train_pad.dtype)
print("First row:", X_train_pad[0])
print("Contains NaN:", np.isnan(X_train_pad).any())

print("🏗️ Building guaranteed-working LSTM model...")
lstm_model = create_guaranteed_lstm(vocab_size, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH)

print("✅ Model built successfully!")
lstm_model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

# Train
print("\n🚀 Starting training...")
history = lstm_model.fit(
    X_train_pad, y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
    verbose=1
)

X_train_pad shape: (35918, 500)
X_train_pad dtype: int32
First row: [ 2458  4415   674   214   270   555   439    88   557    91     2   700
  1792    42   674   214  1480    67   270   191   233   669     2   700
  4662  1420    25    17  1334  1153   674   214  2035  4662  3676 11652
 12770 12321  6694  2373  6831     4   674   214     1   537   304   251
   270   555   439   219     2    74   700 11652 12770  5742    25    17
   674   214   235   669   213   430   240  1004    22  2956   976   568
    25    17  3328   555   439    32   178   700  1792   270 14478    91
 11652  1067    25    17   140  1004  3130  4904  6874 17759   277    69
   617    94   107     2  3784  5742   674   214     1     1     1     1
   270   537  7325     2  4905 19033   166  3469  2114 14058   994   124
    57  2748  8829   855  8829   170  2691   239   165 19034  7831     1
  2114 14058 10163  2559 11652 12770    25    17     1     1     1     1
   270   537   674   214   907   334   332   202   788  

NotImplementedError: Cannot convert a symbolic Tensor (lstm_17/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [100]:
# Upgrade TensorFlow and Keras using pip (run this cell, then restart the kernel)
!pip install --upgrade tensorflow keras

Collecting tensorflow
  Downloading tensorflow-2.13.1-cp38-cp38-win_amd64.whl (1.9 kB)
Collecting keras
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)


ERROR: Could not find a version that satisfies the requirement tensorflow-intel==2.13.1; platform_system == "Windows" (from tensorflow) (from versions: 0.0.1, 2.10.0.dev20220728, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.10.1, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2, 2.11.0, 2.11.1, 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0)
ERROR: No matching distribution found for tensorflow-intel==2.13.1; platform_system == "Windows" (from tensorflow)


In [101]:
import tensorflow as tf
import keras
print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)

TensorFlow version: 2.3.1
Keras version: 2.4.3
