In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, roc_curve, auc, f1_score, roc_auc_score
)
from xgboost import XGBClassifier
import joblib
import re
import warnings
warnings.filterwarnings('ignore')

# Load the combined dataset from CSV. The path is relative to the notebook's
# working directory (the file is presumably produced by the data-collection notebook).
DATA_CSV_PATH = "../data/combined_final.csv"
df = pd.read_csv(DATA_CSV_PATH)

## Data Preparation and Train-Test Split

In [None]:
# Model inputs come from the "title_text" column, targets from the "label" column.
X, y = df["title_text"], df["label"]

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# partition is the same on every run.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

In [None]:
def normalize_series(series: pd.Series) -> pd.Series:
    # Ensure everything is a string
    series = series.fillna('').astype(str)

    # Convert to lowercase
    series = series.str.lower()

    # Remove URLs
    series = series.str.replace(r'https?://\S+|www\.\S+', '', regex=True)

    # Remove non-word characters
    series = series.str.replace(r'\W', ' ', regex=True)

    # Remove newlines
    series = series.str.replace(r'\n', '', regex=True)

    # Replace multiple spaces with a single space
    series = series.str.replace(r' +', ' ', regex=True)

    # Strip leading/trailing spaces
    series = series.str.strip()

    return series


# Clean both partitions of the hold-out split with the same normalization routine
X_train, X_test = (
    normalize_series(pd.Series(texts)) for texts in (X_train, X_test)
)

In [None]:
# TF-IDF configuration
tfidf_params = dict(
    max_features=8000,        # tune up to 15K for higher accuracy
    stop_words="english",
    ngram_range=(1, 2),       # unigrams + bigrams
)
tfidf = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training text only and then reused,
# unchanged, to transform the test text.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print("TF-IDF shape:", X_train_tfidf.shape)

## Baseline Model: Logistic Regression

In [None]:
# Fit the logistic-regression baseline on the TF-IDF training matrix
lr = LogisticRegression(C=3.0, max_iter=300, n_jobs=-1)
lr.fit(X_train_tfidf, y_train)

# Hard class predictions, plus the probability assigned to the second class (column 1)
y_pred_lr = lr.predict(X_test_tfidf)
y_pred_prob_lr = lr.predict_proba(X_test_tfidf)[:, 1]

# Hold-out metrics, computed first and reported below
lr_accuracy_pct = round(accuracy_score(y_test, y_pred_lr) * 100, 2)
lr_report = classification_report(y_test, y_pred_lr, digits=3)

print("\nLogistic Regression - Model Evaluation:")
print("Accuracy:", lr_accuracy_pct, "%")
print("\nClassification Report:\n", lr_report)

In [None]:
# Persist the fitted classifier together with the vectorizer it was trained with;
# both files are written to the current working directory.
for fitted_obj, out_path in (
    (lr, "fake_news_lr_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(fitted_obj, out_path)
print("\nModels saved successfully: 'fake_news_lr_model.pkl' and 'tfidf_vectorizer.pkl'")

In [None]:
# Sanity-check the logistic-regression model on one hand-written headline.
sample = ["Breaking: President announces new healthcare reform."]

# Run the headline through the same normalization that was applied to the
# training text before vectorizing, so inference sees identically cleaned input.
sample_clean = normalize_series(pd.Series(sample))
sample_tfidf = tfidf.transform(sample_clean)
prediction = lr.predict(sample_tfidf)[0]

# Class 1 is reported as real news and anything else as fake news.
# NOTE(review): confirm this matches the encoding of df["label"].
label = "Real News" if prediction == 1 else "Fake News"
print(f"\nPrediction for sample:\n→ {label}")

## Baseline Model: XGBoost

In [None]:
# Hyper-parameters for the XGBoost classifier, kept in one place so they are easy to tune
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,          # fraction of rows passed per tree
    "colsample_bytree": 0.8,   # fraction of columns passed per tree
    "use_label_encoder": False,
    "n_jobs": -1,
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)

In [None]:
# Train the XGBoost classifier on the same TF-IDF training matrix and labels
# that the logistic-regression model was fitted on.
xgb_model.fit(X_train_tfidf, y_train)

In [None]:
# Hard class predictions, plus the probability assigned to the second class (column 1)
y_pred_xgb = xgb_model.predict(X_test_tfidf)
y_pred_prob_xgb = xgb_model.predict_proba(X_test_tfidf)[:, 1]

# Hold-out metrics, computed first and reported below
xgb_accuracy_pct = round(accuracy_score(y_test, y_pred_xgb) * 100, 2)
xgb_report = classification_report(y_test, y_pred_xgb, digits=3)

print("\nXGBoost - Model Evaluation:")
print("Accuracy:", xgb_accuracy_pct, "%")
print("\nClassification Report:\n", xgb_report)

In [None]:
# Persist the fitted XGBoost model; the TF-IDF vectorizer is written again so
# this cell produces both files even when it is run on its own.
for fitted_obj, out_path in (
    (xgb_model, "fake_news_xgb_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(fitted_obj, out_path)
print("\nModels saved successfully: 'fake_news_xgb_model.pkl' and 'tfidf_vectorizer.pkl'")

In [None]:
# Sanity-check the XGBoost model on one hand-written headline.
sample = ["Breaking: President announces new healthcare reform."]

# Run the headline through the same normalization that was applied to the
# training text before vectorizing, so inference sees identically cleaned input.
sample_clean = normalize_series(pd.Series(sample))
sample_tfidf = tfidf.transform(sample_clean)
prediction = xgb_model.predict(sample_tfidf)[0]

# Class 1 is reported as real news and anything else as fake news.
# NOTE(review): confirm this matches the encoding of df["label"].
label = "Real News" if prediction == 1 else "Fake News"
print(f"\nPrediction for sample:\n→ {label}")

## K-Fold Cross Validation using XGBoost

In [None]:
# 5-fold cross-validation of the XGBoost configuration on the full dataset.
#
# Design notes:
# * The text is normalized exactly like the hold-out experiments above (this
#   also turns missing values into empty strings, which the vectorizer needs).
# * A fresh TF-IDF vectorizer is fitted inside every fold on that fold's
#   training rows only, so no vocabulary/IDF statistics leak from the
#   validation rows into training.
# * Fold-local names are used throughout: the hold-out variables
#   (X_train, X_test, y_train, y_test) and the saved `tfidf` vectorizer are
#   left untouched for the cells that follow.
cv_texts = normalize_series(X)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

cv_xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42,
)

acc_scores, f1_scores, auc_scores = [], [], []

for fold, (train_idx, val_idx) in enumerate(kfold.split(cv_texts), start=1):
    print(f"\nFold {fold}")

    # Vectorizer fitted on this fold's training text only
    fold_tfidf = TfidfVectorizer(max_features=10000, stop_words="english", ngram_range=(1, 2))
    X_fold_train = fold_tfidf.fit_transform(cv_texts.iloc[train_idx])
    X_fold_val = fold_tfidf.transform(cv_texts.iloc[val_idx])
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    # A new, untrained model per fold so no state carries over between folds
    fold_model = XGBClassifier(**cv_xgb_params)
    fold_model.fit(X_fold_train, y_fold_train)

    fold_pred = fold_model.predict(X_fold_val)
    fold_pred_prob = fold_model.predict_proba(X_fold_val)[:, 1]

    fold_acc = accuracy_score(y_fold_val, fold_pred)
    fold_f1 = f1_score(y_fold_val, fold_pred)
    fold_auc = roc_auc_score(y_fold_val, fold_pred_prob)

    acc_scores.append(fold_acc)
    f1_scores.append(fold_f1)
    auc_scores.append(fold_auc)

    print(f"Accuracy: {fold_acc:.4f} | F1: {fold_f1:.4f} | AUC: {fold_auc:.4f}")

# Overall Performance (mean ± standard deviation across the folds)
print("\nAverage Performance (5-Fold):")
print(f"Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.3f}")
print(f"F1-score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.3f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.3f}")

## Deep Learning Model: Long Short-Term Memory (LSTM)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyper-parameters shared by the tokenizer, padding and Bi-LSTM model cells below
vocab_size = 10000       # passed as Tokenizer(num_words=...) and as the Embedding layer's input size
embedding_dim = 64       # Embedding output size; also reused as the first LSTM's units and the Dense layer's width
max_length = 256         # `maxlen` for pad_sequences: every sequence is padded/truncated to this length
trunc_type = 'post'      # `truncating` argument for pad_sequences
padding_type = 'post'    # `padding` argument for pad_sequences
oov_tok = '<OOV>'        # `oov_token` argument for the Tokenizer

In [None]:
# Build the word index from the training text only, then convert both splits
# into fixed-length padded sequences.
# X_train / X_test must hold text when this cell runs; they are rebound to the
# padded sequences here, so the cell cannot be re-run without first
# re-creating the text split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# Padding/truncation settings shared by both splits
pad_kwargs = dict(padding=padding_type, truncating=trunc_type, maxlen=max_length)

X_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train), **pad_kwargs
)
X_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_test), **pad_kwargs
)

In [None]:
# Bi-LSTM classifier: embedding -> two stacked bidirectional LSTMs -> dense head
lstm_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # return_sequences=True so the second recurrent layer receives the full sequence
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(embedding_dim, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single output unit with no activation; the compile cell uses from_logits=True
    tf.keras.layers.Dense(1),
]
model_lstm = tf.keras.Sequential(lstm_layers)

In [None]:
# Callbacks: both monitor the validation loss.
# NOTE(review): both use patience=2, so the learning-rate reduction would fire
# on the same epoch at which early stopping halts training — confirm intended.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
)

# The model's last layer is Dense(1) without an activation, hence from_logits=True.
# NOTE(review): the 'accuracy' metric is computed on those raw outputs — confirm
# which decision threshold Keras applies; if it is 0.5 on the logit (rather than 0),
# the reported accuracy is slightly off even though the loss is correct.
model_lstm.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy'],
)

# Train; validation_split=0.1 holds out part of the training data for the callbacks above
fit_kwargs = dict(
    epochs=10,
    validation_split=0.1,
    verbose=1,
    batch_size=30,
    shuffle=True,
    callbacks=[early_stop, reduce_lr],
)
history_lstm = model_lstm.fit(X_train, y_train, **fit_kwargs)

In [None]:
# Model summary: print the layer-by-layer overview of the Bi-LSTM.
# This cell sits after the training cell in the notebook.
model_lstm.summary()

In [None]:
# Save the LSTM model to a .keras file and pickle the fitted tokenizer next to it;
# the tokenizer holds the word index the model's inputs were built with.
model_lstm.save("fake_news_bi_lstm_model.keras")
joblib.dump(tokenizer, "tokenizer.pkl")

# Optionally zip it to download later (shell command; needs the `zip` utility on the host)
!zip -r fake_news_bi_lstm_model.zip fake_news_bi_lstm_model.keras

## Training Visualization

In [None]:
# Per-epoch metrics recorded by model.fit
history_dict = history_lstm.history

acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']
epochs = history_lstm.epoch


def _plot_history(train_values, val_values, train_fmt, train_label, val_label,
                  title, ylabel, ylim=None):
    """Draw one training-vs-validation curve over the epochs and show the figure.

    The validation curve is always blue; `train_fmt` sets the training curve's
    colour. `ylim`, when given, fixes the y-axis range.
    """
    plt.figure(figsize=(10,6))
    plt.plot(epochs, train_values, train_fmt, label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title, size=15)
    plt.xlabel('Epochs', size=15)
    plt.ylabel(ylabel, size=15)
    plt.legend(prop={'size': 15})
    if ylim is not None:
        plt.ylim(ylim)
    plt.show()


_plot_history(
    loss, val_loss, 'r', 'Training loss', 'Validation loss',
    'Training and validation loss', 'Loss',
)
_plot_history(
    acc, val_acc, 'g', 'Training acc', 'Validation acc',
    'Training and validation accuracy', 'Accuracy', ylim=(0.5,1),
)

**Training Summary**

*1. Training Loss*
* Training loss decreases steadily and consistently across epochs.
* This indicates that the model is learning the training data well.

*2. Validation Loss*
* Validation loss decreases initially but then begins to fluctuate and slightly increase after ~3 epochs.
* This suggests the model starts to overfit after the 3rd or 4th epoch.

*3. Training Accuracy*
* Training accuracy increases smoothly and reaches close to 0.95 by the end.
* This is expected as the model learns patterns in the training set.

*4. Validation Accuracy*
* Validation accuracy improves initially and stabilizes around 0.88–0.90.
* After a few epochs, it does not improve further, hinting that more training does not help generalization.

## Upload Models to Hugging Face (Optional)

In [None]:
# Steps to Push Models in Hugging Face Repo for Later Evaluation
!pip install huggingface_hub
from huggingface_hub import login
login()   # paste your HF access token here

In [None]:
from huggingface_hub import create_repo

# Target repository on the Hugging Face Hub; the upload cell below reuses this id.
repo_id = "dl-quad/fake-news-bi-lstm-dl-quadrilateral"
# exist_ok=True is passed so that, presumably, re-running this cell does not
# fail when the repository already exists.
create_repo(repo_id, exist_ok=True)

In [None]:
from huggingface_hub import upload_file

# Every artifact to publish: (local file, destination path in the repo, commit message).
# Local paths are relative to the working directory, i.e. exactly where the
# save cells above wrote the files, so the upload works outside Colab's
# /content directory as well.
artifacts_to_upload = [
    # Logistic Regression
    ("fake_news_lr_model.pkl", "logistic_regression/fake_news_lr_model.pkl",
     "Upload Logistic Regression model"),
    # XGBoost
    ("fake_news_xgb_model.pkl", "xgboost/fake_news_xgb_model.pkl",
     "Upload XGBoost model"),
    # LSTM model
    ("fake_news_bi_lstm_model.keras", "lstm/fake_news_bi_lstm_model.keras",
     "Upload LSTM model"),
    # Tokenizer for LSTM
    ("tokenizer.pkl", "lstm/tokenizer.pkl",
     "Upload tokenizer"),
    # TF-IDF vectorizer
    ("tfidf_vectorizer.pkl", "tfidf_vectorizer/tfidf_vectorizer.pkl",
     "Upload TF-IDF vectorizer"),
]

for local_path, path_in_repo, commit_message in artifacts_to_upload:
    upload_file(
        path_or_fileobj=local_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
        commit_message=commit_message,
    )

print("All models and artifacts uploaded successfully!")