In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
from wordcloud import WordCloud
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedShuffleSplit

# Fetch the NLTK resources this notebook relies on: the punkt tokenizer
# models, WordNet (plus the omw-1.4 wordnet data) and the English stopword list.
# nltk.download() reports packages that are already up to date instead of
# fetching them again, so re-running this cell is cheap.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")
# Core Python / Utilities
import os
import json
import random
import pickle
from pathlib import Path
import warnings
import torch
import sys
import inspect

# Data Handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# NLP / Text Preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Scikit-learn (ML, metrics, preprocessing)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve
)
import joblib

# Deep Learning (LSTM)
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Dense,
    Dropout,
    SpatialDropout1D,
    Bidirectional
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# Transformers (BERT)
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  
import torch.nn as nn
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# NOTE(review): this rebinds the name `Dataset`. From here on it refers to
# `datasets.Dataset`, not the `torch.utils.data.Dataset` imported earlier in
# this cell. Confirm which one downstream cells expect; aliasing one of the
# two (e.g. `as HFDataset`) would remove the ambiguity.
from datasets import Dataset

# Utility for progress
from tqdm import tqdm



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **2.2 Load and Preview Data**

In [2]:
# Unpack the dataset archive into a local working folder
with zipfile.ZipFile("../data/archive.zip", mode='r') as archive:
    archive.extractall(path="unzipped_data")

print("Files extracted successfully!")

Files extracted successfully!


In [3]:
# Sanity check: show which files the archive contained
os.listdir("unzipped_data")

['Fake.csv', 'True.csv']

In [4]:
# Load the two raw CSVs: one with fake articles, one with true articles
fake_df, true_df = map(pd.read_csv, ("unzipped_data/Fake.csv", "unzipped_data/True.csv"))

# Report (rows, columns) for each source file
for caption, frame in (("Fake News Dataset:", fake_df), ("True News Dataset:", true_df)):
    print(caption, frame.shape)

fake_df.head()

Fake News Dataset: (23481, 4)
True News Dataset: (21417, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
# Merge the two sources into one labelled dataset

# Tag every row with its source class before merging
fake_df["label"] = "FAKE"
true_df["label"] = "TRUE"

# Stack both frames; ignore_index gives a fresh 0..n-1 index
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the rows so FAKE and TRUE are mixed (fixed seed for reproducibility)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the structure
print(df.shape)
print(df["label"].value_counts())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only adds a stray "None" line to the output).
df.info()
df.head()

(44898, 5)
label
FAKE    23481
TRUE    21417
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB
None


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE


## **Chapter 3. Data Preparation**
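In this section, we clean the article titles and bodies — lowercasing, removing URLs and non-alphabetic characters, dropping stopwords and lemmatizing — and then build a `combined_text` column for modelling.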

### **3.1 Lowercasing & URL removal**

**1. Defining Preprocessing Function**

In [6]:
def preprocess_text_lowercase_url(text):
    """
    Lowercase a document and strip URLs from it.

    Steps, in order:
    - missing values (None / NaN) become the empty string
    - the value is coerced to ``str``
    - URLs are removed: ``http(s)://...``, ``www....``, bare domains ending in
      a known TLD (com, org, net, edu, gov, io, co, uk), ``bit.ly/...`` and
      ``t.co/...`` links
    - the text is lowercased
    - runs of whitespace are collapsed to one space and the ends are trimmed

    Parameters
    ----------
    text : Any
        A single (scalar) document; typically a ``str`` or a missing value.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing input.
    """
    # Handle missing values
    if text is None or pd.isna(text):
        return ""

    # Convert to string to ensure consistent processing
    text = str(text)

    # URL pattern. Two details matter here:
    # - the TLD group is followed by \b, so a missing space after a full stop
    #   ("left.computers") is not mistaken for a domain and deleted;
    # - matching is case-insensitive because it runs before lowercasing,
    #   so "HTTP://Example.COM" is removed as well.
    url_pattern = (
        r'https?://\S+'
        r'|www\.\S+'
        r'|\S+\.(?:com|org|net|edu|gov|io|co|uk)\b\S*'
        r'|bit\.ly/\S+'
        r'|t\.co/\S+'
    )

    # Remove all URLs from text
    text = re.sub(url_pattern, '', text, flags=re.IGNORECASE)

    # Convert entire text to lowercase for consistency
    text = text.lower()

    # Clean up extra whitespace (also closes the gaps left by removed URLs)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

print("Preprocessing functions defined!\n")

Preprocessing functions defined!



**2. Quality check functions**

In [7]:
def contains_url(text):
    """Return True when the lowercased text looks like it contains a URL.

    The heuristic flags an ``http(s)://`` prefix, a ``www.`` prefix, or any
    dot followed by at least two lowercase letters.
    """
    lowered = str(text).lower()
    hit = re.search(r'https?://|www\.|\.[a-z]{2,}', lowered)
    return hit is not None

def count_uppercase(text):
    """Return how many characters of ``str(text)`` are uppercase."""
    total = 0
    for ch in str(text):
        if ch.isupper():
            total += 1
    return total

### **3.2 Remove Non-Alphabetic Characters**

In [8]:
URL_RE   = re.compile(r'https?://\S+|www\.\S+')
HTML_RE  = re.compile(r'<.*?>')
NONALPH  = re.compile(r'[^a-z\s]+')     # keep letters & spaces only
WS_RE    = re.compile(r'\s+')

# Defining Preprocessing Function
def _keep_alpha_only(text: str) -> str:
    text = NONALPH.sub(" ", text)    # remove non-letters
    text = WS_RE.sub(" ", text).strip()
    return text

### **3.3 Lemmatization**

In [9]:
# Shared NLP resources, built once and reused for every document
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_and_lemmatize(text):
    """Lowercase, strip non-letters, drop stopwords and lemmatize.

    Non-string input yields the empty string. Otherwise the text is
    lowercased, every character other than a-z or whitespace is deleted,
    the remainder is tokenized with NLTK, English stopwords are dropped and
    each surviving token is lemmatized. Tokens are re-joined with single spaces.
    """
    # Guard clause: anything that is not a string produces an empty document
    if not isinstance(text, str):
        return ""

    # Lowercase, then delete punctuation, numbers and special characters
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Tokenize, filter stopwords and lemmatize what is left
    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token not in stop_words:
            kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)


### **3.4 Apply Preprocessing**

**Defining Function**

In [None]:
# Full cleaning pipeline for a single document
def apply_preprocessing(text: str) -> str:
    """
    Run the three cleaning stages in sequence and return the result:
      1) preprocess_text_lowercase_url  [lowercase + URL removal + whitespace clean]
      2) _keep_alpha_only                [remove non-alphabetic, collapse spaces]
      3) preprocess_and_lemmatize [tokenize, drop stopwords, lemmatize]
    """
    stages = (
        preprocess_text_lowercase_url,
        _keep_alpha_only,
        preprocess_and_lemmatize,
    )
    # Feed the output of each stage into the next one
    for stage in stages:
        text = stage(text)
    return text


**Applying Preprocessing**

In [11]:
# Clean both raw text columns; results land in '<column>_clean'
for raw_col, clean_col in (('title', 'title_clean'), ('text', 'text_clean')):
    df[clean_col] = df[raw_col].apply(apply_preprocessing)

### **3.5 Creating `combined_text` column**

In [16]:
# Join cleaned title and cleaned body with a single space. The final strip()
# removes the dangling space left when either part is an empty string.
df["combined_text"] = (df["title_clean"] + " " + df["text_clean"]).str.strip()

In [17]:
# Preview the raw columns next to their cleaned counterparts
df.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean,combined_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE,ben stein call th circuit court committed coup...,st century wire say ben stein reputable profes...,ben stein call th circuit court committed coup...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE,trump drop steve bannon national security council,washington reuters u president donald trump re...,trump drop steve bannon national security coun...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE,puerto rico expects u lift jones act shipping ...,reuters puerto rico governor ricardo rossello ...,puerto rico expects u lift jones act shipping ...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE,oops trump accidentally confirmed leaked israe...,monday donald trump embarrassed country accide...,oops trump accidentally confirmed leaked israe...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE,donald trump head scotland reopen golf resort,glasgow scotland reuters u presidential candid...,donald trump head scotland reopen golf resort ...


**Checking length of combined texts**

In [20]:
# Length diagnostics for the column that will feed the models
COL = "combined_text"
combined = df[COL]

# Characters per document, and a whitespace-split word count as a token proxy
df["_char_len"] = combined.str.len()
df["_tok_len_ws"] = combined.str.split().apply(len)

# Summary statistics, including the upper percentiles
for caption, length_col in (("Char length stats:", "_char_len"),
                            ("Token length stats:", "_tok_len_ws")):
    print(caption, df[length_col].describe(percentiles=[.5,.9,.95,.99]).to_dict())

# Flags for extremes (adjust thresholds to your data)
too_short = df["_tok_len_ws"] < 3           # fewer than 3 tokens: likely junk
too_long_char = df["_char_len"] > 8000      # more than 8000 chars: abnormally long articles
extreme_counts = {"too_short": int(too_short.sum()), "too_long_char": int(too_long_char.sum())}
print(extreme_counts)

Char length stats: {'count': 44898.0, 'mean': 1752.6512762261125, 'std': 1504.6273547302997, 'min': 0.0, '50%': 1542.0, '90%': 3202.0, '95%': 3917.300000000003, '99%': 6340.029999999999, 'max': 37972.0}
Token length stats: {'count': 44898.0, 'mean': 242.22885206467993, 'std': 204.33721510097453, 'min': 0.0, '50%': 215.0, '90%': 440.0, '95%': 538.0, '99%': 863.0, 'max': 4968.0}
{'too_short': 9, 'too_long_char': 266}


**Insights**

Since some combined texts are very long, anyone training the BERT and LSTM models should set a maximum sequence length so that longer inputs are truncated.

## **Chapter 4. Modelling & Evaluation**

In this section we will implement 3 different models on our `text_clean` and `combined_text` columns.

The models are:
- `Logistic Regression`
- `LSTM`
- `BERT`

We also evaluate each model in the same section where it is trained, because some of the models have to be trained in external environments with GPUs.

### **4.1 Stratified Train Test Split**

We create a function that performs two stratified splits on our data, ensuring that the validation split and the test split each contain 10% of the rows.

A stratified split ensures we maintain the ratio of the `FAKE`/`TRUE` classes throughout our splits.

We export the splits into csv files for GPU training

In [None]:
def train_val_test_split_stratified(df, label_col="label", test_size=0.1, val_size=0.1, seed=42):
    """Split ``df`` into stratified train / validation / test frames.

    Two stratified shuffle splits are chained: the first carves the test set
    out of the full data, the second carves the validation set out of the
    remainder. ``val_size`` is given as a fraction of the FULL data, so it is
    rescaled by ``1 - test_size`` for the second split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain ``label_col``.
    label_col : str
        Column whose class ratio is preserved in every split.
    test_size, val_size : float
        Fractions of the full data for the test and validation sets.
    seed : int
        Random state used for both splits.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with a reset index.
    """
    def _carve(frame, holdout_fraction):
        # One stratified split: returns (kept rows, held-out rows)
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=holdout_fraction, random_state=seed
        )
        kept_idx, held_idx = next(splitter.split(frame, frame[label_col].values))
        kept = frame.iloc[kept_idx].reset_index(drop=True)
        held = frame.iloc[held_idx].reset_index(drop=True)
        return kept, held

    # First split: train+val vs test
    df_trainval, df_test = _carve(df, test_size)

    # Second split: train vs val
    df_train, df_val = _carve(df_trainval, val_size/(1-test_size))

    return df_train, df_val, df_test

# Produce the train / validation / test split (10% each for val and test)
df_train, df_val, df_test = train_val_test_split_stratified(
    df, label_col="label", test_size=0.10, val_size=0.10, seed=42
)

# Report how many rows landed in each split
print({"train": len(df_train), "val": len(df_val), "test": len(df_test)})

# Persist the splits as CSV files
from pathlib import Path
split_dir = Path("../data")        # a Path, so the `/` operator can build file names
split_dir.mkdir(parents=True, exist_ok=True)  # make sure folder exists

for csv_name, split_frame in (("train.csv", df_train),
                              ("val.csv", df_val),
                              ("test.csv", df_test)):
    split_frame.to_csv(split_dir / csv_name, index=False)
print("Files saved succesfully under data/")

{'train': 35918, 'val': 4490, 'test': 4490}


We export `train.csv`,  `test.csv` & `val.csv` to kaggle as a dataset

### **Kaggle Configuration**

We define paths that work both in the local repository and in the Kaggle directories. The notebook picks one set or the other depending on where it is being run.

In [None]:
# --- Portable paths: Kaggle vs Local ---
from pathlib import Path
import os

# The presence of /kaggle/working is used as the marker for a Kaggle session
IS_KAGGLE = Path("/kaggle/working").exists()

if IS_KAGGLE:
    WORK_DIR    = Path("/kaggle/working")
    DATA_OUT    = WORK_DIR / "data"
    MODELS_OUT  = WORK_DIR / "models"
    RESULTS_OUT = WORK_DIR / "results"
    DATA_DIR    = Path("/kaggle/input/fake-news-split")
else:
    WORK_DIR    = Path(".").resolve()
    DATA_OUT    = Path("../data")
    MODELS_OUT  = Path("models")
    RESULTS_OUT = Path("../results")
    DATA_DIR    = Path("../data")

# Create the output folders up front so later cells can write into them
for out_dir in (DATA_OUT, MODELS_OUT, RESULTS_OUT):
    out_dir.mkdir(parents=True, exist_ok=True)

def save_artifact(df, filename: str):
    """
    Write a pandas DataFrame as CSV (without the index) to every applicable
    location and return the list of paths written, as strings.

    Locations, in order:
    - the Kaggle working directory (WORK_DIR), when running on Kaggle
    - ../data, when that folder exists
    - the current working directory, only if neither of the above applied
    """
    # Collect (log tag, destination) pairs first, then write them in one loop
    destinations = []
    if IS_KAGGLE:
        destinations.append(("[Kaggle]", WORK_DIR / filename))

    local_dir = Path("../data")
    if local_dir.exists():
        # the trailing space keeps the "Saved:" column aligned with the other tags
        destinations.append(("[Local] ", local_dir / filename))

    if not destinations:
        destinations.append(("[Fallback]", Path("./") / filename))

    written = []
    for tag, out_path in destinations:
        df.to_csv(out_path, index=False)
        print(f"{tag} Saved: {out_path}")
        written.append(str(out_path))
    return written


In [None]:
# Free-text columns that may legitimately be empty after cleaning
_TEXT_COLS = ("title", "text", "title_clean", "text_clean", "combined_text")


def _read_split(path):
    """Read one split CSV and restore empty text cells to empty strings.

    pandas parses empty CSV fields as NaN by default. Documents that were
    cleaned down to "" before export would therefore come back as NaN floats,
    which TfidfVectorizer rejects and which ``.astype(str)`` turns into the
    literal token "nan". Only the text columns present in the file are filled.
    """
    split = pd.read_csv(path)
    present = [name for name in _TEXT_COLS if name in split.columns]
    if present:
        split[present] = split[present].fillna("")
    return split


df_train = _read_split(DATA_DIR / "train.csv")
df_val   = _read_split(DATA_DIR / "val.csv")
df_test  = _read_split(DATA_DIR / "test.csv")

### **4.2 Logistic Regression on Text Only**

Before moving to advanced models, we first establish a simple baseline using TF-IDF vectorization combined with Logistic Regression. This provides a benchmark to compare more complex approaches against.

#### **Step 1. Vectorize text with TF-IDF**

In [None]:

# TF-IDF features for the text-only model
vectorizer = TfidfVectorizer(
    max_features=10000,     # top 10k features
    ngram_range=(1,2),      # unigrams + bigrams
    stop_words="english"    # remove stopwords
)

# Encode labels: FAKE = 1, TRUE = 0
label_map = {"FAKE": 1, "TRUE": 0}
for d in (df_train, df_val, df_test):
    d["label_num"] = d['label'].map(label_map)

# Documents that were empty after cleaning are read back from CSV as NaN,
# and TfidfVectorizer raises on NaN input, so restore them to "" first.
# The vocabulary is fitted on the training split only.
X_train = vectorizer.fit_transform(df_train["text_clean"].fillna(""))
X_val   = vectorizer.transform(df_val["text_clean"].fillna(""))
X_test  = vectorizer.transform(df_test["text_clean"].fillna(""))

y_train = df_train["label_num"]
y_val   = df_val["label_num"]
y_test  = df_test["label_num"]

#### **Step 2. Train logistic regression**

In [None]:
# Text-only baseline classifier: liblinear solver, capped at 500 iterations
log_reg = LogisticRegression(solver="liblinear", max_iter=500)
log_reg.fit(X_train, y_train)

#### **Step 3. Evaluate on Validation Set**

In [None]:
# Score the text-only model on the validation split
y_val_pred = log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("\nClassification Report (Validation):\n", val_report)

#### **Step 4. Evaluate on Test Set**

In [None]:
# Score the text-only model on the held-out test split
y_test_pred = log_reg.predict(X_test)
y_pred_lr_clean = y_test_pred  # alias for comparison table
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report (Test):\n", test_report)

### **4.3 Logistic Regression on Combined Text**

#### **Step 1. Train/Validation/Test Setup**

We use stratified splits to ensure balanced representation of FAKE and TRUE labels across train, validation, and test sets.  
- Labels are encoded as FAKE = 1, TRUE = 0.  
- Text features are taken from the `combined_text` column (or `text` if not available).  
- The resulting splits are: Train, Validation, and Test.

In [None]:
# Define features and labels using prepared splits: df_train, df_val, df_test

# Prefer the combined title+body column; fall back to the raw text if it is absent
TEXT_COL = "combined_text" if "combined_text" in df_train.columns else "text"
LABEL_COL = "label"

# Encode labels: FAKE = 1, TRUE = 0
label_map = {"FAKE": 1, "TRUE": 0}
for d in (df_train, df_val, df_test):
    d["label_num"] = d[LABEL_COL].map(label_map)

# Split into features (X) and targets (y).
# Empty documents are read back from CSV as NaN, and the TF-IDF vectorizers
# used downstream raise on NaN input, so restore them to "" here.
X_train, y_train = df_train[TEXT_COL].fillna(""), df_train["label_num"]
X_val,   y_val   = df_val[TEXT_COL].fillna(""),   df_val["label_num"]
X_test,  y_test  = df_test[TEXT_COL].fillna(""),  df_test["label_num"]

print("Splits:", df_train.shape, df_val.shape, df_test.shape, "| text column:", TEXT_COL)


#### **Step 2. Baseline Model: TF-IDF + Logistic Regression**

We fit a simple baseline using TF-IDF features (unigrams, max 5k terms) and Logistic Regression.  
This establishes a reference for accuracy, F1, and ROC-AUC before tuning.

In [None]:
# TF-IDF features: unigrams only, at most 5000 terms, English stopwords removed.
# The vocabulary is learned from the training split and reused for the test split.
tfidf_base = TfidfVectorizer(stop_words="english", max_features=5000)
Xtr_base = tfidf_base.fit_transform(X_train)
Xte_base = tfidf_base.transform(X_test)

# Baseline classifier
lr_base = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
lr_base.fit(Xtr_base, y_train)

# Hard predictions plus the probability of class 1 (FAKE) for ROC analysis
y_pred_base = lr_base.predict(Xte_base)
y_pred_lr_combined = y_pred_base  # alias for comparison table
y_proba_base = lr_base.predict_proba(Xte_base)[:, 1]

#### **Step 3. Baseline Evaluation**

In [None]:
# Headline metrics for the baseline on the test split
baseline_accuracy = accuracy_score(y_test, y_pred_base)
baseline_report = classification_report(y_test, y_pred_base, target_names=["TRUE","FAKE"])
print("Baseline Accuracy:", round(baseline_accuracy, 4))
print("\nClassification Report (Baseline):\n", baseline_report)

# Confusion matrix (class 0 = TRUE, class 1 = FAKE)
cm_base = confusion_matrix(y_test, y_pred_base)
cm_base_display = ConfusionMatrixDisplay(cm_base, display_labels=["TRUE","FAKE"])
cm_base_display.plot(values_format="d")
plt.title("Baseline — Confusion Matrix")
plt.show()

# ROC curve with the chance diagonal for reference
auc_base = roc_auc_score(y_test, y_proba_base)
fpr_b, tpr_b, _ = roc_curve(y_test, y_proba_base)
plt.plot(fpr_b, tpr_b, label=f"Baseline ROC-AUC = {auc_base:.4f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Baseline — ROC Curve")
plt.legend()
plt.show()


**Model Performance Summary**

Accuracy: 0.9846 (~98.5%)

Precision: TRUE = 0.98, FAKE = 0.99

Recall: TRUE = 0.99, FAKE = 0.98

F1-score: Both classes ~0.99

ROC-AUC: 0.9988 (excellent separation between classes)


**Confusion Matrix Insights**

TRUE articles: 2119 correctly predicted, 23 misclassified as FAKE.

FAKE articles: 2302 correctly predicted, 46 misclassified as TRUE.

The model makes very few mistakes compared to the large sample size.

* **Interpretation:**

Errors are balanced between both classes - the model is not biased towards TRUE or FAKE.

This means misclassifications happen in fewer than 70 of the ~4490 test samples (<1.6%).


**ROC Curve**

ROC-AUC = 0.9988 (~99.9%) - that means the model can almost perfectly distinguish between fake and true news.

The curve hugs the top-left corner which is an indication of very high sensitivity and specificity.


**Key Takeaways**

- The baseline Logistic Regression model performs exceptionally well, with nearly perfect accuracy and ROC-AUC.

- Both classes (TRUE, FAKE) are balanced in performance, so the model is not biased toward one class.

- The few misclassifications (23 TRUE → FAKE, 46 FAKE → TRUE) are very small relative to the dataset size.

This strong baseline suggests Logistic Regression with TF-IDF is already a very competitive model for fake news detection.

Hyperparameter tuning may improve results slightly, but even the untuned model is strong enough for deployment.


**Save Baseline Model — For Reuse/Deployment**  

After evaluating the baseline Logistic Regression, we save the model and vectorizer as artifacts.  
These files can be reloaded later for deployment (e.g., Streamlit app, API, or reporting).  
This ensures we preserve a strong baseline, even if tuning results differ.

In [None]:
import os, joblib

# Make sure the output folder is there before writing into it
os.makedirs("models", exist_ok=True)

# Persist the baseline classifier together with the vectorizer it was trained on
for artifact, target in ((lr_base, "models/logreg_model.joblib"),
                         (tfidf_base, "models/tfidf_vectorizer.joblib")):
    joblib.dump(artifact, target)
print("Saved: models/logreg_model.joblib, models/tfidf_vectorizer.joblib")


#### **Step 4. Hyperparameter Tuning with GridSearchCV**

Now that we have a strong baseline, we perform hyperparameter tuning to confirm robustness and test whether performance can be further optimized.

We tune both TF-IDF and Logistic Regression:

- `tfidf__ngram_range`: unigrams vs bigrams  
- `tfidf__max_df` and `tfidf__min_df`: filter overly common/rare words  
- `clf__C`: regularization strength  
- `clf__solver` and `clf__penalty`: logistic regression optimization  

Evaluation metric: **F1-macro** (balances FAKE and TRUE equally).

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# TF-IDF and classifier in one pipeline, so the vectorizer is re-fitted
# inside every cross-validation fold
pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(max_iter=2000, random_state=42)),
])

# Search space; keys follow the "<step>__<parameter>" convention
param_grid = dict(
    tfidf__max_df=[0.5, 0.7, 0.9],
    tfidf__min_df=[2, 5],
    tfidf__ngram_range=[(1,1), (1,2)],
    clf__C=[0.1, 1, 3, 10],
    clf__solver=["liblinear"],
    clf__penalty=["l2"],
)

# 5-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by macro-F1, using all available cores
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

print("Best CV f1_macro:", round(gs.best_score_, 4))
print("Best params:", gs.best_params_)

#### **Step 5. Tuned Model Evaluation**

We evaluate the best configuration on the held-out test set and compare to baseline

In [None]:
# Score the best pipeline from the grid search on the held-out test split
best = gs.best_estimator_

pred_tuned = best.predict(X_test)
proba_tuned = best.predict_proba(X_test)[:,1]   # probability of class 1 (FAKE)

from sklearn.metrics import f1_score
tuned_accuracy = accuracy_score(y_test, pred_tuned)
tuned_f1_macro = f1_score(y_test, pred_tuned, average="macro")
tuned_report = classification_report(y_test, pred_tuned, target_names=["TRUE","FAKE"])
print("Tuned Accuracy:", round(tuned_accuracy, 4))
print("Tuned F1 (macro):", round(tuned_f1_macro, 4))
print("\nClassification Report (Tuned):\n", tuned_report)

# Confusion matrix (class 0 = TRUE, class 1 = FAKE)
cm_tuned = confusion_matrix(y_test, pred_tuned)
cm_tuned_display = ConfusionMatrixDisplay(cm_tuned, display_labels=["TRUE","FAKE"])
cm_tuned_display.plot(values_format="d")
plt.title("Tuned — Confusion Matrix")
plt.show()

# ROC curve with the chance diagonal for reference
auc_tuned = roc_auc_score(y_test, proba_tuned)
fpr_t, tpr_t, _ = roc_curve(y_test, proba_tuned)
plt.plot(fpr_t, tpr_t, label=f"Tuned ROC-AUC = {auc_tuned:.4f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Tuned — ROC Curve")
plt.legend()
plt.show()


**F. Baseline vs Tuned Model Comparison**

We compare baseline Logistic Regression with the tuned GridSearchCV model to highlight improvements.

| Model                      | Accuracy | F1 (Macro) | ROC-AUC |
|-----------------------------|----------|------------|---------|
| Logistic Regression (Base)  | 0.9846   | ~0.98    | ~0.9988  |
| Logistic Regression (Tuned) | 0.9922   | ~0.99  | ~0.9997 |

**Interpretation:**
- The baseline model was already very strong, with high precision, recall, and ROC-AUC.
- After tuning, the **tuned model** achieved slightly higher Accuracy (+0.0076), macro-F1 (roughly +0.01, from ~0.98 to ~0.99), and ROC-AUC (+0.0009).  
- Although the improvement is modest, tuning confirms that the model is **robust and reliable across multiple folds**, making it safer for deployment.  
- Both models perform well, but the tuned version is recommended for production since it has been validated systematically.

**Step 6. Save Tuned Model Artifacts - For Deployment**

In [None]:
import os, joblib

os.makedirs("models", exist_ok=True)

# Persist the full tuned pipeline, plus its fitted TF-IDF step on its own
tuned_artifacts = (
    (best, "models/logreg_tuned_model.joblib"),
    (best.named_steps["tfidf"], "models/tfidf_tuned_vectorizer.joblib"),
)
for artifact, target in tuned_artifacts:
    joblib.dump(artifact, target)

print("Saved: models/logreg_tuned_model.joblib")
print("Saved: models/tfidf_tuned_vectorizer.joblib")

### **4.4 LSTM on Text Only**

This step was exported to kaggle to utilise its GPU for training

#### **Step 1. Config**

In [None]:
# Reproducibility: seed NumPy and TensorFlow with the same value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Input/output
INPUT_COL = "text_clean"   # text-only model; "combined_text" is the alternative column
LABEL_COL = "label"

# Tokenizer / sequence
NUM_WORDS = 50000          # vocabulary cap passed to the Keras Tokenizer
MAX_LEN   = 256            # sequences are padded / truncated to this many tokens
OOV_TOKEN = "<UNK>"        # placeholder for out-of-vocabulary words

# Model settings
EMBED_DIM  = 128
LSTM_UNITS = 128
DROPOUT    = 0.3
LR         = 1e-3
BATCH_SIZE = 64
EPOCHS     = 8

#### **Step 2. Normalize Labels**

In [None]:
# Normalize labels and input text on every split.
# The loop variable is deliberately not named `df`: notebook cells share one
# namespace, so `for df in ...` would rebind the full dataset `df` to the
# test split as a side effect of running this cell.
for split_df in (df_train, df_val, df_test):
    split_df[LABEL_COL] = split_df[LABEL_COL].astype(str).str.upper().str.strip()
    # fillna first: astype(str) alone turns a missing document (NaN after the
    # CSV round trip) into the literal token "nan"
    split_df[INPUT_COL] = split_df[INPUT_COL].fillna("").astype(str).str.strip()

# Convert labels to 0/1
def labels_to_int(series):
    """Encode labels as integers: "TRUE" -> 1, anything else -> 0.

    NOTE(review): this polarity (TRUE = 1) is the opposite of the FAKE = 1
    encoding used by the Logistic Regression and combined-text LSTM sections.
    Keep that in mind when comparing precision/recall across models.
    """
    return (series == "TRUE").astype(int).values

y_tr = labels_to_int(df_train[LABEL_COL])
y_va = labels_to_int(df_val[LABEL_COL])
y_te = labels_to_int(df_test[LABEL_COL])

#### **Step 3. Tokenize and Pad**

In [None]:
# Fit the vocabulary on the training split only
tok = Tokenizer(num_words=NUM_WORDS, lower=True, oov_token=OOV_TOKEN)
tok.fit_on_texts(df_train[INPUT_COL])

def to_padded(texts, tokenizer, max_len=MAX_LEN):
    """Turn texts into integer sequences of length ``max_len``.

    Sequences are padded and truncated at the end ("post").
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

X_tr, X_va, X_te = (
    to_padded(split[INPUT_COL], tok) for split in (df_train, df_val, df_test)
)

#### **Step 4. Build Model**

In [None]:
def build_model(vocab_size, max_len):
    """Build and compile the text-only bidirectional LSTM classifier.

    Architecture: Embedding (mask_zero=True) -> Bidirectional LSTM returning
    the full sequence -> global max pooling -> Dropout -> Dense(128, relu)
    -> Dropout -> Dense(1, sigmoid). Layer sizes, dropout rate and learning
    rate come from the notebook-level constants EMBED_DIM, LSTM_UNITS,
    DROPOUT and LR.

    The model is compiled with Adam, binary cross-entropy and the metrics
    AUC (named "auc") and accuracy.
    """
    token_ids = layers.Input(shape=(max_len,), dtype="int32")

    embedded = layers.Embedding(
        input_dim=vocab_size, output_dim=EMBED_DIM, mask_zero=True
    )(token_ids)

    sequence_states = layers.Bidirectional(
        layers.LSTM(LSTM_UNITS, return_sequences=True)
    )(embedded)
    pooled = layers.GlobalMaxPooling1D()(sequence_states)

    hidden = layers.Dropout(DROPOUT)(pooled)
    hidden = layers.Dense(128, activation="relu")(hidden)
    hidden = layers.Dropout(DROPOUT)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    model = models.Model(token_ids, probability)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc"), "accuracy"],
    )
    return model

# Embedding rows needed: the tokenizer cap, or the real vocabulary (+1) if smaller
n_known_tokens = len(tok.word_index) + 1
vocab_size = min(NUM_WORDS, n_known_tokens)
model = build_model(vocab_size, MAX_LEN)
model.summary()

#### **Step 5. Train the Model**

In [None]:
# Stop when validation AUC has not improved for 2 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=2,
    restore_best_weights=True,
)
callbacks = [early_stopping]

history = model.fit(
    X_tr,
    y_tr,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_va, y_va),
    callbacks=callbacks,
    verbose=1,
)

#### **Step 6. Saving the model and Tokenizers**

In [None]:
# Persist the trained network
model.save(WORK_DIR / "best_lstm.keras")

# Persist the fitted tokenizer next to it as JSON
import json
tokenizer_path = WORK_DIR / "tokenizer.json"
tokenizer_path.write_text(tok.to_json(), encoding="utf-8")

#### **Step 7. Metrics**

In [None]:
# Predicted probabilities on the test split, thresholded at 0.5
y_prob = model.predict(X_te, batch_size=2*BATCH_SIZE).ravel()
y_pred = (y_prob >= 0.5).astype(int)
y_pred_lstm_clean = y_pred  # alias for comparison table (ground truth is y_te here)

acc = accuracy_score(y_te, y_pred)
# average="binary" reports precision / recall / F1 for the positive class (label 1)
prec, rec, f1, _ = precision_recall_fscore_support(y_te, y_pred, average="binary")
roc = roc_auc_score(y_te, y_prob)

# Plain floats so the dict can be serialized with json
metrics = {"accuracy": float(acc), "precision": float(prec),
           "recall": float(rec), "f1": float(f1), "roc_auc": float(roc)}

# Must be the LAST expression of the cell: Jupyter only displays the value of
# the final statement, so a bare `metrics` followed by an assignment shows nothing.
metrics

#### **Step 8. Saving Metrics**

In [None]:
# Write the metrics dict to disk as indented JSON
metrics_path = WORK_DIR / "metrics.json"
metrics_path.write_text(json.dumps(metrics, indent=2))

### **4.5 LSTM on Combined Text**

**Step 1. Data Preparation**

In [None]:
 # ==================== DATA PREPARATION ====================
# Model inputs: the combined_text column of every split, with missing
# values replaced by empty strings, as plain string arrays
X_train, X_val, X_test = (
    split['combined_text'].fillna('').astype(str).values
    for split in (df_train, df_val, df_test)
)

# Binary targets (FAKE=1, TRUE=0)
y_train, y_val, y_test = (
    (split['label'] == 'FAKE').astype(int).values
    for split in (df_train, df_val, df_test)
)

n_fake = y_train.sum()
print(f"Labels - FAKE: {n_fake}, REAL: {len(y_train) - n_fake}")


#### **Step 2. Tokenization**

In [None]:
# ==================== TOKENIZATION  ====================
# Sequence settings
NUM_WORDS = 50000    # Tokenizer vocabulary size
MAX_LEN = 256        # Maximum sequence length
OOV_TOKEN = "<UNK>"  # Out-of-vocabulary token

# Model / training settings
EMBED_DIM = 128      # Embedding dimension
LSTM_UNITS = 128     # LSTM units
DROPOUT = 0.3        # Dropout rate
LR = 1e-3            # Learning rate
BATCH_SIZE = 64      # Batch size
EPOCHS = 8           # Number of epochs

# Fit the vocabulary on the training texts only
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# Pad / truncate every sequence at the end so all have length MAX_LEN
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)

print(f"\nData shapes :")
print(f"X_train_pad: {X_train_pad.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Max sequence length: {MAX_LEN}")


#### **Step 3. Build LSTM Model**

In [None]:
# ==================== BUILD LSTM MODEL  ====================
from tensorflow.keras.metrics import Precision, Recall
def create_lstm_model(vocab_size, embed_dim, lstm_units, dropout_rate, sequence_length,
                      learning_rate=None):
    """
    Build and compile a stacked bidirectional LSTM binary classifier.

    Layers, in order: Embedding -> SpatialDropout1D -> Bidirectional LSTM
    (``lstm_units``, returning sequences) -> Bidirectional LSTM
    (``lstm_units // 2``) -> Dense(``lstm_units // 2``, relu) -> Dropout ->
    Dense(1, sigmoid).

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embed_dim: ``output_dim`` of the embedding layer.
        lstm_units: units of the first LSTM; the second LSTM and the hidden
            dense layer use half of this value.
        dropout_rate: rate shared by the spatial dropout, both LSTM layers and
            the dropout layer before the output.
        sequence_length: ``input_length`` passed to the embedding layer.
        learning_rate: Adam learning rate. When ``None`` (the default) the
            notebook-level ``LR`` constant is used, matching the previous
            behaviour of this function.

    Returns:
        A compiled ``Sequential`` model with binary cross-entropy loss and
        accuracy / precision / recall metrics (in that order).
    """
    # Local import: Adam is not among the Keras names imported at the top of
    # the notebook, so the function brings it into scope itself.
    from tensorflow.keras.optimizers import Adam

    if learning_rate is None:
        learning_rate = LR

    model = Sequential([
        # Embedding layer with EMBED_DIM
        Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            input_length=sequence_length,
            name='embedding_layer'
        ),

        # Spatial dropout
        SpatialDropout1D(dropout_rate, name='spatial_dropout'),

        # First bidirectional LSTM keeps the full sequence for the next layer
        Bidirectional(
            LSTM(lstm_units, return_sequences=True, dropout=dropout_rate),
            name='bidirectional_lstm_1'
        ),

        # Second bidirectional LSTM uses half the units and returns only its
        # final output
        Bidirectional(
            LSTM(lstm_units // 2, dropout=dropout_rate),
            name='bidirectional_lstm_2'
        ),

        # Dense layer
        Dense(lstm_units // 2, activation='relu', name='dense_1'),
        Dropout(dropout_rate, name='dropout_1'),

        # Output layer: a single sigmoid unit
        Dense(1, activation='sigmoid', name='output_layer')
    ])

    # Metric order matters: the evaluation cell unpacks
    # (loss, accuracy, precision, recall) from model.evaluate().
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', Precision(), Recall()]
    )
    return model

    # Create model 
# Embedding input size: the tokenizer cap or the number of distinct words
# seen plus one, whichever is smaller.
vocab_size = min(NUM_WORDS, len(tokenizer.word_index) + 1)

# Arguments are passed positionally in the order
# (vocab_size, embed_dim, lstm_units, dropout_rate, sequence_length).
lstm_model = create_lstm_model(vocab_size, EMBED_DIM, LSTM_UNITS, DROPOUT, MAX_LEN)

# Echo the configuration used for this run
print(" Model created successfully!")
print(f" Model Parameters:")
print(f"   - Vocabulary size: {vocab_size}")
print(f"   - Embedding dimension: {EMBED_DIM}")
print(f"   - LSTM units: {LSTM_UNITS}")
print(f"   - Dropout rate: {DROPOUT}")
print(f"   - Learning rate: {LR}")
print(f"   - Batch size: {BATCH_SIZE}")
print(f"   - Epochs: {EPOCHS}")

lstm_model.summary()


#### **Step 4. Train Model**

In [None]:
# ==================== TRAIN MODEL ====================
# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

print(f"\n Starting training...")
print(f"   - Batch size: {BATCH_SIZE}")
print(f"   - Epochs: {EPOCHS}")

# `history` is read by the plotting cell further down.
history = lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    verbose=1,
)


#### **Step 5. Evaluate Model**

In [None]:
# ==================== EVALUATE MODEL ====================
# evaluate() returns the loss followed by the compiled metrics in the order
# they were passed to compile(): accuracy, precision, recall.
test_loss, test_accuracy, test_precision, test_recall = lstm_model.evaluate(
    X_test_pad, y_test, verbose=0
)

# F1 is the harmonic mean of precision and recall. When both are 0 (the model
# never predicts the positive class correctly) the ratio is undefined, so
# report 0.0 instead of raising ZeroDivisionError.
_precision_recall_sum = test_precision + test_recall
if _precision_recall_sum > 0:
    test_f1 = 2 * (test_precision * test_recall) / _precision_recall_sum
else:
    test_f1 = 0.0

print("\n" + "="*60)
print(" LSTM MODEL RESULTS ")
print("="*60)
print(f" Accuracy:  {test_accuracy:.4f}")
print(f" Precision: {test_precision:.4f}")
print(f" Recall:    {test_recall:.4f}")
print(f"  F1-Score:  {test_f1:.4f}")

#### **Step 6. Make Predictions**

In [None]:
# ==================== MAKE PREDICTIONS ====================
# Sigmoid outputs for the test set, flattened to 1-D and thresholded at 0.5
# to obtain hard 0/1 predictions.
y_pred_proba = lstm_model.predict(X_test_pad, verbose=0)
y_pred = (y_pred_proba.flatten() > 0.5).astype(int)

# Per-class precision / recall / F1 (class 0 -> 'REAL', class 1 -> 'FAKE')
print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=['REAL', 'FAKE']))

# Confusion matrix; `cm` is reused by the plotting and saving cells
cm = confusion_matrix(y_test, y_pred)
print(" Confusion Matrix:")
print(cm)

# Alias kept for the model comparison table (its ground truth is y_test)
y_pred_lstm_combined = y_pred

#### **Step 7. Visualise Results**

In [None]:
# ==================== VISUALIZE RESULTS ====================
# Training curves: accuracy on the left panel, loss on the right panel.
fig_curves, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

# Accuracy panel
ax_acc.plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
ax_acc.set_title('Model Accuracy ', fontsize=14, fontweight='bold')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()
ax_acc.grid(True, alpha=0.3)

# Loss panel
ax_loss.plot(history.history['loss'], label='Training Loss', linewidth=2)
ax_loss.plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
ax_loss.set_title('Model Loss ', fontsize=14, fontweight='bold')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
ax_loss.grid(True, alpha=0.3)

fig_curves.tight_layout()
fig_curves.savefig('../results/lstm_training_leader_params.png', dpi=300, bbox_inches='tight')
plt.show()

# Confusion matrix heatmap (rows = actual label, columns = predicted label)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['REAL', 'FAKE'],
    yticklabels=['REAL', 'FAKE'],
    annot_kws={"size": 14},
    ax=ax_cm,
)
ax_cm.set_title('LSTM Confusion Matrix ', fontsize=16, fontweight='bold')
ax_cm.set_ylabel('Actual Label', fontsize=12)
ax_cm.set_xlabel('Predicted Label', fontsize=12)
fig_cm.savefig('../results/lstm_confusion_leader_params.png', dpi=300, bbox_inches='tight')
plt.show()

#### **Step 8. Save Model and Results**

In [None]:
# ==================== SAVE MODEL AND RESULTS ====================
import json
import os
import pickle

# The saves below fail if their target folders are missing (e.g. on a fresh
# checkout), so create them up front. exist_ok makes this a no-op otherwise.
os.makedirs('../models', exist_ok=True)
os.makedirs('../results', exist_ok=True)

# Save the model
model_path = '../models/lstm_fake_news_model.h5'
lstm_model.save(model_path)
print(f"\n Model saved as '{model_path}'")

# Save the Keras tokenizer so new text can be turned into the same word ids.
# NOTE(review): pickle files must only be loaded from trusted sources.
with open('../models/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(" Tokenizer saved as '../models/tokenizer.pkl'")

# Hyperparameters, test metrics and confusion matrix of this run. Values are
# converted to plain floats / nested lists so json can serialise them.
results = {
    'parameters_used': {
        'NUM_WORDS': NUM_WORDS,
        'MAX_LEN': MAX_LEN,
        'EMBED_DIM': EMBED_DIM,
        'LSTM_UNITS': LSTM_UNITS,
        'DROPOUT': DROPOUT,
        'LR': LR,
        'BATCH_SIZE': BATCH_SIZE,
        'EPOCHS': EPOCHS
    },
    'performance': {
        'accuracy': float(test_accuracy),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1),
        'test_loss': float(test_loss)
    },
    'confusion_matrix': cm.tolist()
}

with open('../results/lstm_results.json', 'w') as f:
    json.dump(results, f, indent=4)
print(" Results saved as '../results/lstm_results.json'")

print(f"\n LSTM training completed successfully!")
print(f" Final Accuracy: {test_accuracy:.4f}")


### **4.6 BERT Model on Text Only**

#### **Step 1. Defining Parameters**

In [None]:

# Settings for the text-only BERT run (section 4.6).
PARAMS = dict(
    model_name="bert-base-uncased",
    max_len=128,          # shorter length to save on memory
    batch_size=8,
    learning_rate=2e-5,
    epochs=3,
    train_test_split=0.2,
)

#### **Step 2. Encoding Labels**

In [None]:
# Encode the string labels as integer class ids (stored in `label_id`).
# The encoder is fitted on the training split only and then reused for the
# validation and test splits so that all three share the same mapping.
lbl_enc = LabelEncoder()
df_train["label_id"] = lbl_enc.fit_transform(df_train["label"])
for _split in (df_val, df_test):
    _split["label_id"] = lbl_enc.transform(_split["label"])

# Lookup tables between class id and class name, passed to the model config
id2label = dict(enumerate(lbl_enc.classes_))
label2id = {name: idx for idx, name in id2label.items()}

#### **Step 3. Building Hugging Face Datasets**

In [None]:
# Build one Hugging Face Dataset per split from the cleaned text column and
# the integer label column.
ds_train, ds_val, ds_test = (
    Dataset.from_pandas(frame[["text_clean", "label_id"]])
    for frame in (df_train, df_val, df_test)
)

#### **Step 4. Tokenization**

In [None]:
# Tokenizer matching the checkpoint named in PARAMS.
# Note: this rebinds `tokenizer`, which previously held the Keras tokenizer
# used by the LSTM section.
tokenizer = AutoTokenizer.from_pretrained(PARAMS["model_name"], use_fast=True)


def tokenize_fn(batch):
    """Tokenize the `text_clean` entries of a batch.

    None entries become empty strings and everything else is coerced to str.
    Sequences are truncated to PARAMS["max_len"] and left unpadded
    (padding=False).
    """
    cleaned = ["" if entry is None else str(entry) for entry in batch["text_clean"]]
    return tokenizer(
        cleaned,
        max_length=PARAMS["max_len"],
        truncation=True,
        padding=False,
    )

In [None]:
# Run tokenize_fn over every split in batched mode.
ds_train, ds_val, ds_test = (
    split.map(tokenize_fn, batched=True) for split in (ds_train, ds_val, ds_test)
)

In [None]:
# Sanity check: list the columns of the training split after tokenization.
print(ds_train.column_names)

#### **Step 5. Pytorch Format**

In [None]:
import torch

# 1) Rename the label column to "labels"
ds_train, ds_val, ds_test = (
    split.rename_column("label_id", "labels") for split in (ds_train, ds_val, ds_test)
)


# 2) Drop everything that is not a model input (e.g. the raw text string)
def keep_cols(ds):
    """Return `ds` restricted to the model-input columns.

    Kept when present: input_ids, attention_mask, labels, token_type_ids.
    Every other column is removed.
    """
    wanted = {"input_ids", "attention_mask", "labels", "token_type_ids"}
    unwanted = [name for name in ds.column_names if name not in wanted]
    return ds.remove_columns(unwanted)


ds_train, ds_val, ds_test = (keep_cols(split) for split in (ds_train, ds_val, ds_test))

# 3) Return torch tensors for exactly the remaining columns
cols = ds_train.column_names  # after pruning
for _split in (ds_train, ds_val, ds_test):
    _split.set_format(type="torch", columns=cols)

# 4) Sanity checks
print("Train columns:", ds_train.column_names)
_cuda_ok = torch.cuda.is_available()
print("GPU available:", _cuda_ok)
if _cuda_ok:
    print("CUDA device:", torch.cuda.get_device_name(0))


In [None]:
# Switch all three splits to the torch output format.
# NOTE(review): the previous cell already calls set_format(type="torch", ...)
# on the same objects, so this looks redundant - confirm before removing.
for _split in (ds_train, ds_val, ds_test):
    _split.set_format("torch")

#### **Step 6. Defining Data Collator**

In [None]:
# Data collator built around the BERT tokenizer. tokenize_fn leaves sequences
# unpadded (padding=False), so padding is deferred to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### **Step 7. Loading the model**

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head.
# There is one output per class known to the label encoder, and both
# id <-> name mappings are passed along to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    PARAMS["model_name"],
    id2label=id2label,
    label2id=label2id,
    num_labels=len(lbl_enc.classes_),
)

#### **Step 8. Defining Evaluation Function**

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall.

    Args:
        eval_pred: a ``(logits, labels)`` pair. ``logits`` may itself be a
            tuple/list whose first element holds the logits, and either member
            may be a torch tensor or array-like.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three are weighted averages computed with ``zero_division=0``.
    """
    raw_logits, raw_labels = eval_pred

    # Unwrap a (logits, ...) container
    if isinstance(raw_logits, (tuple, list)):
        raw_logits = raw_logits[0]

    def _as_numpy(value):
        # Tensors expose .detach(); anything else is passed through untouched
        return value.detach().cpu().numpy() if hasattr(value, "detach") else value

    # Class id = position of the largest logit; both vectors are flattened to 1-D
    y_hat = np.asarray(np.argmax(_as_numpy(raw_logits), axis=-1)).ravel()
    y_ref = np.asarray(_as_numpy(raw_labels)).ravel()

    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy":  accuracy_score(y_ref, y_hat),
        "f1":        f1_score(y_ref, y_hat, **weighted),
        "precision": precision_score(y_ref, y_hat, **weighted),
        "recall":    recall_score(y_ref, y_hat, **weighted),
    }

#### **Step 9. Training Arguments**

In [None]:
training_args = TrainingArguments(
    # --- run identity / outputs
    output_dir="./bert-fake-news",
    run_name="bert_fake_news_base",
    seed=42,
    report_to="none",

    # --- optimisation (values come from PARAMS)
    learning_rate=PARAMS["learning_rate"],
    num_train_epochs=PARAMS["epochs"],
    per_device_train_batch_size=PARAMS["batch_size"],
    per_device_eval_batch_size=PARAMS["batch_size"],
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),    # mixed precision only when CUDA is present

    # --- evaluate and checkpoint once per epoch; keep the checkpoint with the best F1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # --- logging
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    disable_tqdm=False,

    # --- data loading
    dataloader_pin_memory=True,
    dataloader_num_workers=0,
)

#### **Step 10. Training the Model**

In [None]:
# Wire the model, data, collator and metric function together. Training uses
# ds_train; ds_val is the evaluation set used during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=ds_train,
    eval_dataset=ds_val,
)

In [None]:
# Fine-tune the model with the Trainer configured above.

trainer.train()

In [None]:
# Metrics on the held-out test split.
# Note: this rebinds `results`, which previously held the LSTM results dict.
results = trainer.evaluate(ds_test)
print(results)

# Raw predictions on the same split, kept for the model comparison table
pred_clean = trainer.predict(ds_test)

y_true_bert_clean = pred_clean.label_ids
y_pred_bert_clean = np.argmax(pred_clean.predictions, axis=1)


### **4.7 BERT Model on Combined Text Only**

In this section, we are applying **BERT (Bidirectional Encoder Representations from Transformers)** to predict whether a given news article is *fake* or *true*.  

#### Why BERT?
- BERT is a **transformer-based model** that has been pre-trained on massive amounts of text.  
- It understands **context in both directions** (left and right of a word), which makes it powerful for language understanding tasks.  
- For text classification tasks like **fake news detection**, BERT has shown state-of-the-art performance compared to traditional machine learning methods.  

#### Workflow
We will follow these steps in our modeling pipeline:

1. **Install dependencies** – Set up HuggingFace Transformers, PyTorch, and Scikit-learn.  
2. **Load data** – Import the prepared train, validation, and test CSVs.  
3. **Explore data** – Check dataset shape, column names, and class balance.  
4. **Preprocess & tokenize** – Convert raw text into tokens using the bert-base-uncased tokenizer.  
5. **Create PyTorch dataset loaders** – Wrap the tokenized inputs and labels into PyTorch Dataset and DataLoader objects for training.  
6. **Model setup** – Load BertForSequenceClassification with two labels (fake vs true).  
7. **Training loop** – Train BERT with AdamW optimizer, scheduler, and backpropagation.  
8. **Evaluation** – Measure accuracy, precision, recall, and F1 score on validation and test data.  
9. **Save the model** – Store the trained model and tokenizer for future use.

By the end of this workflow, we will have a fine-tuned BERT model that can classify unseen news articles as either fake or true.


#### **Step 1. Explore the Data**

In [None]:
# Which label values occur in the training split?
print("Unique labels:", df_train['label'].unique())

# Numeric targets for this section: FAKE -> 0, TRUE -> 1.
# Note: the LSTM section encodes FAKE as 1, i.e. the opposite polarity.
label_mapping = {"FAKE": 0, "TRUE": 1}
for _split in (df_train, df_val, df_test):
    _split['label_num'] = _split['label'].map(label_mapping)

# Relative class frequencies per split
print("Training label distribution:\n", df_train['label_num'].value_counts(normalize=True))
print("Validation label distribution:\n", df_val['label_num'].value_counts(normalize=True))
print("Test label distribution:\n", df_test['label_num'].value_counts(normalize=True))

# Quick look at text, original label and numeric label side by side
df_train[['combined_text', 'label', 'label_num']].head()


#### **Step 2. Define Parameters.**

At this stage, after inspecting the dataset, we define all key parameters in one dictionary (PARAMS):

  **model_name**: the pre-trained model to load (bert-base-uncased is common for English).  
  **max_len**: maximum token length for each sequence. This controls how long inputs are padded/truncated during tokenization.  
  **batch_size**: number of samples per batch for training.  
  **learning_rate**: optimizer learning rate (2e-5 is a typical starting point for BERT).  
  **epochs**: how many times we train over the full dataset.  
  **device**: whether to run on GPU (cuda) or CPU.

Defining these here ensures that tokenization, dataloaders, and model training are consistent and reproducible.

In [None]:
# Single source of truth for the combined-text BERT run (section 4.7).
# Note: this replaces the PARAMS dict defined for section 4.6.
PARAMS = dict(
    model_name="bert-base-uncased",   # pre-trained checkpoint
    max_len=256,                      # tokens per input after padding/truncation
    batch_size=16,                    # DataLoader batch size
    learning_rate=2e-5,               # AdamW learning rate
    epochs=3,                         # passes over the training set
    device="cuda" if torch.cuda.is_available() else "cpu",  # GPU when available
)


PARAMS


#### **Step 3. Tokenization**

In [None]:
# BERT tokenizer for the checkpoint named in PARAMS
tokenizer = BertTokenizer.from_pretrained(PARAMS["model_name"])

# Sanity check: encode the first training example, padded/truncated to max_len,
# and return PyTorch tensors.
sample_text = df_train["combined_text"].iloc[0]
tokens = tokenizer.encode_plus(
    sample_text,
    truncation=True,
    padding="max_length",
    max_length=PARAMS["max_len"],
    return_tensors="pt",
)

print("Original text:\n", sample_text[:200], "...\n")  # first 200 chars
print("Token IDs:\n", tokens["input_ids"])
print("Attention mask:\n", tokens["attention_mask"])


##### When we tokenized a sample, we got two main results:

 **Input IDs**: Each word or subword is mapped to a numeric ID from BERT’s vocabulary.  
    The sequence starts with [CLS] (ID 101) and ends with [SEP] (ID 102).  
    If the text is shorter than 256 tokens, the rest is padded with 0s.  
    If longer, it is truncated to 256 tokens.  

 **Attention Mask**: A sequence of 1s and 0s.  
    1 means a real token.  
    0 means a padding token.  

This ensures that all inputs have the same length (max_len=256), and BERT can ignore padding during training.

#### **Step 4. Create Pytorch dataset loaders**



To prepare the data for BERT, we wrap it in a **custom PyTorch Dataset**:

 **NewsDataset**:  
   Takes the cleaned text (combined_text) and labels.  
   Uses the BERT tokenizer with our chosen `max_len=256`.  
   Returns `input_ids`, `attention_mask`, and `labels` tensors for each sample.  

We then create **DataLoaders** for train, validation, and test sets:  
 These efficiently batch and shuffle the data.  
 Each batch contains:
   input_ids → tokenized text  
   attention_mask → marks real tokens vs. padding  
   labels → 0 (FAKE) or 1 (TRUE)  

These DataLoaders will feed data into BERT during training and evaluation.

In [None]:
# Coerce a label column into a flat int64 vector
def ensure_label_vector(series):
    """Return the labels in `series` as a NumPy int64 array.

    Non-object inputs are simply cast to int64. For object-dtype inputs each
    element is converted with ``int``; elements that are lists or arrays
    contribute their first item.
    """
    values = np.asarray(series)
    if values.dtype != object:
        return values.astype(np.int64)

    def _to_int(item):
        # Unwrap list / ndarray cells by taking their first entry
        if isinstance(item, (list, np.ndarray)):
            return int(item[0])
        return int(item)

    return np.array([_to_int(item) for item in values], dtype=np.int64)

# Normalise the numeric label column of every split in place
for _split in (df_train, df_val, df_test):
    _split["label_num"] = ensure_label_vector(_split["label_num"])

In [None]:
from torch.utils.data import Dataset as TorchDataset, DataLoader

# 1. Map-style dataset that tokenizes one article per lookup
class NewsDataset(TorchDataset):
    """Dataset yielding tokenized texts together with their labels.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading dimension squeezed away) and ``labels``
    (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = list(texts)
        self.labels = np.asarray(labels, dtype=np.int64)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Accept tensor indices as well as plain integers
        position = int(idx.item() if isinstance(idx, torch.Tensor) else idx)

        raw_text = str(self.texts[position])
        target = int(self.labels[position])

        # Pad/truncate to max_len; tensors come back with a leading dim of 1
        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        sample = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample


# 2. Helper function to create DataLoaders
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap a split in a NewsDataset and return a DataLoader over it.

    Args:
        df: DataFrame with a ``combined_text`` column (inputs) and a
            ``label_num`` column (integer targets).
        tokenizer: tokenizer handed to NewsDataset.
        max_len: maximum sequence length handed to NewsDataset.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data on every pass.
            Defaults to True (the previous behaviour). Pass False for
            validation/test data so that evaluation is deterministic and
            predictions come back in the row order of ``df``.

    Returns:
        torch.utils.data.DataLoader yielding dicts with ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    ds = NewsDataset(
        texts=df["combined_text"].to_numpy(),
        labels=df["label_num"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

# 3. Create DataLoaders for train, val, test.
# Only the training data is shuffled; the evaluation loaders keep row order.
train_data_loader = create_data_loader(df_train, tokenizer, PARAMS["max_len"], PARAMS["batch_size"], shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer, PARAMS["max_len"], PARAMS["batch_size"], shuffle=False)
test_data_loader = create_data_loader(df_test, tokenizer, PARAMS["max_len"], PARAMS["batch_size"], shuffle=False)

# Number of batches per split
len(train_data_loader), len(val_data_loader), len(test_data_loader)


#### **Step 5. Model Setup**

In this step, we build a custom **BERT-based classifier** for our fake news detection task.  
The model architecture is as follows:

 **BERT Base Model**: We load `bert-base-uncased` as the backbone to generate contextual embeddings from text.  
 **Dropout Layer**: Added to reduce overfitting by randomly deactivating some neurons during training.  
 **Linear Layer**: A fully connected layer that maps BERT’s hidden size (768 dimensions) to our two output classes (`FAKE` and `TRUE`).  
  **Freezing Option**: We can choose to freeze BERT’s pre-trained layers (`freeze_bert=True`) so only the classifier head trains (faster but may underfit). If set to `False`, the entire model is fine-tuned (slower but usually yields better results).  
 **Device Placement**: The model is moved to GPU (`cuda`) if available, otherwise runs on CPU.

This classifier combines the rich semantic understanding from BERT with a lightweight classification head tailored to our dataset.


In [None]:
# Define a custom BERT-based classifier
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        freeze_bert: when True, all encoder parameters get
            ``requires_grad = False`` so only the linear head is trained.
        model_name: checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to "bert-base-uncased" (the previously hard-coded value).
        num_labels: number of output logits (default 2: FAKE vs TRUE).
        dropout: dropout probability applied to the pooled output
            (default 0.3, the previously hard-coded value).
    """

    def __init__(self, freeze_bert=True, model_name="bert-base-uncased", num_labels=2, dropout=0.3):
        super().__init__()

        # Pre-trained encoder
        self.bert = BertModel.from_pretrained(model_name)

        # Regularisation between encoder and head
        self.dropout = nn.Dropout(p=dropout)

        # Linear head sized from the encoder's configured hidden size
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        # Optionally freeze BERT weights (so only the classifier trains)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the raw logits tensor produced by the linear head."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # The encoder's pooled output is the input to the head
        pooled_output = outputs.pooler_output

        # Apply dropout then classifier
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return x

# Instantiate the model with the same checkpoint the tokenizer was loaded
# from, so the two cannot drift apart when PARAMS["model_name"] is edited.
model = BertClassifier(freeze_bert=False, model_name=PARAMS["model_name"])  # set freeze_bert=True to freeze BERT
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


**Understanding the BERT Model Architecture**

When we print the model, we see the layers and components that make up our `BertClassifier`:

1. **`bert` (the backbone)**  
    This is the pre-trained **BERT encoder** that processes the text.  
    It has:
      **Embeddings**:
        *Word embeddings*: map each token ID to a 768-dimensional vector.  
        *Position embeddings*: capture word order in a sentence (since transformers have no sense of sequence by default).  
        *Token type embeddings*: allow BERT to distinguish between sentence A vs sentence B (useful for tasks like QA).  
      **Encoder layers**: 12 stacked layers (for `bert-base-uncased`).  
        Each layer has **self-attention**, **feed-forward**, and **layer normalization**.  
        These layers let BERT capture contextual meaning — e.g., “bank” in *river bank* vs *money bank*.  
      **Pooler**: takes the representation of the special `[CLS]` token and transforms it into a fixed-size vector (used for classification).

2. **Dropout (0.3)**  
    A regularization layer we added to reduce overfitting.  
    Randomly “drops” 30% of the neurons during training to make the model more robust.

3. **Classifier (Linear layer)**  
    Input: 768-dim vector from BERT (the pooled `[CLS]` token).  
    Output: 2 logits → `[FAKE, TRUE]`.  
    This is the final prediction layer.


 In short:  
 The **BERT encoder** extracts deep contextual features from the text.  
 The **dropout** improves generalization.  
 The **classifier** maps the features to our labels (fake vs true news).


In [None]:
# Map existing DataLoaders to the expected names
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup

# Short aliases for the loaders built earlier
train_loader, val_loader, test_loader = train_data_loader, val_data_loader, test_data_loader

# AdamW over all model parameters, learning rate taken from PARAMS
optimizer = AdamW(params=model.parameters(), lr=PARAMS["learning_rate"])

# Linear learning-rate schedule with no warmup, spanning every optimisation
# step of the run (batches per epoch x epochs)
total_steps = PARAMS["epochs"] * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,          # warmup can be tuned
    num_training_steps=total_steps,
)

# Cross-entropy over the two output logits (FAKE / TRUE)
criterion = CrossEntropyLoss()


In [None]:
# Show the objects that drive training: loader, optimizer, scheduler, loss
for _component in (train_loader, optimizer, scheduler, criterion):
    print(_component)

#### **Step 6. Training Loop**

This step is where the model actually **learns** from data. During training:  
1. Batches of text + labels are fed into the model.  
2. The model makes predictions.  
3. A **loss** is calculated to measure errors.  
4. Backpropagation updates the model weights to improve performance.  

We repeat this process over several **epochs** (full passes through the training set).  
At the end of each epoch, we track:  
 **Average loss** → how well the model is fitting.  
 **Accuracy** → how well it’s predicting.  

This step is the core of fine-tuning BERT for our dataset.

In [None]:
from tqdm import tqdm
import torch
import numpy as np

def train_epoch(model, data_loader, optimizer, criterion, scheduler, device, epoch=None, log_interval=50):
    """Run one training pass over `data_loader`.

    Args:
        model: callable as ``model(input_ids=..., attention_mask=...)`` and
            returning a logits tensor.
        data_loader: yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: stepped once per batch.
        criterion: loss called as ``criterion(logits, labels)``.
        scheduler: stepped once per batch, right after the optimizer.
        device: device the batch tensors are moved to.
        epoch: only used in the progress-bar description.
        log_interval: refresh the running loss/accuracy on the progress bar
            every this many batches (and on the last batch).

    Returns:
        ``(accuracy, mean_loss)`` over all batches seen in this pass.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    total = 0

    progress = tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} [Train]", leave=False)

    for step, batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients BEFORE the forward/backward pass. Zeroing only after
        # the step would let gradients left over from an earlier, interrupted
        # run of this cell be added to the first update.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs
        _, preds = torch.max(logits, dim=1)
        loss = criterion(logits, labels)

        correct_predictions += torch.sum(preds == labels).item()
        total += labels.size(0)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update tqdm bar with running loss and accuracy
        if (step + 1) % log_interval == 0 or (step + 1) == len(data_loader):
            avg_loss = np.mean(losses)
            acc = correct_predictions / total
            progress.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{acc:.4f}"})

    return correct_predictions / total, np.mean(losses)


def eval_model(model, data_loader, criterion, device, epoch=None):
    """Evaluate `model` on `data_loader` without tracking gradients.

    The model is put in eval mode and every batch is scored under
    ``torch.no_grad()``. `epoch` is only used in the progress-bar description.

    Returns:
        ``(accuracy, mean_loss)`` over all batches.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0
    n_seen = 0
    n_batches = len(data_loader)

    bar = tqdm(enumerate(data_loader), total=n_batches, desc=f"Epoch {epoch} [Eval]", leave=False)

    with torch.no_grad():
        for step, batch in bar:
            targets = batch["labels"].to(device)
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            # Predicted class = index of the largest logit
            predicted = torch.max(logits, dim=1).indices
            batch_loss = criterion(logits, targets)

            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)
            batch_losses.append(batch_loss.item())

            # Refresh the bar every 50 batches and on the final batch
            batches_done = step + 1
            if batches_done % 50 == 0 or batches_done == n_batches:
                bar.set_postfix({
                    "loss": f"{np.mean(batch_losses):.4f}",
                    "acc": f"{n_correct / n_seen:.4f}",
                })

    return n_correct / n_seen, np.mean(batch_losses)

# Main training loop: one training pass and one validation pass per epoch.
# Note: this rebinds `history`, which previously held the Keras History object.
history = {name: [] for name in ("train_acc", "train_loss", "val_acc", "val_loss")}

for epoch in range(PARAMS["epochs"]):
    print(f"Epoch {epoch + 1}/{PARAMS['epochs']}")

    train_acc, train_loss = train_epoch(
        model, train_data_loader, optimizer, criterion, scheduler, device, epoch=epoch+1
    )
    val_acc, val_loss = eval_model(
        model, val_data_loader, criterion, device, epoch=epoch+1
    )

    # Keep the per-epoch curves
    history["train_acc"] += [train_acc]
    history["train_loss"] += [train_loss]
    history["val_acc"] += [val_acc]
    history["val_loss"] += [val_loss]

    print(f"Train loss {train_loss:.4f}, accuracy {train_acc:.4f}")
    print(f"Val   loss {val_loss:.4f}, accuracy {val_acc:.4f}")
    print("-" * 50)



**Training Epoch Outputs**

Each epoch output shows the model's performance on both the training and validation sets:  

 **Train loss / accuracy**: How well the model is fitting the training data. A decreasing loss and increasing accuracy indicate learning.  
 **Validation loss / accuracy**: How well the model generalizes to unseen data. Stable or slightly higher validation loss compared to training is normal.  
 **Epoch progression**: Each epoch represents a full pass through the training dataset.  

From the outputs, the model quickly learned to distinguish FAKE and TRUE news, achieving very high accuracy and low loss by the final epoch.


#### **Step 7. Evaluation**

Now that our BERT model has been trained for 3 epochs, we need to evaluate its performance on the test set.  
This involves:
 Switching the model to **evaluation mode** (disables dropout) and running inference inside `torch.no_grad()` (disables gradient tracking).  
 Running the model on the test dataloader.  
 Collecting predictions and true labels.  
 Computing evaluation metrics:
   **Accuracy** (overall correctness).  
   **Precision, Recall, F1-score** (per-class and averaged).  
   **Confusion Matrix** (to see where the model makes mistakes).

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the test set with the fine-tuned classifier
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_data_loader:
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # Use `.logits` when the output object has it; otherwise the output
        # itself is taken to be the logits tensor (as BertClassifier returns).
        logits = getattr(outputs, "logits", outputs)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Accuracy
acc = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {acc:.4f}")

# Per-class metrics (class 0 -> FAKE, class 1 -> TRUE)
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["FAKE", "TRUE"]))

# Arrays kept for the model comparison table
y_pred_bert_combined = np.array(all_preds)
y_true_bert_combined = np.array(all_labels)

The model achieved outstanding performance on the test set:

 **Accuracy:** 99.96%  
 **Precision, Recall, and F1-score:** All values are essentially 1.00 (100%) for both classes (FAKE and TRUE).  
The test set contained 2,348 FAKE and 2,142 TRUE examples, meaning the dataset is relatively balanced.  

####  Interpretation
The model is almost perfectly distinguishing between FAKE and TRUE news articles.  
 An accuracy of 99.96% suggests that only about **2 samples out of 4,490** were misclassified.  
 High precision and recall indicate that the model is not only making correct predictions but also covering nearly all true cases.  

####  Conclusion
These results show that the fine-tuned BERT model generalizes very well on unseen data.

### **4.8 Model and Evaluation Metrics Comparison**

We define a function that creates a table comparing the metrics of all the models, and we export that table so that we do not have to rerun the models on the GPU.

In [None]:
# --- Unified artifact saver: writes to Kaggle (if present) and to local ../data when available ---
import os
from pathlib import Path

def save_artifact(df, filename: str):
    """
    Save a pandas DataFrame as CSV (without the index) to every available target.

    Targets, tried in this order:
      - /kaggle/working/<filename> when that directory exists (Kaggle runs)
      - ../data/<filename> when that directory exists (local runs)
      - ./<filename> as a fallback when neither directory is available

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; written with ``df.to_csv(path, index=False)``.
    filename : str
        File name joined onto each target directory.

    Returns
    -------
    list of str
        The paths that were actually written (never empty on success).
    """
    # (log prefix, directory) pairs; the prefixes reproduce the log lines exactly.
    targets = [
        ("[Kaggle]", Path("/kaggle/working")),
        ("[Local] ", Path("../data")),
    ]

    written = []
    for prefix, directory in targets:
        # is_dir() rather than exists(): a plain file with the same name must
        # not be treated as a writable target directory.
        if not directory.is_dir():
            continue
        out_path = directory / filename
        df.to_csv(out_path, index=False)
        print(f"{prefix} Saved: {out_path}")
        written.append(str(out_path))

    # Fallback: nothing was written above, so use the current directory
    if not written:
        out_path = Path("./") / filename
        df.to_csv(out_path, index=False)
        print(f"[Fallback] Saved: {out_path}")
        written.append(str(out_path))

    return written

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Helper: coerce any y (strings, bools, logits, probs, tensors) -> int IDs using your existing lbl_enc/id2label
def to_label_ids(y, lbl_enc, id2label=None):
    """
    Coerce labels or model outputs into a 1-D ``int64`` array of class ids.

    Handling, in order:
      * objects exposing ``detach`` (torch tensors) are detached, moved to CPU
        and converted to NumPy;
      * 2-D input with a single column ``[N, 1]`` is flattened to ``[N]``
        (an argmax over one column would always yield 0);
      * 2-D input ``[N, C]`` with ``C > 1`` is reduced with argmax over axis 1;
      * boolean and integer arrays are cast to ``int64``;
      * float arrays: integer-valued data (e.g. ``0.0``/``1.0``) is cast;
        values inside ``[0, 1]`` are treated as probabilities of class id 1
        and thresholded with ``> 0.5``; anything else raises ``ValueError``;
      * everything else is converted to ``str`` and encoded with
        ``lbl_enc.transform``; if that fails and ``id2label`` is given, the
        inverted ``id2label`` mapping is used instead.

    Parameters
    ----------
    y : array-like or tensor
        Labels, predictions, probabilities or logits.
    lbl_enc : object with ``transform``
        Encoder used for string/object labels; unused for numeric input.
    id2label : dict, optional
        Mapping ``id -> label`` used as a fallback when ``lbl_enc`` fails.

    Returns
    -------
    numpy.ndarray
        Class ids with dtype ``int64``.

    Raises
    ------
    ValueError
        For float input that is neither integer-valued nor within ``[0, 1]``.
    KeyError
        When the ``id2label`` fallback meets a label it does not contain.
    Exception
        Whatever ``lbl_enc.transform`` raised, when no ``id2label`` is given.
    """
    # Convert tensors to numpy
    if hasattr(y, "detach"):  # torch tensor
        y = y.detach().cpu().numpy()
    y = np.asarray(y)

    if y.ndim == 2:
        if y.shape[1] == 1:
            # Column vector of ids / probabilities: keep the values themselves
            y = y[:, 0]
        else:
            # Logits / probabilities of shape [N, C]: pick the best class
            y = y.argmax(axis=1)

    # Booleans and integers are already class ids
    if y.dtype == np.bool_ or np.issubdtype(y.dtype, np.integer):
        return y.astype(np.int64)

    if np.issubdtype(y.dtype, np.floating):
        # Integer-valued floats such as 0.0 / 1.0 are ids stored as floats
        if np.all(np.isfinite(y)) and np.array_equal(y, np.round(y)):
            return y.astype(np.int64)
        # Binary probabilities of the class with id 1
        if np.all((y >= 0.0) & (y <= 1.0)):
            return (y > 0.5).astype(np.int64)
        raise ValueError(
            "Cannot convert float values to label ids: expected integer-valued "
            "data or probabilities within [0, 1]."
        )

    # Handle strings/object labels -> use lbl_enc to map to ids
    y_str = y.astype(str)
    try:
        return lbl_enc.transform(y_str).astype(np.int64)
    except Exception:
        # Fallback via id2label mapping if provided
        if id2label is None:
            raise
        label2id_fallback = {v: k for k, v in id2label.items()}
        return np.array([label2id_fallback[s] for s in y_str], dtype=np.int64)

def row(model, variant, y_true, y_pred):
    """
    Build one record of the model-comparison table.

    Computes accuracy plus weighted-average precision, recall and F1 for the
    given labels and predictions (``zero_division=0`` is passed to
    ``precision_recall_fscore_support``).

    Returns a dict with the keys ``model``, ``variant``, ``accuracy``,
    ``precision``, ``recall`` and ``f1``, in that order.
    """
    record = {"model": model, "variant": variant}
    record["accuracy"] = accuracy_score(y_true, y_pred)

    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    # The fourth element (support) is not part of the table; zip stops after
    # the three named metrics.
    record.update(zip(("precision", "recall", "f1"), weighted_scores))
    return record

# --- Normalize all y_* to integer IDs consistently ---
# Ground truths
y_true_lr          = to_label_ids(y_test,                 lbl_enc, id2label)
y_true_lstm_clean  = to_label_ids(y_te,                   lbl_enc, id2label)       # if your LSTM-clean uses y_te
y_true_lstm_comb   = to_label_ids(y_test,                 lbl_enc, id2label)
y_true_bert_c      = to_label_ids(y_true_bert_clean,      lbl_enc, id2label)
y_true_bert_comb   = to_label_ids(y_true_bert_combined,   lbl_enc, id2label)

# Predictions (handle strings, tensors, or logits)
y_pred_lr_clean_id   = to_label_ids(y_pred_lr_clean,        lbl_enc, id2label)
y_pred_lr_comb_id    = to_label_ids(y_pred_lr_combined,     lbl_enc, id2label)

y_pred_lstm_clean_id = to_label_ids(y_pred_lstm_clean,      lbl_enc, id2label)
y_pred_lstm_comb_id  = to_label_ids(y_pred_lstm_combined,   lbl_enc, id2label)

y_pred_bert_c_id     = to_label_ids(y_pred_bert_clean,      lbl_enc, id2label)
y_pred_bert_comb_id  = to_label_ids(y_pred_bert_combined,   lbl_enc, id2label)

# Sanity check: every normalized array may only contain ids known to lbl_enc
valid_ids = set(lbl_enc.transform(lbl_enc.classes_))
normalized_arrays = {
    "y_true_lr": y_true_lr,
    "y_true_lstm_clean": y_true_lstm_clean,
    "y_true_lstm_comb": y_true_lstm_comb,
    "y_true_bert_c": y_true_bert_c,
    "y_true_bert_comb": y_true_bert_comb,
    "y_pred_lr_clean_id": y_pred_lr_clean_id,
    "y_pred_lr_comb_id": y_pred_lr_comb_id,
    "y_pred_lstm_clean_id": y_pred_lstm_clean_id,
    "y_pred_lstm_comb_id": y_pred_lstm_comb_id,
    "y_pred_bert_c_id": y_pred_bert_c_id,
    "y_pred_bert_comb_id": y_pred_bert_comb_id,
}
for array_name, label_ids in normalized_arrays.items():
    unexpected_ids = set(np.unique(label_ids)) - valid_ids
    assert not unexpected_ids, f"Unexpected labels in {array_name}: {sorted(unexpected_ids)}"

# --- Build the table using ONLY normalized arrays ---
rows = [
    row("LogReg", "clean",    y_true_lr,         y_pred_lr_clean_id),
    row("LogReg", "combined", y_true_lr,         y_pred_lr_comb_id),
    row("LSTM",   "clean",    y_true_lstm_clean, y_pred_lstm_clean_id),
    row("LSTM",   "combined", y_true_lstm_comb,  y_pred_lstm_comb_id),
    row("BERT",   "clean",    y_true_bert_c,     y_pred_bert_c_id),
    row("BERT",   "combined", y_true_bert_comb,  y_pred_bert_comb_id),
]

# Stored table is ordered by model/variant; the displayed view ranks by F1
metrics_df = pd.DataFrame(rows).sort_values(["model","variant"])
display(metrics_df.sort_values("f1", ascending=False))

# save_artifact chooses the target directories itself, so report the paths it
# actually wrote instead of a separately constructed path.
saved_paths = save_artifact(metrics_df, "model_comparison_clean_vs_combined.csv")
print(f"Saved metrics table to: {', '.join(saved_paths)}")