In [None]:
!pip install -U "transformers>=4.40.0"



In [None]:
from google.colab import files
uploaded = files.upload()

Saving archive.zip to archive (1).zip


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedShuffleSplit

# Download resources once
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **2.2 Load and Preview Data**

In [None]:
with zipfile.ZipFile("archive.zip", 'r') as zip_ref:
    zip_ref.extractall("unzipped_data")

print("Files extracted successfully!")

Files extracted successfully!


In [None]:
os.listdir("unzipped_data")

['True.csv', 'Fake.csv']

In [None]:
fake_df = pd.read_csv("unzipped_data/Fake.csv")
true_df = pd.read_csv("unzipped_data/True.csv")

print("Fake News Dataset:", fake_df.shape)
print("True News Dataset:", true_df.shape)

fake_df.head()

Fake News Dataset: (23481, 4)
True News Dataset: (21417, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
#merge and label

#Add a label column
fake_df["label"] = "FAKE"
true_df["label"] = "TRUE"

#Merge into one dataset
df = pd.concat([fake_df, true_df], ignore_index = True)

#Shuffle the rows so FAKE and TRUE are mixed
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

#Check the structure
print(df.shape)
print(df["label"].value_counts())
print(df.info())
df.head()

(44898, 5)
label
FAKE    23481
TRUE    21417
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB
None


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE


## **Chapter 3. Data Preparation**
In this section, we will

### **3.1 Lowercasing & URL removal**

**1. Defining Preprocessing Function**

In [None]:
def preprocess_text_lowercase_url(text):
    """
    MAIN PREPROCESSING FUNCTION:
    - Converts text to lowercase
    - Removes URLs, hyperlinks, and website addresses
    - Handles missing values safely
    - Cleans extra whitespace
    """
    # Handle missing values
    if pd.isna(text) or text is None:
        return ""

    # Convert to string to ensure consistent processing
    text = str(text)

    # COMPREHENSIVE URL REMOVAL PATTERN:
    url_pattern = r'https?://\S+|www\.\S+|\S+\.(com|org|net|edu|gov|io|co|uk)\S*|bit\.ly/\S+|t\.co/\S+'

    # Remove all URLs from text
    text = re.sub(url_pattern, '', text)

    # Convert entire text to lowercase for consistency
    text = text.lower()

    # Clean up extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

print("Preprocessing functions defined!\n")

Preprocessing functions defined!



**2. Quality check functions**

In [None]:
def contains_url(text):
    """Check if text contains any URLs"""
    url_pattern = r'https?://|www\.|\.[a-z]{2,}'
    return bool(re.search(url_pattern, str(text).lower()))

def count_uppercase(text):
    """Count uppercase characters in text"""
    return sum(1 for char in str(text) if char.isupper())

### **3.2 Remove Non-Alphabetic Characters**

In [None]:
URL_RE   = re.compile(r'https?://\S+|www\.\S+')
HTML_RE  = re.compile(r'<.*?>')
NONALPH  = re.compile(r'[^a-z\s]+')     # keep letters & spaces only
WS_RE    = re.compile(r'\s+')

# Defining Preprocessing Function
def _keep_alpha_only(text: str) -> str:
    text = NONALPH.sub(" ", text)    # remove non-letters
    text = WS_RE.sub(" ", text).strip()
    return text

### **3.3 Lemmatization**

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Define preprocessing + lemmatization function
def preprocess_and_lemmatize(text):
    if isinstance(text, str):  # make sure it's a string
        # Lowercase
        text = text.lower()

        # Remove punctuation, numbers, special chars
        text = re.sub(r'[^a-z\s]', '', text)

        # Tokenize
        tokens = nltk.word_tokenize(text)

        # Remove stopwords + lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

        return " ".join(tokens)
    else:
        return ""


### **3.4 Apply Preprocessing**

**Defining Function**

In [None]:
# Defining function to apply preprocessing
def apply_preprocessing(text: str) -> str:
    """
      1) preprocess_text_lowercase_url  [lowercase + URL removal + whitespace clean]
      2) _keep_alpha_only                [remove non-alphabetic, collapse spaces]
      3) preprocess_and_lemmatize [tokenize, drop stopwords, lemmatize]
    """
    # Step 1 (Teammate 4)
    text = preprocess_text_lowercase_url(text)

    # Step 2 (Teammate 4)
    text = _keep_alpha_only(text)

    # Step 3 (Teammate 5, adapter)
    text = preprocess_and_lemmatize(text)

    return text


**Applying Preprocessing**

In [None]:
df['title_clean'] = df['title'].apply(apply_preprocessing)
df['text_clean'] = df['text'].apply(apply_preprocessing)

In [None]:
df["combined_text"] = (df["title_clean"] + " " + df["text_clean"]).str.strip()

In [None]:
df.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean,combined_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE,ben stein call th circuit court committed coup...,st century wire say ben stein reputable profes...,ben stein call th circuit court committed coup...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE,trump drop steve bannon national security council,washington reuters u president donald trump re...,trump drop steve bannon national security coun...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE,puerto rico expects u lift jones act shipping ...,reuters puerto rico governor ricardo rossello ...,puerto rico expects u lift jones act shipping ...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE,oops trump accidentally confirmed leaked israe...,monday donald trump embarrassed country accide...,oops trump accidentally confirmed leaked israe...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE,donald trump head scotland reopen golf resort,glasgow scotland reuters u presidential candid...,donald trump head scotland reopen golf resort ...


**Checking length of combined texts**

In [None]:
# Character lengths
COL = "combined_text"
df["_char_len"] = df[COL].str.len()

# Simple whitespace token count [plain: word count proxy]
df["_tok_len_ws"] = df[COL].str.split().apply(len)

# Summaries
print("Char length stats:", df["_char_len"].describe(percentiles=[.5,.9,.95,.99]).to_dict())
print("Token length stats:", df["_tok_len_ws"].describe(percentiles=[.5,.9,.95,.99]).to_dict())

# Flags for extremes (adjust thresholds to your data)
too_short = df["_tok_len_ws"] < 3           # [plain: likely junk]
too_long_char = df["_char_len"] > 8000      # [plain: abnormally long articles]
print({"too_short": int(too_short.sum()), "too_long_char": int(too_long_char.sum())})

Char length stats: {'count': 44898.0, 'mean': 1752.6512762261125, 'std': 1504.6273547302997, 'min': 0.0, '50%': 1542.0, '90%': 3202.0, '95%': 3917.300000000003, '99%': 6340.029999999999, 'max': 37972.0}
Token length stats: {'count': 44898.0, 'mean': 242.22885206467993, 'std': 204.33721510097453, 'min': 0.0, '50%': 215.0, '90%': 440.0, '95%': 538.0, '99%': 863.0, 'max': 4968.0}
{'too_short': 9, 'too_long_char': 266}


**Insights**

Since some combined texts are too long, those training BERT and LSTM should set paramters for the maximum length allowed

## **Chapter 4. Modelling**

### **4.1 Stratified Train Test Split**

We create a function to do two Stratified Train Test Splits to our data ensruing that we have 10% in both the validation split and the test split.

A Stratified Split ensures we maintain the ratio of classes `Main/Fake` throughhout our splits

We export the splits into csv files for GPU training

In [None]:
def train_val_test_split_stratified(df, label_col="label", test_size=0.1, val_size=0.1, seed=42):
    y = df[label_col].values  # these are "FAKE"/"TRUE" strings

    # First split: train+val vs test
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    idx_trainval, idx_test = next(sss1.split(df, y))
    df_trainval = df.iloc[idx_trainval].reset_index(drop=True)
    df_test = df.iloc[idx_test].reset_index(drop=True)

    # Second split: train vs val
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_size/(1-test_size), random_state=seed)
    y_tv = df_trainval[label_col].values
    idx_train, idx_val = next(sss2.split(df_trainval, y_tv))
    df_train = df_trainval.iloc[idx_train].reset_index(drop=True)
    df_val = df_trainval.iloc[idx_val].reset_index(drop=True)

    return df_train, df_val, df_test

# Run the split
df_train, df_val, df_test = train_val_test_split_stratified(
    df, label_col="label", test_size=0.10, val_size=0.10, seed=42
)

# Print sizes
print({k: len(v) for k,v in {"train": df_train, "val": df_val, "test": df_test}.items()})

# Save the raw splits
from pathlib import Path
split_dir = Path("../data")        # define as Path, not string
split_dir.mkdir(parents=True, exist_ok=True)  # make sure folder exists

df_train.to_csv(split_dir / "train.csv", index=False)
df_val.to_csv(split_dir / "val.csv", index=False)
df_test.to_csv(split_dir / "test.csv", index=False)
print("Files saved succesfully under data/")

{'train': 35918, 'val': 4490, 'test': 4490}
Files saved succesfully under data/


## MY PART

In [None]:
# Install and import
import numpy as np
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch
from sklearn.preprocessing import LabelEncoder

In [None]:
#Parameters

PARAMS = {
    "model_name": "bert-base-uncased",
    "max_len": 128,      # shorter length to save on memory
    "batch_size": 8,
    "learning_rate": 2e-5,
    "epochs": 3,
    "train_test_split": 0.2
}

In [None]:
#Loading the splits

train_df = pd.read_csv("../data/train.csv")
val_df   = pd.read_csv("../data/val.csv")
test_df  = pd.read_csv("../data/test.csv")

print(train_df.shape, val_df.shape, test_df.shape)

(35918, 10) (4490, 10) (4490, 10)


In [None]:
#Encoding labels
#Using hugging face expects numeric value hence changing False/True to O/1

lbl_enc = LabelEncoder()
train_df["label_id"] = lbl_enc.fit_transform(train_df["label"])
val_df["label_id"]   = lbl_enc.transform(val_df["label"])
test_df["label_id"]  = lbl_enc.transform(test_df["label"])

id2label = {i: l for i,l in enumerate(lbl_enc.classes_)}
label2id = {l: i for i,l in enumerate(lbl_enc.classes_)}

In [None]:
#Building hugging face datasets

ds_train = Dataset.from_pandas(train_df[["combined_text","label_id"]])
ds_val   = Dataset.from_pandas(val_df[["combined_text","label_id"]])
ds_test  = Dataset.from_pandas(test_df[["combined_text","label_id"]])

In [None]:
#Tokenizer

tokenizer = AutoTokenizer.from_pretrained(PARAMS["model_name"], use_fast=True)

def tokenize_fn(batch):
    texts = [str(t) if t is not None else "" for t in batch["combined_text"]]
    return tokenizer(
        texts,
        truncation=True,
        padding=False,
        max_length=PARAMS["max_len"]
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Mapping

ds_train = ds_train.map(tokenize_fn, batched=True)
ds_val   = ds_val.map(tokenize_fn, batched=True)
ds_test  = ds_test.map(tokenize_fn, batched=True)

Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/4490 [00:00<?, ? examples/s]

Map:   0%|          | 0/4490 [00:00<?, ? examples/s]

In [None]:
print(ds_train.column_names)

['combined_text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# PyTorch format
ds_train.set_format("torch")
ds_val.set_format("torch")
ds_test.set_format("torch")

In [None]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
#Loading the model

model = AutoModelForSequenceClassification.from_pretrained(
    PARAMS["model_name"],
    num_labels=len(lbl_enc.classes_),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Evaluation metrics

accuracy_metric  = evaluate.load("accuracy")
f1_metric        = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric    = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"],
        "precision": precision_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=preds, references=labels, average="weighted")["recall"],
    }



In [None]:
import transformers, sys
print("transformers version:", transformers.__version__)
print("transformers file:", transformers.__file__)
print("sys.path[0:5]:", sys.path[0:5])

transformers version: 4.56.2
transformers file: /usr/local/lib/python3.12/dist-packages/transformers/__init__.py
sys.path[0:5]: ['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload']


In [None]:
# In a cell
import transformers
print(transformers.__version__)
from transformers import TrainingArguments
help(TrainingArguments)

4.56.2
Help on class TrainingArguments in module transformers.training_args:

class TrainingArguments(builtins.object)
 |
 |  TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
 |  itself**.
 |
 |  Using [`HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |
 |  Parameters:
 |      output_dir (`str`, *optional*, defaults to `"trainer_output"`):
 |          The output directory where the model predictions and checkpoints will be written.
 |      overwrite_output_dir (`bool`, *optional*, defaults to `False`):
 |          If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
 |          points to a checkpoint directory.
 |      do_train (`bool`, *optional*, defaults to `False`):
 |          Whether to run training or not. This argument is not directl

In [None]:
import inspect
print(inspect.getfile(TrainingArguments))

/usr/local/lib/python3.12/dist-packages/transformers/training_args.py


In [None]:
#Training arguments

training_args = TrainingArguments(
    output_dir="./bert-fake-news",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=PARAMS["learning_rate"],
    per_device_train_batch_size=PARAMS["batch_size"],
    per_device_eval_batch_size=PARAMS["batch_size"],
    num_train_epochs=PARAMS["epochs"],
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [69]:
#Train

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0001,0.040348,0.993764,0.993765,0.993834,0.993764
2,0.0,0.016892,0.998218,0.998218,0.998218,0.998218


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0001,0.040348,0.993764,0.993765,0.993834,0.993764
2,0.0,0.016892,0.998218,0.998218,0.998218,0.998218
3,0.0,0.015863,0.997996,0.997996,0.997996,0.997996


TrainOutput(global_step=13470, training_loss=0.0034811090277728717, metrics={'train_runtime': 2692.4116, 'train_samples_per_second': 40.021, 'train_steps_per_second': 5.003, 'total_flos': 7087817164815360.0, 'train_loss': 0.0034811090277728717, 'epoch': 3.0})

In [70]:
results = trainer.evaluate(ds_test)
print(results)

{'eval_loss': 0.005134142469614744, 'eval_accuracy': 0.9993318485523385, 'eval_f1': 0.9993318416772765, 'eval_precision': 0.9993319341547163, 'eval_recall': 0.9993318485523385, 'eval_runtime': 34.4571, 'eval_samples_per_second': 130.307, 'eval_steps_per_second': 16.31, 'epoch': 3.0}
