In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import zipfile
from wordcloud import WordCloud
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: tokenizer models
# ("punkt", "punkt_tab"), WordNet data for the lemmatizer ("wordnet",
# "omw-1.4"), and the stopword lists ("stopwords").
# NOTE(review): these calls run every time the cell is executed; presumably
# nltk skips packages that are already installed locally — confirm.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")


### **2.2 Load and Preview Data**

In [None]:
# Unpack the raw dataset archive into a local working directory; the context
# manager closes the archive handle as soon as extraction finishes.
with zipfile.ZipFile("../data/archive.zip", mode='r') as archive:
    archive.extractall(path="unzipped_data")

print("Files extracted successfully!")

In [None]:
# Show what the archive contained (the next cell reads Fake.csv and True.csv
# from this directory).
os.listdir("unzipped_data")

In [None]:
# Load the two source tables extracted above: one CSV per class.
fake_df = pd.read_csv("unzipped_data/Fake.csv")
true_df = pd.read_csv("unzipped_data/True.csv")

# Report (rows, columns) for each table.
print("Fake News Dataset:", fake_df.shape)
print("True News Dataset:", true_df.shape)

# Preview the first rows of the fake-news table as the cell output.
fake_df.head()

In [None]:
# Merge and label

# Tag every row with its source class before the two tables are combined.
fake_df["label"] = "FAKE"
true_df["label"] = "TRUE"

# Stack both tables into one dataset with a fresh 0..n-1 index.
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the rows so FAKE and TRUE are mixed; the fixed random_state keeps
# the order reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the structure.
print(df.shape)
print(df["label"].value_counts())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
df.head()

## **Chapter 3. Data Preparation**
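In this section, we will clean and normalize the article titles and bodies: lowercase the text, remove URLs, strip non-alphabetic characters, drop stopwords, and lemmatize the remaining tokens.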

### **3.1 Lowercasing & URL removal**

**1. Defining Preprocessing Function**

In [None]:
def preprocess_text_lowercase_url(text):
    """
    Lowercase a text value and strip URLs from it.

    Steps, in order:
    - Missing values (None, NaN, pd.NA, NaT) become "".
    - The value is coerced to ``str``.
    - URLs are removed case-insensitively: http(s):// links, www. links,
      bare domains ending in a listed TLD, and bit.ly / t.co short links.
    - The text is lowercased.
    - Runs of whitespace collapse to one space; the result is trimmed.

    Parameters
    ----------
    text : Any
        Raw value, typically a string or a missing-value marker.

    Returns
    -------
    str
        The cleaned, lowercased text ("" for missing input).
    """
    # Handle missing values. The scalar guard keeps pd.isna from returning an
    # array (whose truth value is ambiguous) when given list-like input.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""

    # Convert to string to ensure consistent processing
    text = str(text)

    # URL pattern. The \b after the TLD list stops ordinary words that merely
    # start with a TLD (e.g. "end.come" from a missing space after a period)
    # from being deleted as if they were domains.
    url_pattern = (
        r'https?://\S+'
        r'|www\.\S+'
        r'|\S+\.(?:com|org|net|edu|gov|io|co|uk)\b\S*'
        r'|bit\.ly/\S+'
        r'|t\.co/\S+'
    )

    # Match case-insensitively so "HTTP://..." and "WWW...." are removed too;
    # the text has not been lowercased yet at this point.
    text = re.sub(url_pattern, '', text, flags=re.IGNORECASE)

    # Convert entire text to lowercase for consistency
    text = text.lower()

    # Clean up extra whitespace left behind by the removals
    text = re.sub(r'\s+', ' ', text).strip()

    return text

print("Preprocessing functions defined!\n")

**2. Quality check functions**

In [None]:
def contains_url(text):
    """Return True when the lowercased string form of ``text`` contains an
    ``http(s)://`` prefix, a ``www.`` prefix, or a dot followed by at least
    two letters; otherwise return False."""
    lowered = str(text).lower()
    hit = re.search(r'https?://|www\.|\.[a-z]{2,}', lowered)
    return hit is not None

def count_uppercase(text):
    """Return how many characters in the string form of ``text`` are uppercase."""
    total = 0
    for ch in str(text):
        if ch.isupper():
            total += 1
    return total

### **3.2 Remove Non-Alphabetic Characters**

In [None]:
URL_RE   = re.compile(r'https?://\S+|www\.\S+')
HTML_RE  = re.compile(r'<.*?>')
NONALPH  = re.compile(r'[^a-z\s]+')     # keep letters & spaces only
WS_RE    = re.compile(r'\s+')

# Defining Preprocessing Function
def _keep_alpha_only(text: str) -> str:
    text = NONALPH.sub(" ", text)    # remove non-letters
    text = WS_RE.sub(" ", text).strip()
    return text

### **3.3 Lemmatization**

In [None]:
# Shared NLP resources, created once and reused for every document.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_and_lemmatize(text):
    """Lowercase, strip non-letters, tokenize, drop stopwords, and lemmatize.

    Non-string input yields "". For strings, every character other than
    ``a-z`` or whitespace is deleted (not replaced by a space), the result is
    tokenized with ``nltk.word_tokenize``, tokens found in ``stop_words`` are
    discarded, and the remaining tokens are lemmatized and joined with single
    spaces.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)


### **3.4 Apply Preprocessing**

**Defining Function**

In [None]:
# Defining function to apply preprocessing
def apply_preprocessing(text: str) -> str:
    """
    Run the full cleaning pipeline on one raw text value and return the result.

    Stages, in order:
      1) preprocess_text_lowercase_url  [lowercase + URL removal + whitespace clean]
      2) _keep_alpha_only               [remove non-alphabetic, collapse spaces]
      3) preprocess_and_lemmatize       [tokenize, drop stopwords, lemmatize]
    """
    lowered_without_urls = preprocess_text_lowercase_url(text)

    # HTML tags are not stripped by this pipeline. If some sources contain
    # markup, HTML_RE.sub(" ", ...) belongs here, before the alpha-only stage.
    letters_only = _keep_alpha_only(lowered_without_urls)

    return preprocess_and_lemmatize(letters_only)


**Applying Preprocessing**

In [None]:
# Add a cleaned companion column for each raw text column.
for _raw_name, _clean_name in (('title', 'title_clean'), ('text', 'text_clean')):
    df[_clean_name] = df[_raw_name].apply(apply_preprocessing)

### **3.5 Preprocessing Tests**

These tests validate that our cleaned columns (`title_clean`, `text_clean`) are
correctly produced from the raw text (`title`, `text`) using the
`apply_preprocessing` pipeline.

### What the tests check
1. **Column existence & type**  
   - Both `*_clean` columns exist, are pandas Series, and contain strings.

2. **Basic cleaning guarantees**  
   - No URLs remain in the text.  
   - All text is lowercase.  
   - Only alphabetic characters and spaces are present.  
   - No leading, trailing, or repeated whitespace.  
   - No `NaN` values; all entries are strings.

3. **Pipeline correctness**  
   - Each `*_clean` column equals exactly one pass of `apply_preprocessing` on the raw column.

4. **Non-empty output for meaningful inputs**  
   - If a raw row has meaningful alphabetic content (not just URLs, HTML, or stopwords),
     the cleaned version is not empty.

5. **Length sanity**  
   - Cleaned text is usually shorter or equal in length compared to raw text.  
   - The majority of meaningful rows remain non-empty after preprocessing.

6. **Summary statistics**  
   - Reports row counts, percentage of non-empty values, median lengths,
     and how often cleaned length ≤ raw length.

### Why these tests matter
They ensure our preprocessing pipeline:
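- Safely handles edge cases (URLs, missing values). HTML tag stripping is currently disabled in `apply_preprocessing`, so tags are not removed as units; only their non-letter characters are dropped.  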
- Produces consistent, normalized text suitable for tokenization.  
- Doesn’t over-clean and erase meaningful content.  
- Stays aligned with the defined stopword/lemmatization rules.


In [None]:
# --- Neutralize any previously-defined idempotence checker lingering in memory ---
def _assert_idempotent(*args, **kwargs):
    return  # disabled by design

# --- Patterns ---
# Lowercase ASCII letters and spaces only (the empty string is accepted).
ALPHA_SPACE_RE = re.compile(r"^[a-z ]*$")  # letters + spaces only
# URL detector used by the checks in this cell. The groups are non-capturing:
# Series.str.contains emits a UserWarning for patterns with capture groups,
# and nothing in this cell reads the captured text.
URL_RE = re.compile(r"(?:https?://\S+|www\.\S+|\S+\.(?:com|org|net|edu|gov|io|co|uk)\S*|bit\.ly/\S+|t\.co/\S+)")

# --- Helpers ---
def _example(series, mask):
    m = mask.values if hasattr(mask, "values") else mask
    idx = np.flatnonzero(m)
    if len(idx):
        i = idx[0]
        try:
            return str(series.iloc[i])[:200]
        except Exception:
            return "<unavailable>"
    return "<none>"

def _assert_series_exists_and_string(s: pd.Series, name: str):
    assert isinstance(s, pd.Series), f"{name} is not a pandas Series."
    assert s.dtype == "object" or pd.api.types.is_string_dtype(s), f"{name} must be string-like dtype."

def _assert_no_urls(s: pd.Series, name: str):
    """Assert that no entry of ``s`` matches URL_RE (missing values count as "").

    Raises AssertionError naming ``name`` and quoting the first offending entry.
    """
    mask = s.fillna("").str.contains(URL_RE)
    # No whitespace after "!r": f-string parsers before Python 3.12 reject it.
    assert not mask.any(), f"{name}: URLs found. Example: {_example(s, mask)!r}"

def _assert_lowercase_only(s: pd.Series, name: str):
    mask = s.fillna("").str.contains(r"[A-Z]")
    assert not mask.any(), f"{name}: Uppercase letters found. Example: { _example(s, mask)!r }"

def _assert_alpha_space_only(s: pd.Series, name: str):
    """Assert that every non-empty entry of ``s`` matches ALPHA_SPACE_RE in full.

    Missing values are treated as "" and empty strings are always accepted.
    Raises AssertionError naming ``name`` and quoting the first offending entry.
    """
    # fullmatch rather than match: with match, the pattern's trailing "$"
    # also succeeds just before a final newline, letting "abc\n" slip through.
    mask = s.fillna("").apply(lambda x: bool(x) and ALPHA_SPACE_RE.fullmatch(x) is None)
    # No whitespace after "!r": f-string parsers before Python 3.12 reject it.
    assert not mask.any(), f"{name}: Non alpha/space characters found. Example: {_example(s, mask)!r}"

def _assert_no_extra_whitespace(s: pd.Series, name: str):
    s2 = s.fillna("")
    leading = s2.str.match(r"^\s")
    trailing = s2.str.contains(r"\s$")
    doubles  = s2.str.contains(r"\s{2,}")
    assert not leading.any(),  f"{name}: Leading spaces found. Example: { _example(s, leading)!r }"
    assert not trailing.any(), f"{name}: Trailing spaces found. Example: { _example(s, trailing)!r }"
    assert not doubles.any(),  f"{name}: Multiple consecutive spaces found. Example: { _example(s, doubles)!r }"

def _assert_no_nans_and_strings(s: pd.Series, name: str):
    assert not s.isna().any(), f"{name}: NaNs present."
    assert s.map(lambda x: isinstance(x, str)).all(), f"{name}: Non-string detected."

def _assert_matches_single_pass(raw: pd.Series, cleaned: pd.Series, name: str, fn):
    recomputed = raw.apply(fn)
    diff = cleaned != recomputed
    if diff.any():
        i = np.flatnonzero(diff.values)[0]
        raise AssertionError(
            f"{name}: cleaned column != single-pass pipeline.\n"
            f"  raw:                       {raw.iloc[i]!r}\n"
            f"  cleaned (your column):     {cleaned.iloc[i]!r}\n"
            f"  recomputed({fn.__name__}): {recomputed.iloc[i]!r}"
        )

# --- Meaningfulness heuristic aligned with your pipeline's stopwords ---
# Prefer your NLTK stopword set if present; otherwise use a richer fallback.
try:
    # Copy the pipeline's stopword collection when it is defined; set() always
    # returns a set, so no further type check is needed.
    STOPWORDS_FOR_TEST = set(stop_words)
except NameError:
    # stop_words is not defined in this kernel: fall back to a built-in list.
    STOPWORDS_FOR_TEST = set("""
    a an the and or but if then else when while is are was were be been being am
    to of in on at by for from as that this it its into over under about with
    i you he she we they me my mine your yours his her hers our ours their theirs
    do does did doing done have has had having be been being not no nor
    s t d ll re ve m y
    """.split())

def _raw_has_meaningful_letters(s: str) -> bool:
    """Decide if raw text *should* yield non-empty output after your pipeline.
       Mirrors your cleaning (strip URLs/HTML/non-alpha) and uses your stopwords.
    """
    if s is None:
        return False
    s = re.sub(URL_RE, " ", str(s))          # strip URLs
    s = re.sub(r"<.*?>", " ", s)             # strip simple HTML tags
    s = re.sub(r"[^A-Za-z\s]+", " ", s)      # keep letters + spaces
    toks = [t.lower() for t in s.split()]
    toks = [t for t in toks if t not in STOPWORDS_FOR_TEST]
    return any(len(t) >= 2 for t in toks)

def _assert_non_empty_when_input_had_letters(raw: pd.Series, cleaned: pd.Series, name: str):
    """Assert that meaningful raw rows did not become empty after cleaning.

    A row is meaningful when ``_raw_has_meaningful_letters`` returns True for
    its raw value (missing values are treated as ""). Raises AssertionError
    naming ``name`` and quoting the first raw value whose cleaned form is "".
    """
    # Only rows that truly have content beyond the stopwords must remain non-empty.
    meaningful = raw.fillna("").map(_raw_has_meaningful_letters)
    empties = cleaned == ""
    bad = meaningful & empties
    # No whitespace after "!r": f-string parsers before Python 3.12 reject it.
    assert not bad.any(), (
        f"{name}: Became empty despite meaningful alphabetic input. "
        f"Example raw: {_example(raw, bad)!r}"
    )

def _length_sanity(raw: pd.Series, cleaned: pd.Series, name: str, min_non_empty_ratio=0.8):
    """Sanity-check cleaned lengths against raw lengths on meaningful rows.

    Restricted to rows where ``_raw_has_meaningful_letters`` is True, asserts
    that at least half of the cleaned values are no longer than their raw
    values, and that the share of non-empty cleaned values is at least
    ``min_non_empty_ratio``. Returns without checking when no row qualifies.
    """
    keep = raw.fillna("").map(_raw_has_meaningful_letters)
    raw_lengths = raw[keep].fillna("").str.len()
    clean_lengths = cleaned[keep].fillna("").str.len()
    if len(raw_lengths) == 0:
        return

    share_not_longer = (clean_lengths <= raw_lengths).mean()
    assert share_not_longer >= 0.5, f"{name}: Too many cleaned rows longer than original (meaningful subset)."

    ratio_nonempty = (clean_lengths > 0).mean()
    assert ratio_nonempty >= min_non_empty_ratio, (
        f"{name}: Too many empty cleaned rows among meaningful inputs "
        f"(ratio_nonempty={ratio_nonempty:.2f})."
    )

# --- Run checks on both columns ---
# Validate each cleaned column against its raw source column. The checks run
# in a fixed order and the first failing assertion aborts the cell.
for raw_col, clean_col in [("title", "title_clean"), ("text", "text_clean")]:
    # Both columns must be present before anything else is checked.
    assert raw_col in df.columns,   f"Missing original column: {raw_col}"
    assert clean_col in df.columns, f"Missing cleaned column: {clean_col}"

    s_raw = df[raw_col]
    s_cln = df[clean_col]

    # Structural guarantees on the cleaned column alone.
    _assert_series_exists_and_string(s_cln, clean_col)
    _assert_no_urls(s_cln, clean_col)
    _assert_lowercase_only(s_cln, clean_col)
    _assert_alpha_space_only(s_cln, clean_col)
    _assert_no_extra_whitespace(s_cln, clean_col)
    _assert_no_nans_and_strings(s_cln, clean_col)

    # Authoritative: cleaned equals ONE single pass over raw
    # (this re-runs apply_preprocessing over the whole raw column).
    _assert_matches_single_pass(s_raw, s_cln, clean_col, apply_preprocessing)

    # Heuristic-based expectations (aligned with the pipeline's stopwords)
    _assert_non_empty_when_input_had_letters(s_raw, s_cln, clean_col)
    _length_sanity(s_raw, s_cln, clean_col, min_non_empty_ratio=0.8)

# --- Brief summary ---
def _summary_block(name: str, raw: pd.Series, clean: pd.Series):
    rl = raw.fillna("").str.len()
    cl = clean.fillna("").str.len()
    print(f"\n[{name}] rows={len(raw)}")
    print(f"  Non-empty (raw/clean): {(rl>0).mean():.2%} / {(cl>0).mean():.2%}")
    print(f"  Median length (raw/clean): {int(rl.median() if len(rl) else 0)} / {int(cl.median() if len(cl) else 0)}")
    print(f"  <= length preserved ratio: {(cl<=rl).mean():.2%}")

# Print the length summary for each raw/cleaned column pair.
_summary_block("TITLE", df["title"], df["title_clean"])
_summary_block("TEXT",  df["text"],  df["text_clean"])

# Reached only when every assertion in the check loop above has passed.
print("\n All dataframe preprocessing checks passed (single-pass contract; stopword-aware meaningfulness).")
