Fake News Detector model

In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [14]:
data = pd.read_csv(r"D:\Studies\Ai Projects\1 Fake News Detector\fake_or_real_news.csv")

In [15]:
data['fake'] = data['label'].apply(lambda x: 0 if x == 'REAL' else 1)

In [16]:
data = data.drop('label', axis=1)

In [17]:
x, y = data['text'], data['fake']

In [18]:
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [19]:
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

In [20]:
x_train , x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [27]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [28]:
clf = LinearSVC()
clf.fit(x_train_vectorized, y_train)

In [29]:
clf.score(x_test_vectorized, y_test)

0.9392265193370166

## test the model

### trial 1

In [31]:
# ─── New Cell: Simple Fake/Real Tester ───────────────────────────────────────

# Define the tester using your existing names:
def predict_fake_or_real(text: str):
    """
    Vectorizes a single string and returns:
      - label: 'REAL' or 'FAKE'
      - score: distance from the SVM hyperplane (larger magnitude = more confident)
    """
    X = vectorizer.transform([text])
    pred = clf.predict(X)[0]
    label = 'FAKE' if pred == 1 else 'REAL'
    score = None
    if hasattr(clf, 'decision_function'):
        score = clf.decision_function(X)[0]
    return label, score

# Sample “obviously fake” news article
sample_article = """
BREAKING: Scientists Uncover Evidence That the Moon Is Actually Hollow

In a groundbreaking discovery announced today, a coalition of top astrophysicists revealed data suggesting the Moon is mostly hollow. According to Dr. Helena Marsden, lead researcher at the International Space Observatory, seismic readings from the Apollo missions indicate vast empty chambers beneath the lunar surface. “We’ve never seen anything like it,” Marsden said. “The Moon’s internal structure defies all current geological models.”

The team speculates these chambers were carved out by an ancient alien civilization, and that the Moon is in fact a giant observatory left behind to watch over Earth. Conspiracy theorists are already calling this “the proof” of extraterrestrial involvement in human history. The U.S. and Russia have both denied any prior knowledge of such findings, but are reportedly sending new missions to investigate further.
"""

# Run the test
label, score = predict_fake_or_real(sample_article)
print(f"Prediction: {label}")
if score is not None:
    print(f"Confidence (distance from boundary): {score:.2f}")

Prediction: FAKE
Confidence (distance from boundary): 1.09


In [32]:
sample_article = """
U.S. AND CHINA AGREE TO BOOST CLIMATE ACTION

Washington, May 10 (Reuters) – The United States and China announced on Friday a joint initiative to curb greenhouse gas emissions, marking a significant step forward in global climate talks. Under the pact, both countries will increase their commitments under the Paris Agreement, with the U.S. aiming to cut emissions by 50% below 2005 levels by 2030 and China targeting carbon neutrality by 2060.

Speaking at the White House, President Jane Doe said, “This partnership with China demonstrates that even the world’s largest economies can come together to tackle the climate crisis.” Chinese Premier Li Wei, joining via video link, added that “cooperation is the only path to safeguarding our planet for future generations.”

The agreement covers clean energy technology sharing, joint research into carbon capture and storage, and funding for sustainable development in vulnerable nations. Negotiators said they hope this U.S.-China framework will encourage other major emitters to raise their ambitions at the upcoming COP31 summit in Glasgow.
"""

label, score = predict_fake_or_real(sample_article)
print(f"Prediction: {label}")
if score is not None:
    print(f"Confidence (distance from boundary): {score:.2f}")


Prediction: FAKE
Confidence (distance from boundary): 0.02


### Trial 2

In [36]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix

# 1) Calibrate your existing clf to get predict_proba
#    Note: use `estimator=` instead of `base_estimator`
clf = CalibratedClassifierCV(estimator=clf, cv=5)
clf.fit(x_train_vectorized, y_train)
print("Classifier recalibrated for probability outputs.\n")

Classifier recalibrated for probability outputs.



In [37]:
# 2) New tester that returns probabilities
def predict_fake_or_real(text: str, threshold: float = 0.5):
    """
    Returns:
      - label: 'REAL' or 'FAKE'
      - proba: array [P(REAL), P(FAKE)]
    """
    X = vectorizer.transform([text])
    proba = clf.predict_proba(X)[0]
    label = 'FAKE' if proba[1] >= threshold else 'REAL'
    return label, proba

In [38]:
# 3) Test on your sample fake article
sample_article = """
BREAKING: Scientists Uncover Evidence That the Moon Is Actually Hollow

In a groundbreaking discovery announced today, a coalition of top astrophysicists revealed data suggesting the Moon is mostly hollow. According to Dr. Helena Marsden, lead researcher at the International Space Observatory, seismic readings from the Apollo missions indicate vast empty chambers beneath the lunar surface. “We’ve never seen anything like it,” Marsden said. “The Moon’s internal structure defies all current geological models.”

The team speculates these chambers were carved out by an ancient alien civilization, and that the Moon is in fact a giant observatory left behind to watch over Earth. Conspiracy theorists are already calling this “the proof” of extraterrestrial involvement in human history. The U.S. and Russia have both denied any prior knowledge of such findings, but are reportedly sending new missions to investigate further.
"""
label, proba = predict_fake_or_real(sample_article)
print(f"Sample Fake Article → Prediction: {label}")
print(f"P(REAL) = {proba[0]:.2f}, P(FAKE) = {proba[1]:.2f}\n")

Sample Fake Article → Prediction: FAKE
P(REAL) = 0.01, P(FAKE) = 0.99



In [39]:
# 4) Evaluate on your held-out test set
y_pred = clf.predict(x_test_vectorized)
print("Test‐set Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=['REAL','FAKE']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test‐set Classification Report:

              precision    recall  f1-score   support

        REAL       0.95      0.93      0.94       648
        FAKE       0.93      0.95      0.94       619

    accuracy                           0.94      1267
   macro avg       0.94      0.94      0.94      1267
weighted avg       0.94      0.94      0.94      1267

Confusion Matrix:
 [[604  44]
 [ 29 590]]


In [44]:
# ─── Replace sample_article with this REAL news snippet ───────────────────────

sample_article = """
EUROPEAN CENTRAL BANK KEEPS RATES STEADY AMID SLOWER INFLATION

Frankfurt, May 8 (Reuters) – The European Central Bank (ECB) on Thursday held its key interest rates unchanged, citing signs that inflation in the euro zone is moderating. In a statement following its two-day policy meeting, the bank said it expects consumer price growth to dip to around 2.5% by the end of the year, down from a peak of 8.1% in October 2023.

ECB President Christine Lagarde said the decision “strikes the right balance between supporting the recovery and ensuring price stability.” She added that the bank stands ready to adjust its stance should inflationary pressures re-emerge. The ECB maintained its deposit rate at 4.00% and its main refinancing operations rate at 4.50%, where they have stood since March.

Markets had largely anticipated the hold, with analysts pointing to easing energy costs and subdued wage growth as factors bringing inflation closer to the ECB’s 2% target. Investors will now look ahead to U.S. inflation data due later this week for further cues on global monetary policy direction.
"""

# ─── Then re-run your calibrated tester cell ─────────────────────────────────

label, proba = predict_fake_or_real(sample_article)
print(f"Prediction: {label}")
print(f"P(REAL) = {proba[0]:.2f}, P(FAKE) = {proba[1]:.2f}")


Prediction: FAKE
P(REAL) = 0.34, P(FAKE) = 0.66


### Trial 3

In [42]:
# ─── Full Cell: Bigram + LogisticRegression + Calibration ───────────────────

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix

# --- 1) Rebuild vectorizer with bigrams and retrain
vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=0.7,
                             ngram_range=(1, 2))  # unigrams + bigrams

# Fit on train, transform both
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized  = vectorizer.transform(x_test)

# --- 2) Train calibrated LogisticRegression
base_clf = LogisticRegression(max_iter=1000)
clf = CalibratedClassifierCV(estimator=base_clf, cv=5)
clf.fit(x_train_vectorized, y_train)

print("✅ Trained LogisticRegression + Calibration on bigram TF-IDF\n")

# --- 3) Define helper
def predict_fake_or_real(text: str, threshold: float = 0.5):
    """
    Returns:
      - label: 'REAL' or 'FAKE'
      - proba: array [P(REAL), P(FAKE)]
    """
    X = vectorizer.transform([text])
    proba = clf.predict_proba(X)[0]
    label = 'FAKE' if proba[1] >= threshold else 'REAL'
    return label, proba

# --- 4) Test on your REAL ECB article
sample_article = """
EUROPEAN CENTRAL BANK KEEPS RATES STEADY AMID SLOWER INFLATION

Frankfurt, May 8 (Reuters) – The European Central Bank (ECB) on Thursday held its key interest rates unchanged, citing signs that inflation in the euro zone is moderating. In a statement following its two-day policy meeting, the bank said it expects consumer price growth to dip to around 2.5% by the end of the year, down from a peak of 8.1% in October 2023.

ECB President Christine Lagarde said the decision “strikes the right balance between supporting the recovery and ensuring price stability.” She added that the bank stands ready to adjust its stance should inflationary pressures re-emerge. The ECB maintained its deposit rate at 4.00% and its main refinancing operations rate at 4.50%, where they have stood since March.

Markets had largely anticipated the hold, with analysts pointing to easing energy costs and subdued wage growth as factors bringing inflation closer to the ECB’s 2% target. Investors will now look ahead to U.S. inflation data due later this week for further cues on global monetary policy direction.
"""

label, proba = predict_fake_or_real(sample_article)
print(f"Sample Real Article → Prediction: {label}")
print(f"P(REAL) = {proba[0]:.2f}, P(FAKE) = {proba[1]:.2f}\n")

# --- 5) Quick eval on your test set
y_pred = clf.predict(x_test_vectorized)
print("Test‐set Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=['REAL','FAKE']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Trained LogisticRegression + Calibration on bigram TF-IDF

Sample Real Article → Prediction: FAKE
P(REAL) = 0.34, P(FAKE) = 0.66

Test‐set Classification Report:

              precision    recall  f1-score   support

        REAL       0.89      0.93      0.91       648
        FAKE       0.92      0.89      0.90       619

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267

Confusion Matrix:
 [[601  47]
 [ 71 548]]
