In [None]:
# Import and load dataset

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------
# Load fake and true datasets safely
# ------------------------------
# engine="python" → handles messy CSVs better
# quoting=3 → ignores broken quotes
# on_bad_lines="skip" → skips problematic rows

fake_df = pd.read_csv("/content/Fake.csv", engine="python", quoting=3, on_bad_lines="skip")
true_df = pd.read_csv("/content/True.csv", engine="python", quoting=3, on_bad_lines="skip")

print("Fake.csv shape:", fake_df.shape)
print("True.csv shape:", true_df.shape)

# ------------------------------
# Add labels (1 = Fake, 0 = Real)
# ------------------------------
fake_df["label"] = 1
true_df["label"] = 0

# ------------------------------
# Combine both datasets
# ------------------------------
df = pd.concat([fake_df, true_df], axis=0).reset_index(drop=True)

# ------------------------------
# Combine title + text into one field
# ------------------------------
df["content"] = df["title"].astype(str) + " " + df["text"].astype(str)

print("Combined dataset shape:", df.shape)
print(df.head())



Fake.csv shape: (16448, 4)
True.csv shape: (14987, 4)
Combined dataset shape: (31435, 6)
                                               title    text       subject  \
0   you were wrong! 70-year-old men don t change ...    News  "December 31   
1                                               None    None          None   
2                                       "December 30   2017"          None   
3                                               None    None          None   
4                                               None    None          None   

     date  label                                            content  
0   2017"      1   you were wrong! 70-year-old men don t change ...  
1    None      1                                          None None  
2    None      1                                "December 30  2017"  
3    None      1                                          None None  
4    None      1                                          None None  


In [None]:
# Preprocessing

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Apply cleaning
df["content"] = df["content"].apply(clean_text)

# Features and Labels
X = df["content"]
y = df["label"]


In [None]:
# -------------------------------
# Train/Test Split and vectorization
# -------------------------------
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -------------------------------
# Find first 5 non-empty vectors
# -------------------------------
valid_indices = []
for idx, text in enumerate(X_train):
    if isinstance(text, str) and text.strip() != "" and text.strip().lower() != "none none":
        valid_indices.append(idx)
    if len(valid_indices) == 5:
        break

# Extract their vectors
first_five_vectors = X_train_tfidf[valid_indices].toarray()
print(first_five_vectors)


[[0.         0.         0.         ... 0.         0.         0.31215864]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Train model and Evaluate

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", "0.812000008876"))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.5400031811674885

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.01      0.02      2914
           1       0.54      1.00      0.70      3373

    accuracy                           0.54      6287
   macro avg       0.72      0.50      0.36      6287
weighted avg       0.70      0.54      0.38      6287


Confusion Matrix:
 [[  25 2889]
 [   3 3370]]


In [None]:
# Checking random articles from dataset

# Pick 5 random articles
sample_df = df.sample(5, random_state=42)

# Clean and transform text
sample_clean = [clean_text(x) for x in sample_df["content"].tolist()]
sample_tfidf = vectorizer.transform(sample_clean)

# Predictions
preds = model.predict(sample_tfidf)
probas = model.predict_proba(sample_tfidf)

# Display results
for idx, row in enumerate(sample_df.itertuples()):
    print("\n--- Article", idx+1, "---")
    print("Actual Label:", "FAKE" if row.label==1 else "REAL")
    print("Prediction  :", "FAKE" if preds[idx]==1 else "REAL")
    print("Confidence  :", np.max(probas[idx]))
    print("Text Snippet:", row.content[:200], "...")




--- Article 1 ---
Actual Label: REAL
Prediction  : FAKE
Confidence  : 0.5185361785101138
Text Snippet: none none ...

--- Article 2 ---
Actual Label: FAKE
Prediction  : FAKE
Confidence  : 0.5185361785101138
Text Snippet: none none ...

--- Article 3 ---
Actual Label: FAKE
Prediction  : FAKE
Confidence  : 0.5185361785101138
Text Snippet: none none ...

--- Article 4 ---
Actual Label: FAKE
Prediction  : FAKE
Confidence  : 0.5185361785101138
Text Snippet: none none ...

--- Article 5 ---
Actual Label: REAL
Prediction  : FAKE
Confidence  : 0.5185361785101138
Text Snippet: none none ...


In [None]:
# Testing Articles A & B

article_A = """A new study from the World Health Organization (WHO) confirms that regular physical activity
reduces the risk of heart disease by up to 30%. The study involved 15,000 participants across 10 countries
and was published in the International Journal of Cardiology."""

article_B = """Experts have revealed shocking news exercising might actually be dangerous!
A viral online post claims that too much exercise could increase heart problems.
Thousands of people are now rethinking their daily fitness routines."""

# Clean and transform
test_articles = [article_A, article_B]
test_clean = [clean_text(a) for a in test_articles]

test_tfidf = vectorizer.transform(test_clean)
predictions = model.predict(test_tfidf)
proba = model.predict_proba(test_tfidf)

# Display results
for i, article in enumerate(test_articles):
    print("\nArticle:", "A" if i==0 else "B")
    print("Prediction:", "FAKE" if predictions[i]==1 else "REAL")
    print("Confidence:", np.max(proba[i]))



Article: A
Prediction: FAKE
Confidence: 0.5249671120730156

Article: B
Prediction: FAKE
Confidence: 0.7997118093557679


In [None]:
# Checking random article from internet

def check_article(article_text):
    # Clean the text
    clean = clean_text(article_text)
    # Vectorize
    tfidf = vectorizer.transform([clean])
    # Predict
    pred = model.predict(tfidf)[0]
    proba = model.predict_proba(tfidf)[0]

    print("\n--- Article Check ---")
    print("Prediction:", "FAKE" if pred==1 else "REAL")
    print("Confidence (Fake):", proba[1])
    print("Confidence (Real):", proba[0])

article_test = """The United States Senate has passed a bipartisan spending bill aimed at avoiding a federal government shutdown.
The measure, which received support from both Democrats and Republicans, will fund federal agencies through the end of the fiscal year.
President Biden is expected to sign the bill into law this week, ensuring that essential services and federal employees will not face disruptions.
The bill also includes additional funding for border security and disaster relief programs.
While some lawmakers expressed concerns over rising national debt, the majority agreed that preventing a shutdown was the top priority."""


check_article(article_test)



--- Article Check ---
Prediction: FAKE
Confidence (Fake): 0.5794335180720954
Confidence (Real): 0.4205664819279046
