In [7]:
!pip install lime




In [8]:
# ============================================
# AI-Based Fake News & Misinformation Detector
# ============================================

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from lime.lime_text import LimeTextExplainer


# ============================================
# 1. Text Cleaning Module
# ============================================

class TextProcessor:
    """Normalise raw article text into lowercase, letters-only tokens."""

    # Patterns are compiled once and shared by every call to `clean`.
    _URL_PATTERN = re.compile(r"http\S+")
    _NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z ]")
    _WHITESPACE_PATTERN = re.compile(r"\s+")

    @staticmethod
    def clean(text):
        """Return a cleaned version of *text*.

        The value is converted with ``str()``, lowercased, stripped of
        anything starting with ``http`` up to the next whitespace, reduced
        to ASCII letters and spaces, and finally has runs of whitespace
        collapsed to a single space with the ends trimmed.
        """
        lowered = str(text).lower()
        without_urls = TextProcessor._URL_PATTERN.sub("", lowered)
        letters_only = TextProcessor._NON_LETTER_PATTERN.sub(" ", without_urls)
        collapsed = TextProcessor._WHITESPACE_PATTERN.sub(" ", letters_only)
        return collapsed.strip()


# ============================================
# 2. Fake News Detection Model
# ============================================

class FakeNewsSystem:

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=7000)
        self.model = LogisticRegression(max_iter=1000)
        self.explainer = LimeTextExplainer(class_names=["FAKE", "REAL"])
        self.processor = TextProcessor()


    # ----------------------------------------
    # Load Dataset (Robust Parsing)
    # ----------------------------------------
    def load_dataset(self, true_path, fake_path):

        real_df = pd.read_csv(true_path, engine="python", on_bad_lines="skip")
        fake_df = pd.read_csv(fake_path, engine="python", on_bad_lines="skip")

        real_df["label"] = 1
        fake_df["label"] = 0

        data = pd.concat([real_df, fake_df])
        data = data.sample(frac=1, random_state=42).reset_index(drop=True)

        data["text"] = data["text"].apply(self.processor.clean)

        return data


    # ----------------------------------------
    # Train Model
    # ----------------------------------------
    def train(self, dataset):

        X = dataset["text"]
        y = dataset["label"]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        X_train_vec = self.vectorizer.fit_transform(self.X_train)
        self.model.fit(X_train_vec, self.y_train)

        print("âœ… Model Training Completed Successfully")


    # ----------------------------------------
    # Evaluate Model
    # ----------------------------------------
    def evaluate(self):

        X_test_vec = self.vectorizer.transform(self.X_test)
        predictions = self.model.predict(X_test_vec)

        acc = accuracy_score(self.y_test, predictions)
        prec = precision_score(self.y_test, predictions)
        rec = recall_score(self.y_test, predictions)
        f1 = f1_score(self.y_test, predictions)

        print("\nðŸ“Š Model Evaluation Metrics")
        print("--------------------------------")
        print("Accuracy  :", round(acc, 4))
        print("Precision :", round(prec, 4))
        print("Recall    :", round(rec, 4))
        print("F1 Score  :", round(f1, 4))
        print("--------------------------------")


    # ----------------------------------------
    # Prediction
    # ----------------------------------------
    def predict(self, article):

        cleaned = self.processor.clean(article)
        vectorized = self.vectorizer.transform([cleaned])

        prediction = self.model.predict(vectorized)[0]
        probability = self.model.predict_proba(vectorized)[0]

        confidence = float(np.max(probability))
        label = "REAL" if prediction == 1 else "FAKE"

        return label, confidence


    # ----------------------------------------
    # LIME Explanation
    # ----------------------------------------
    def explain(self, article):

        def predict_proba(texts):
            vectors = self.vectorizer.transform(texts)
            return self.model.predict_proba(vectors)

        explanation = self.explainer.explain_instance(
            article,
            predict_proba,
            num_features=8
        )

        important_words = [word for word, weight in explanation.as_list()]
        return important_words


    # ----------------------------------------
    # Generate Explanation
    # ----------------------------------------
    def generate_reason(self, label, confidence):

        if label == "FAKE":
            return (
                f"The article is classified as FAKE with {confidence:.2f} confidence. "
                "Highlighted words suggest patterns commonly found in misleading or exaggerated content."
            )
        else:
            return (
                f"The article is classified as REAL with {confidence:.2f} confidence. "
                "The content structure and language resemble verified journalistic reporting."
            )


# ============================================
# 3. MAIN EXECUTION
# ============================================

if __name__ == "__main__":

    def _show_result(verdict, score, terms, rationale, heading=None):
        """Print one framed prediction report; *heading* adds a title row."""
        print("\n==========================================")
        if heading is not None:
            print(heading)
            print("==========================================")
        print("Prediction            :", verdict)
        print("Confidence Score      :", round(score * 100, 2), "%")
        print("Suspicious Phrases    :", terms)
        print("Generated Explanation :", rationale)
        print("==========================================")

    # Dataset locations: real articles first, fake articles second.
    TRUE_DATA_PATH = "/content/sample_data/True.csv"
    FAKE_DATA_PATH = "/content/sample_data/Fake.csv"

    # Build the pipeline: load -> train -> report held-out metrics.
    detector = FakeNewsSystem()
    dataset = detector.load_dataset(TRUE_DATA_PATH, FAKE_DATA_PATH)
    detector.train(dataset)
    detector.evaluate()

    # ----------------------------------------
    # Demo: classify the first held-out article
    # ----------------------------------------
    sample_article = detector.X_test.iloc[0]

    predicted_label, confidence_value = detector.predict(sample_article)
    suspicious_terms = detector.explain(sample_article)
    explanation_text = detector.generate_reason(predicted_label, confidence_value)

    _show_result(
        predicted_label,
        confidence_value,
        suspicious_terms,
        explanation_text,
        heading="      FAKE NEWS DETECTION RESULT",
    )

    # ----------------------------------------
    # Interactive: classify text pasted by the user
    # ----------------------------------------
    user_text = input("\nPaste a news article to verify:\n")

    label, conf = detector.predict(user_text)
    phrases = detector.explain(user_text)
    reasoning = detector.generate_reason(label, conf)

    _show_result(label, conf, phrases, reasoning)


✅ Model Training Completed Successfully

📊 Model Evaluation Metrics
--------------------------------
Accuracy  : 0.9869
Precision : 0.9834
Recall    : 0.9893
F1 Score  : 0.9864
--------------------------------

      FAKE NEWS DETECTION RESULT
Prediction            : REAL
Confidence Score      : 96.65 %
Suspicious Phrases    : [np.str_('said'), np.str_('reuters'), np.str_('minister'), np.str_('washington'), np.str_('that'), np.str_('in'), np.str_('on'), np.str_('of')]
Generated Explanation : The article is classified as REAL with 0.97 confidence. The content structure and language resemble verified journalistic reporting.

Paste a news article to verify:
The United Nations held a climate summit on Monday where world leaders discussed new strategies to reduce carbon emissions. Several countries pledged to achieve net-zero emissions by 2050, emphasizing renewable energy investments and stricter environmental regulations. Experts stated that international cooperation will be crucial 