In [None]:
import html
import os
import platform
import re
import socket
import string
import subprocess
import sys
import threading
import time
import webbrowser

import numpy as np
import pandas as pd
from flask import Flask, request, render_template_string, make_response
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Startup banner: the hard-coded version tag makes it easy to confirm, from the
# console / notebook output, which revision of this script is actually running.
print("Running the updated fake news detector script (version 2025-03-24 v2)")

# Flask application object; the `index` route is registered on it and
# `run_flask` serves it.
app = Flask(__name__)

# Function to find an available port
def find_available_port(start_port=5000):
    """Return the first TCP port at or above ``start_port`` that can be bound.

    Each candidate is probed by binding a throwaway socket on all interfaces
    ('0.0.0.0'); the socket is closed again before returning, so the result is
    only a hint -- another process may still grab the port before the caller
    binds it.

    Args:
        start_port: First port number to try.

    Returns:
        The lowest port in ``[start_port, 65535]`` for which ``bind`` succeeded.

    Raises:
        OSError: If no port in that range could be bound.
    """
    # Stop at the last valid TCP port instead of counting upwards forever
    # (binding a port above 65535 raises OverflowError, not OSError).
    for port in range(start_port, 65536):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(('0.0.0.0', port))
            except OSError:
                continue  # port is in use or not permitted; try the next one
            return port
    raise OSError(f"No free TCP port found in range {start_port}-65535")

# Load dataset and subsample for faster processing.
# Only the 'text' column of each CSV is read; a missing file aborts the script
# with a hint about the expected working directory.
try:
    fake_news = pd.read_csv("Fake-1.csv", usecols=['text'])
    true_news = pd.read_csv("True-1.csv", usecols=['text'])
except FileNotFoundError:
    print("Error: Could not find the dataset files. Please ensure 'Fake-1.csv' and 'True-1.csv' are in the current directory.")
    print(f"Current directory: {os.getcwd()}")
    sys.exit(1)

# Label convention used throughout the script: 0 = fake, 1 = real
fake_news['label'] = 0
true_news['label'] = 1
# ignore_index gives the combined frame unique row labels (both CSVs start at 0)
df = pd.concat([fake_news, true_news], ignore_index=True)

# Subsample the dataset to (at most) 5,000 articles for faster processing.
# Capping at len(df) keeps smaller datasets from raising ValueError in sample().
df = df.sample(n=min(5000, len(df)), random_state=42)

# Data preprocessing
# One pattern for everything clean_text strips: any ASCII punctuation character
# or any digit. re.escape is required here -- unescaped, the `\]` inside
# string.punctuation is read as an escaped `]`, so backslashes were never removed.
_PUNCT_OR_DIGIT_RE = re.compile(rf"[{re.escape(string.punctuation)}\d]+")


def clean_text(text):
    """Lower-case ``text`` and strip ASCII punctuation and digits.

    Whitespace is left untouched, so removing a token such as "123" can leave
    adjacent or trailing spaces behind.

    Args:
        text: The raw string to normalise. Non-string values (e.g. ``None`` or
            the NaN pandas uses for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return _PUNCT_OR_DIGIT_RE.sub("", text.lower())

# Normalise every article body before it reaches the vectorizers
df["text"] = df["text"].apply(clean_text)

# Splitting data: 20% of the sample is held out for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)


# --- TF-IDF-based Models ---
def _tfidf_pipeline(classifier):
    """Return a Pipeline of a fresh TF-IDF vectorizer ('tfidf') feeding ``classifier`` ('model')."""
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=5000)
    return Pipeline([('tfidf', vectorizer), ('model', classifier)])


def _report(heading, fitted_pipeline):
    """Print ``heading``, accuracy and the classification report on the test split; return the predictions."""
    print(heading)
    predicted = fitted_pipeline.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predicted))
    print("Classification Report:\n", classification_report(y_test, predicted))
    return predicted


nb_pipeline = _tfidf_pipeline(MultinomialNB())
rf_pipeline = _tfidf_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)
svm_pipeline = _tfidf_pipeline(LinearSVC())

# Train all three models first, then evaluate them in the same order
for _unfitted in (nb_pipeline, rf_pipeline, svm_pipeline):
    _unfitted.fit(X_train, y_train)

nb_y_pred = _report("Naive Bayes Model:", nb_pipeline)
rf_y_pred = _report("\nRandom Forest Model:", rf_pipeline)
svm_y_pred = _report("\nSVM Model:", svm_pipeline)

# Dictionary to store all models, keyed by the names offered in the web form
models = {
    "Naive Bayes": nb_pipeline,
    "Random Forest": rf_pipeline,
    "SVM": svm_pipeline,
}

# Helpers plus the entry point that classifies one article and highlights its
# most fake-leaning words in red
def _class_probabilities(pipeline, model, cleaned_text):
    """Return ``(p_fake, p_real)`` in [0, 1] for one cleaned article.

    Models that implement ``predict_proba`` are used directly. Otherwise
    (LinearSVC) the signed decision margin is squashed through a logistic
    function so the displayed split at least agrees with the predicted label.
    That fallback is an uncalibrated confidence score, not a true probability.
    """
    if hasattr(model, "predict_proba"):
        p_fake, p_real = pipeline.predict_proba([cleaned_text])[0]
        return p_fake, p_real
    # Positive margin favours classes_[1], i.e. label 1 = Real.
    margin = float(pipeline.decision_function([cleaned_text])[0])
    margin = max(-30.0, min(30.0, margin))  # keep np.exp well inside float range
    p_real = float(1.0 / (1.0 + np.exp(-margin)))
    return 1.0 - p_real, p_real


def _fake_direction_weights(model):
    """Return one weight per TF-IDF feature; larger means "more indicative of Fake" (label 0)."""
    if hasattr(model, "feature_log_prob_"):
        # Naive Bayes: log-odds of the term under Fake (row 0) versus Real (row 1).
        # The raw log-probability of one class is always negative and says
        # nothing about which class a word favours.
        return model.feature_log_prob_[0] - model.feature_log_prob_[1]
    if hasattr(model, "coef_"):
        # Linear model: in the binary case coef_ scores the positive class
        # (label 1 = Real), so the sign is flipped to point towards Fake.
        coef = model.coef_[0] if model.coef_.ndim > 1 else model.coef_
        return -coef
    # Tree ensembles only expose unsigned importances: these rank how
    # influential a word is, not which class it pushes towards.
    return model.feature_importances_


def predict_news(news_article, model_choice):
    """Classify ``news_article`` with the chosen model and build an HTML report.

    Args:
        news_article: Raw article text (untrusted user input).
        model_choice: Key into the module-level ``models`` dict
            ("Naive Bayes", "Random Forest" or "SVM").

    Returns:
        An HTML fragment (str) with the predicted label, the fake/real
        percentages, and the article with up to five of its most
        fake-leaning words wrapped in a red ``<span>``. All article text in
        the fragment is HTML-escaped, so it is safe to render unescaped.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``models``.
    """
    cleaned_text = clean_text(news_article)

    pipeline = models[model_choice]
    tfidf = pipeline.named_steps['tfidf']
    model = pipeline.named_steps['model']

    prediction = pipeline.predict([cleaned_text])[0]
    label = "Real" if prediction == 1 else "Fake"
    p_fake, p_real = _class_probabilities(pipeline, model, cleaned_text)
    prob_fake = p_fake * 100
    prob_real = p_real * 100

    # Per-word contribution = TF-IDF weight of the word in this article times
    # the model's fake-direction weight for that word.
    feature_names = tfidf.get_feature_names_out()
    tfidf_scores = tfidf.transform([cleaned_text]).toarray()[0]
    weights = _fake_direction_weights(model)
    word_contributions = {
        feature_names[idx]: tfidf_scores[idx] * weights[idx]
        for idx in np.flatnonzero(tfidf_scores > 0)
    }

    # Keep the five strongest words, and only those that actually lean Fake.
    top_fake_words = sorted(word_contributions.items(), key=lambda x: x[1], reverse=True)[:5]
    fake_words = {word for word, contribution in top_fake_words if contribution > 0}

    highlighted_article = []
    for word in news_article.split():
        # Escape every token: the fragment is rendered with Jinja's `safe`
        # filter, so raw user text here would be an XSS hole.
        safe_word = html.escape(word)
        if clean_text(word) in fake_words:
            highlighted_article.append(f'<span style="color: #e63946;">{safe_word}</span>')
        else:
            highlighted_article.append(safe_word)
    highlighted_text = " ".join(highlighted_article)

    return "".join([
        f"<p><strong>Prediction:</strong> {label}</p>",
        f"<p><strong>Probability of being Fake:</strong> {prob_fake:.2f}%</p>",
        f"<p><strong>Probability of being Real:</strong> {prob_real:.2f}%</p>",
        "<p><strong>Highlighted Article (red parts indicate potential fake content):</strong></p>",
        f"<p>{highlighted_text}</p>",
    ])

# Jinja2 template for the single-page UI, rendered by `index` through
# render_template_string. Context variables it reads:
#   news_article - echoed back into the textarea
#   model_choice - pre-selects the matching <option>
#   output       - HTML fragment inserted with the `safe` filter, i.e. it is
#                  NOT auto-escaped; whatever builds it must escape user text.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate">
    <meta http-equiv="Pragma" content="no-cache">
    <meta http-equiv="Expires" content="0">
    <title>Fake News Detector</title>
    <link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@500;700&family=Playfair+Display:wght@400;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Playfair Display', serif;
            margin: 0;
            padding: 0;
            background: linear-gradient(135deg, #1a2a44 0%, #2e3b55 100%);
            color: #f5f5f5;
            min-height: 100vh;
            display: flex;
            justify-content: center;
            align-items: center;
        }
        .container {
            max-width: 900px;
            width: 90%;
            background: #ffffff;
            padding: 40px;
            border-radius: 12px;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.3);
            margin: 40px 20px;
            border: 1px solid #d4af37;
        }
        h1 {
            font-family: 'Montserrat', sans-serif;
            color: #d4af37;
            text-align: center;
            font-size: 2.5em;
            margin-bottom: 20px;
            letter-spacing: 1px;
        }
        p {
            font-size: 1.1em;
            line-height: 1.8;
            color: #666;
            text-align: center;
            margin-bottom: 30px;
        }
        label {
            font-family: 'Montserrat', sans-serif;
            font-weight: 500;
            margin-top: 20px;
            display: block;
            color: #1a2a44;
            font-size: 1.1em;
            letter-spacing: 0.5px;
        }
        textarea, select {
            width: 100%;
            padding: 12px;
            margin-top: 10px;
            margin-bottom: 20px;
            border: 1px solid #d4af37;
            border-radius: 8px;
            font-family: 'Playfair Display', serif;
            font-size: 1em;
            background-color: #f9f9f9;
            color: #333;
            transition: border-color 0.3s ease, box-shadow 0.3s ease;
        }
        textarea:focus, select:focus {
            outline: none;
            border-color: #d4af37;
            box-shadow: 0 0 8px rgba(212, 175, 55, 0.3);
        }
        button {
            background: linear-gradient(135deg, #d4af37 0%, #b8972e 100%);
            color: #1a2a44;
            padding: 12px 30px;
            border: none;
            border-radius: 8px;
            font-family: 'Montserrat', sans-serif;
            font-size: 1.1em;
            font-weight: 700;
            cursor: pointer;
            display: block;
            margin: 20px auto;
            transition: background 0.3s ease, transform 0.2s ease;
        }
        button:hover {
            background: linear-gradient(135deg, #b8972e 0%, #d4af37 100%);
            transform: translateY(-2px);
        }
        .output {
            margin-top: 30px;
            padding: 20px;
            border: 1px solid #d4af37;
            border-radius: 8px;
            background-color: #f9f9f9;
            box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
            font-size: 1.1em;
            line-height: 1.8;
            color: #333;
        }
        .output p {
            margin: 10px 0;
            text-align: left;
        }
        .output strong {
            color: #1a2a44;
            font-family: 'Montserrat', sans-serif;
            font-weight: 700;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Fake News Detector</h1>
        <p>Analyze the authenticity of news articles with precision. Select a model and identify potential fake content with highlighted insights.</p>
        <form method="POST" action="/">
            <label for="news_article">Enter News Article:</label>
            <textarea id="news_article" name="news_article" rows="6" placeholder="Paste your article here...">{{ news_article if news_article else '' }}</textarea>

            <label for="model_choice">Select Model:</label>
            <select id="model_choice" name="model_choice">
                <option value="Naive Bayes" {% if model_choice == "Naive Bayes" %}selected{% endif %}>Naive Bayes</option>
                <option value="Random Forest" {% if model_choice == "Random Forest" %}selected{% endif %}>Random Forest</option>
                <option value="SVM" {% if model_choice == "SVM" %}selected{% endif %}>SVM</option>
            </select>

            <button type="submit">Analyze</button>
        </form>

        {% if output %}
        <div class="output">
            {{ output | safe }}
        </div>
        {% endif %}
    </div>
</body>
</html>
"""

# Flask routes
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the detector page and, on POST, the analysis of the submitted article.

    GET renders the empty form. POST reads ``news_article`` and
    ``model_choice`` from the form, runs ``predict_news`` when the article
    contains any non-whitespace text, and re-renders the form with the result.

    Returns:
        A Flask response carrying the rendered page with caching disabled.
    """
    news_article = None
    model_choice = "Naive Bayes"
    output = None

    if request.method == 'POST':
        news_article = request.form.get('news_article', '')
        requested_model = request.form.get('model_choice', 'Naive Bayes')
        # The field is client-controlled: an unknown value would otherwise
        # raise KeyError inside predict_news and surface as a 500. Fall back
        # to the default model instead.
        if requested_model in models:
            model_choice = requested_model
        # Skip blank / whitespace-only submissions; there is nothing to classify.
        if news_article.strip():
            output = predict_news(news_article, model_choice)

    page = render_template_string(
        HTML_TEMPLATE,
        news_article=news_article,
        model_choice=model_choice,
        output=output,
    )
    response = make_response(page)
    # Results depend on the posted form, so tell browsers/proxies not to cache.
    response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    response.headers['Pragma'] = 'no-cache'
    response.headers['Expires'] = '0'
    return response

# Function to run Flask server in a thread and open the browser
def _open_browser(url):
    """Open ``url`` in Safari on macOS, or in the default browser elsewhere.

    Sets the module-level ``browser_opened`` flag to True once a browser has
    been asked to open the page.
    """
    system = platform.system()
    print(f"Detected system: {system}")
    if system == "Darwin":  # macOS
        print("Attempting to open Safari on macOS...")
        try:
            subprocess.run(["open", "-a", "Safari", url], check=True)
            print("Successfully opened Safari.")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing/unlaunchable `open` binary, which would
            # otherwise bypass the fallback entirely.
            print(f"Failed to open Safari with subprocess: {e}. Falling back to webbrowser...")
            webbrowser.open_new_tab(url)
    else:
        print("Opening default web browser...")
        webbrowser.open_new_tab(url)
    globals()['browser_opened'] = True


def run_flask():
    """Start the Flask server on a free port and open the page in a browser.

    Blocks for as long as the server runs, so it is meant to be the target of
    a background thread. The browser is launched from a timer that fires two
    seconds *after* the server has been started, so the page is requested from
    a server that is already listening; a failure to open the browser never
    prevents the server from running.
    """
    port = find_available_port(start_port=5000)
    url = f"http://127.0.0.1:{port}"
    # The throwaway query parameter defeats any cached copy of the page.
    full_url = f"{url}?nocache={int(time.time())}"
    print(f"Starting Flask server on {full_url}...")

    # Clear any flag left over from a previous run (e.g. a re-executed cell).
    globals().pop('browser_opened', None)

    opener = threading.Timer(2, _open_browser, args=(full_url,))
    opener.daemon = True
    try:
        opener.start()
        # NOTE(review): host='0.0.0.0' exposes the app on every network
        # interface; use '127.0.0.1' if only local access is intended.
        app.run(host='0.0.0.0', port=port, debug=False, use_reloader=False)
    except Exception as e:
        print(f"Failed to start Flask server: {e}")
        globals().pop('browser_opened', None)  # Reset flag if server fails
    finally:
        # If the server never came up, do not open a browser on a dead URL.
        # (No-op once the timer has already fired.)
        opener.cancel()

# Start Flask in a separate thread so this cell/script returns immediately
# instead of blocking on the server loop. The thread is a daemon, so it does
# not keep the interpreter alive: in a long-lived session (such as a notebook
# kernel) the server keeps running, but in a plain script the process ends --
# and the server with it -- as soon as the main thread finishes.
threading.Thread(target=run_flask, daemon=True).start()

Running the updated fake news detector script (version 2025-03-24 v2)
Naive Bayes Model:
Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91       510
           1       0.91      0.91      0.91       490

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000


Random Forest Model:
Accuracy: 0.987
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       510
           1       0.98      0.99      0.99       490

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000


SVM Model:
Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       

Detected system: Darwin
Attempting to open Safari on macOS...
Successfully opened Safari.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://10.35.124.192:5001
Press CTRL+C to quit
127.0.0.1 - - [24/Mar/2025 21:29:21] "GET /?nocache=1742851757 HTTP/1.1" 200 -
127.0.0.1 - - [24/Mar/2025 21:29:21] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [24/Mar/2025 21:29:33] "POST / HTTP/1.1" 200 -


Naive Bayes - fake_log_probs shape: (5000,)


127.0.0.1 - - [24/Mar/2025 21:29:38] "POST / HTTP/1.1" 200 -


Random Forest - original coef shape: N/A, extracted coef shape: (5000,)


127.0.0.1 - - [24/Mar/2025 21:29:42] "POST / HTTP/1.1" 200 -


SVM - original coef shape: (1, 5000), extracted coef shape: (5000,)
