<a href="https://colab.research.google.com/github/divyasri2609/Creditcard-fraud-detection/blob/main/fakenews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk scikit-learn --quiet

import pandas as pd
import numpy as np
import re
import nltk
import joblib
import zipfile
import os

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fetch the NLTK stop-word corpus so stopwords.words() below can read it.
nltk.download("stopwords")
# Held as a set because clean_text tests membership once per token.
stop_words = set(stopwords.words("english"))
# Shared stemmer instance reused by clean_text for every token.
stemmer = PorterStemmer()

# Colab-only import: files.upload() prompts for files in the notebook UI.
# The result is iterated by file name further down via uploaded.keys().
from google.colab import files
uploaded = files.upload()

# The two CSVs are expected in the working directory, either uploaded
# directly or unpacked from an uploaded zip archive. Defining the paths up
# front keeps them bound even when nothing was uploaded.
fake_csv_path = "Fake.csv"
true_csv_path = "True.csv"

# Unpack the first uploaded zip archive (if any) into the working directory.
for fn in uploaded.keys():
    if fn.endswith(".zip"):
        with zipfile.ZipFile(fn, 'r') as zip_ref:
            zip_ref.extractall()
        print(f"Extracted {fn}")
        break

# Fail early with a message naming the missing file(s) instead of letting
# pandas raise from deep inside read_csv.
missing_csvs = [
    path for path in (fake_csv_path, true_csv_path) if not os.path.exists(path)
]
if missing_csvs:
    raise FileNotFoundError(
        "Expected dataset file(s) not found: " + ", ".join(missing_csvs)
    )

fake_df = pd.read_csv(fake_csv_path)
true_df = pd.read_csv(true_csv_path)

# Label convention used by the rest of the notebook: 1 = fake, 0 = real.
fake_df["label"] = 1
true_df["label"] = 0

# Stack both sources into one frame with a fresh 0..n-1 index.
df = pd.concat([fake_df, true_df], axis=0).reset_index(drop=True)
print("Dataset shape:", df.shape)
display(df.head())
def clean_text(text):
    """Turn raw article text into a space-joined string of stemmed tokens.

    The input is coerced to ``str`` and lower-cased, URL-like runs are
    removed, every character other than ``a``-``z`` and whitespace is
    dropped, stop words are filtered out, and each remaining token is
    stemmed with the module-level ``stemmer``.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)

    stems = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return " ".join(stems)

# Pre-process every article body once; the cleaned strings are what the
# vectorizer and the models see.
df["clean_text"] = df["text"].apply(clean_text)

# Plain arrays: X holds the cleaned documents, y the 0/1 labels.
X = df["clean_text"].values
y = df["label"].values

# Hold out 30% for evaluation. The fixed seed makes the split repeatable and
# stratify=y is passed so both parts are split with respect to the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])

# Unigram + bigram TF-IDF capped at 20,000 features. The vectorizer is fitted
# on the training documents only and then merely applied to the test ones.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shape:", X_train_tfidf.shape)

# Candidate classifiers keyed by the display name used in the log output.
# Every estimator that takes a seed gets the same one as the train/test split
# so that reruns produce the same scores and pick the same winner.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM (LinearSVC)": LinearSVC(max_iter=5000, random_state=42),
    "Passive Aggressive": PassiveAggressiveClassifier(
        max_iter=1000, random_state=42
    ),
}

# Running best; best_name is initialised too so the summary print below
# cannot hit an unbound name.
best_acc = -1
best_model = None
best_name = ""

for name, clf in models.items():
    print(f"\n Training {name} ...")
    clf.fit(X_train_tfidf, y_train)
    preds = clf.predict(X_test_tfidf)

    acc = accuracy_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print("Classification Report:\n", classification_report(y_test, preds))

    # Strict ">" keeps the earlier model on ties.
    # NOTE(review): the winner is chosen on the same test split it is
    # scored on, so the reported best accuracy is an optimistic estimate.
    if acc > best_acc:
        best_acc = acc
        best_model = clf
        best_name = name

print(f"\n Best Model: {best_name} with Accuracy = {best_acc:.4f}")

# Persist the winning classifier together with its fitted vectorizer.
# clean_text is not part of the bundle, so whoever loads it has to apply the
# same pre-processing before calling tfidf.transform.
joblib.dump({"model": best_model, "tfidf": tfidf}, "fake_news_model.pkl")
print("Best model saved as fake_news_model.pkl")

# Smoke test: run one headline through the clean -> vectorize -> predict path.
sample_text = "Breaking: Scientists confirm water found on Mars surface."
clean_sample = clean_text(sample_text)
vec = tfidf.transform([clean_sample])
pred = best_model.predict(vec)[0]
# Label 1 means fake (see the label assignment at load time). The arrow was
# stored as mis-decoded bytes ("â†’") and is restored to the intended "→".
print("\nSample Prediction →", "Fake News" if pred == 1 else "Real News")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving archive (5).zip to archive (5).zip
Extracted archive (5).zip
Dataset shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


Train size: 31428 Test size: 13470
TF-IDF shape: (31428, 20000)

🔹 Training Logistic Regression ...
Accuracy: 0.9870
Confusion Matrix:
 [[6346   79]
 [  96 6949]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      6425
           1       0.99      0.99      0.99      7045

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470


🔹 Training Random Forest ...
Accuracy: 0.9965
Confusion Matrix:
 [[6416    9]
 [  38 7007]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6425
           1       1.00      0.99      1.00      7045

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470


🔹 Training SVM (LinearSVC) ...
A

In [None]:
!pip install nltk scikit-learn matplotlib --quiet

import pandas as pd
import numpy as np
import re
import nltk
import joblib
import zipfile
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_score, recall_score, f1_score
)

# Fetch the NLTK stop-word corpus so stopwords.words() below can read it.
nltk.download("stopwords")
# Held as a set because clean_text tests membership once per token.
stop_words = set(stopwords.words("english"))
# Shared stemmer instance reused by clean_text for every token.
stemmer = PorterStemmer()

# Colab-only import: files.upload() prompts for files in the notebook UI.
# The result is iterated by file name further down via uploaded.keys().
from google.colab import files
uploaded = files.upload()

# Expected locations of the two dataset files in the working directory.
fake_csv_path = "Fake.csv"
true_csv_path = "True.csv"

# Only the first uploaded zip archive is unpacked; any others are ignored.
first_archive = next(
    (name for name in uploaded.keys() if name.endswith(".zip")), None
)
if first_archive is not None:
    with zipfile.ZipFile(first_archive, "r") as archive:
        archive.extractall()
    print(f"Extracted {first_archive}")

# Guard: the pipeline needs both CSVs on disk. When either is missing only a
# message is printed; the else branch holding the loading code is skipped.
if not os.path.exists(fake_csv_path) or not os.path.exists(true_csv_path):
    print("Error: CSV files not found after extraction.")
else:
    fake_df = pd.read_csv(fake_csv_path)
    true_df = pd.read_csv(true_csv_path)

    # Label convention used by the rest of the cell: 1 = fake, 0 = real.
    fake_df["label"] = 1
    true_df["label"] = 0

    # Stack both sources into one frame with a fresh 0..n-1 index.
    df = pd.concat([fake_df, true_df], axis=0).reset_index(drop=True)
    print("Dataset shape:", df.shape)
    display(df.head())

    def clean_text(text):
        """Reduce raw article text to a space-joined string of stemmed tokens.

        Steps, in order: coerce to ``str`` and lower-case, remove URL-like
        runs, drop every character that is not ``a``-``z`` or whitespace,
        discard stop words, stem what is left with the shared ``stemmer``.
        """
        normalized = str(text).lower()
        # Apply the URL pattern first, then the letters-only filter.
        for pattern in (r"http\S+|www\S+|https\S+", r"[^a-z\s]"):
            normalized = re.sub(pattern, "", normalized)
        kept_words = (
            word for word in normalized.split() if word not in stop_words
        )
        return " ".join(map(stemmer.stem, kept_words))

    # Pre-process every article body once; the cleaned strings are what the
    # vectorizer and the models see.
    df["clean_text"] = df["text"].apply(clean_text)

    # Plain arrays: X holds the cleaned documents, y the 0/1 labels.
    X = df["clean_text"].values
    y = df["label"].values

    # Hold out 30% for evaluation. The fixed seed makes the split repeatable
    # and stratify=y is passed so both parts are split with respect to labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])

    # Unigram + bigram TF-IDF capped at 20,000 features. The vectorizer is
    # fitted on the training documents only and then applied to the test ones.
    tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    print("TF-IDF shape:", X_train_tfidf.shape)

    # Candidate classifiers keyed by the display name used in the log output
    # and in the metrics table. Every estimator that takes a seed gets the
    # same one as the train/test split so reruns give the same scores and
    # pick the same winner.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=2000),
        "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
        "SVM (LinearSVC)": LinearSVC(max_iter=5000, random_state=42),
        "Passive Aggressive": PassiveAggressiveClassifier(
            max_iter=1000, random_state=42
        ),
    }

    # Column-oriented accumulator; one entry per model is appended to each
    # list so it converts directly into a DataFrame after the loop.
    metrics = {"Model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []}

    best_acc = -1
    best_model = None
    best_name = ""

    for name, clf in models.items():
        print(f"\n Training {name} ...")
        clf.fit(X_train_tfidf, y_train)
        preds = clf.predict(X_test_tfidf)

        # precision/recall/F1 are called with their defaults, which score
        # the label-1 ("fake") class for this binary problem.
        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)

        print(f"Accuracy: {acc:.4f}")
        print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
        print("Classification Report:\n", classification_report(y_test, preds))

        metrics["Model"].append(name)
        metrics["Accuracy"].append(acc)
        metrics["Precision"].append(prec)
        metrics["Recall"].append(rec)
        metrics["F1"].append(f1)

        # Strict ">" keeps the earlier model on ties.
        # NOTE(review): the winner is chosen on the same test split it is
        # scored on, so the reported best accuracy is an optimistic estimate.
        if acc > best_acc:
            best_acc = acc
            best_model = clf
            best_name = name

    print(f"\n Best Model: {best_name} with Accuracy = {best_acc:.4f}")

    metrics_df = pd.DataFrame(metrics)
    print("\nModel Performance:\n", metrics_df)

    # Persist the winning classifier together with its fitted vectorizer.
    # clean_text is not part of the bundle, so whoever loads it has to apply
    # the same pre-processing before calling tfidf.transform.
    joblib.dump({"model": best_model, "tfidf": tfidf}, "fake_news_model.pkl")
    print("Best model saved as fake_news_model.pkl")

    # Smoke test: run one headline through clean -> vectorize -> predict.
    sample_text = "Breaking: Scientists confirm water found on Mars surface."
    clean_sample = clean_text(sample_text)
    vec = tfidf.transform([clean_sample])
    pred = best_model.predict(vec)[0]
    # Label 1 means fake (see the label assignment at load time). The arrow
    # was stored as mis-decoded bytes ("â†’") and is restored to "→".
    print("\nSample Prediction →", "Fake News" if pred == 1 else "Real News")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving archive (5).zip to archive (5) (2).zip
Extracted archive (5) (2).zip
Dataset shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


Train size: 31428 Test size: 13470
TF-IDF shape: (31428, 20000)

🔹 Training Logistic Regression ...
Accuracy: 0.9870
Confusion Matrix:
 [[6346   79]
 [  96 6949]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      6425
           1       0.99      0.99      0.99      7045

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470


🔹 Training Random Forest ...
Accuracy: 0.9965
Confusion Matrix:
 [[6416    9]
 [  38 7007]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6425
           1       1.00      0.99      1.00      7045

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470


🔹 Training SVM (LinearSVC) ...
A