In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load datasets
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

In [3]:
# Add class labels
df_fake["class"] = 0
df_true["class"] = 1

In [10]:
df_fake_manual_testing = df_fake.sample(n=5000, random_state=42)

# Remove the selected rows from df_fake
df_fake = df_fake.drop(df_fake_manual_testing.index)

# Randomly select 10,000 rows from df_true for manual testing
df_true_manual_testing = df_true.sample(n=5000, random_state=42)

# Remove the selected rows from df_true
df_true = df_true.drop(df_true_manual_testing.index)

df_fake_manual_testing["class"] = 0
df_true_manual_testing["class"] = 1

df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)
df_manual_testing.to_csv("manual_testing.csv", index=False)


In [11]:
# Merging datasets
df_merge = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)

# Data preprocessing
df_merge = df_merge.drop(["title", "subject", "date"], axis=1)
df_merge = df_merge.sample(frac=1).reset_index(drop=True)

# Text preprocessing function
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [12]:
# Apply text preprocessing
df_merge["text"] = df_merge["text"].apply(wordopt)

# Splitting the data
x = df_merge["text"]
y = df_merge["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [13]:
# Vectorization
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [14]:
# Initialize classifiers
lr_model = LogisticRegression()
dt_model = DecisionTreeClassifier()
gb_model = GradientBoostingClassifier()
rf_model = RandomForestClassifier()

# Train the models
lr_model.fit(xv_train, y_train)
dt_model.fit(xv_train, y_train)
gb_model.fit(xv_train, y_train)
rf_model.fit(xv_train, y_train)

# Prediction
lr_pred = lr_model.predict(xv_test)
dt_pred = dt_model.predict(xv_test)
gb_pred = gb_model.predict(xv_test)
rf_pred = rf_model.predict(xv_test)

# Model Evaluation
lr_accuracy = accuracy_score(y_test, lr_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)
gb_accuracy = accuracy_score(y_test, gb_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

In [15]:
print("Logistic Regression Accuracy:", lr_accuracy)
print("Decision Tree Accuracy:", dt_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

print("\nLogistic Regression Classification Report:\n", classification_report(y_test, lr_pred))
print("Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))
print("Gradient Boosting Classification Report:\n", classification_report(y_test, gb_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))



Logistic Regression Accuracy: 0.972
Decision Tree Accuracy: 0.9948
Gradient Boosting Accuracy: 0.9952
Random Forest Accuracy: 0.9808

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      1271
           1       0.97      0.98      0.97      1229

    accuracy                           0.97      2500
   macro avg       0.97      0.97      0.97      2500
weighted avg       0.97      0.97      0.97      2500

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1271
           1       0.99      1.00      0.99      1229

    accuracy                           0.99      2500
   macro avg       0.99      0.99      0.99      2500
weighted avg       0.99      0.99      0.99      2500

Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99