In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string

# Loading the datasets
data_fake = pd.read_csv("fake.csv")
data_true = pd.read_csv("True.csv")

In [2]:
# Adding class labels
data_fake["class"] = 0
data_true["class"] = 1

# Removing last 10 entries for manual testing
data_fake_manual_testing = data_fake.tail(10)
data_true_manual_testing = data_true.tail(10)
data_fake = data_fake.iloc[:-10]
data_true = data_true.iloc[:-10]

In [3]:

# Merging the datasets
data_merge = pd.concat([data_fake, data_true], axis=0)

In [4]:
# Dropping unnecessary columns
data = data_merge.drop(["title", "subject", "date"], axis=1)

In [5]:
# Shuffling the data
data = data.sample(frac=1).reset_index(drop=True)

In [6]:
# Text preprocessing function
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub("https?://\S+|www\S+", "", text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub("\n", "", text)
    text = re.sub("\w*\d\w*", "", text)
    return text

In [7]:

# Applying text preprocessing
data['text'] = data['text'].apply(wordopt)

# Splitting the data
x = data['text']
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


In [8]:

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [9]:

# List of models to evaluate
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier


ModuleNotFoundError: No module named 'xgboost'

In [None]:
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=0)),
    ('Random Forest', RandomForestClassifier(random_state=0)),
    ('Support Vector Machine', SVC(kernel='linear')),
    ('Naive Bayes', MultinomialNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('AdaBoost', AdaBoostClassifier(random_state=0)),
    ('Extra Trees', ExtraTreesClassifier(random_state=0)),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('XGBoost', XGBClassifier(random_state=0)),
    ('Passive Aggressive', PassiveAggressiveClassifier(random_state=0)),
    ('Bagging', BaggingClassifier(random_state=0)),
    ('Perceptron', Perceptron(random_state=0))

In [None]:
# Evaluating each model
results = {}
for name, model in models:
    model.fit(xv_train, y_train)
    pred = model.predict(xv_test)
    accuracy = accuracy_score(y_test, pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy}")
    print(classification_report(y_test, pred))

In [None]:

# Selecting the top 5 models
top_models = sorted(results.items(), key=lambda item: item[1], reverse=True)[:5]
top_model_names = [model[0] for model in top_models]

In [None]:

# Creating a Voting Classifier with the top 5 models
voting_estimators = [(name, dict(models)[name]) for name in top_model_names]
voting_clf = VotingClassifier(estimators=voting_estimators, voting='hard')
voting_clf.fit(xv_train, y_train)
pred_voting = voting_clf.predict(xv_test)
print("Voting Classifier Accuracy:", voting_clf.score(xv_test, y_test))
print(classification_report(y_test, pred_voting))


In [None]:
# Manual testing function
def output_label(n):
    return 'Fake News' if n == 0 else 'True News'

In [None]:

def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(wordopt)
    new_x_test = new_def_test['text']
    new_xv_test = vectorization.transform(new_x_test)
    
    pred_voting = voting_clf.predict(new_xv_test)
    
    print(f"Voting Classifier prediction: {output_label(pred_voting[0])}")

In [None]:
# Test the manual testing function
manual_testing("Your sample news text goes here")