In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string

In [2]:
data_fake = pd.read_csv("fake.csv")
data_true = pd.read_csv("True.csv")

In [3]:
# Adding class labels
data_fake["class"] = 0
data_true["class"] = 1

# Removing last 10 entries for manual testing
data_fake_manual_testing = data_fake.tail(10)
data_true_manual_testing = data_true.tail(10)
data_fake = data_fake.iloc[:-10]
data_true = data_true.iloc[:-10]


In [4]:
# Merging the datasets
data_merge = pd.concat([data_fake, data_true], axis=0)

# Dropping unnecessary columns
data = data_merge.drop(["title", "subject", "date"], axis=1)

# Shuffling the data
data = data.sample(frac=1).reset_index(drop=True)


In [5]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub("https?://\S+|www\S+", "", text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub("\n", "", text)
    text = re.sub("\w*\d\w*", "", text)
    return text

In [6]:
# Applying text preprocessing
data['text'] = data['text'].apply(wordopt)

# Splitting the data
x = data['text']
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [7]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)
print("Logistic Regression Accuracy:", LR.score(xv_test, y_test))
print(classification_report(y_test, pred_lr))

Logistic Regression Accuracy: 0.9861853832442068
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5893
           1       0.98      0.99      0.99      5327

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [9]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)
pred_dt = DT.predict(xv_test)
print("Decision Tree Accuracy:", DT.score(xv_test, y_test))
print(classification_report(y_test, pred_dt))

Decision Tree Accuracy: 0.996078431372549
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5893
           1       1.00      1.00      1.00      5327

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [10]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier(random_state=0)
GB.fit(xv_train, y_train)
pred_gb = GB.predict(xv_test)
print("Gradient Boosting Accuracy:", GB.score(xv_test, y_test))
print(classification_report(y_test, pred_gb))

Gradient Boosting Accuracy: 0.9955436720142602
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5893
           1       0.99      1.00      1.00      5327

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [11]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state=0)
RF.fit(xv_train, y_train)
pred_rf = RF.predict(xv_test)
print("Random Forest Accuracy:", RF.score(xv_test, y_test))
print(classification_report(y_test, pred_rf))

Random Forest Accuracy: 0.991711229946524
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5893
           1       0.99      0.99      0.99      5327

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [None]:
# Support Vector Machine
from sklearn.svm import SVC
SVM = SVC(kernel='linear')
SVM.fit(xv_train, y_train)
pred_svm = SVM.predict(xv_test)
print("Support Vector Machine Accuracy:", SVM.score(xv_test, y_test))
print(classification_report(y_test, pred_svm))

In [None]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()
NB.fit(xv_train, y_train)
pred_nb = NB.predict(xv_test)
print("Naive Bayes Accuracy:", NB.score(xv_test, y_test))
print(classification_report(y_test, pred_nb))


In [None]:
# Neural Network using Keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenization and padding
max_words = 5000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

x_train_nn = pad_sequences(train_sequences, maxlen=max_len)
x_test_nn = pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# Building the model
model = Sequential()
model.add(Dense(512, input_shape=(max_len,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Training the model
history = model.fit(x_train_nn, y_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.1,
                    verbose=2)

# Evaluating the model
loss, accuracy = model.evaluate(x_test_nn, y_test)
print("Neural Network Accuracy:", accuracy)
pred_nn = (model.predict(x_test_nn) > 0.5).astype("int32")
print(classification_report(y_test, pred_nn))

In [None]:
# Manual testing function
def output_label(n):
    return 'Fake News' if n == 0 else 'True News'

In [None]:
def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(wordopt)
    new_x_test = new_def_test['text']
    new_xv_test = vectorization.transform(new_x_test)
    
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GB.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)
    pred_SVM = SVM.predict(new_xv_test)
    pred_NB = NB.predict(new_xv_test)
    
    new_seq_test = tokenizer.texts_to_sequences(new_x_test)
    new_x_test_nn = pad_sequences(new_seq_test, maxlen=max_len)
    pred_NN = (model.predict(new_x_test_nn) > 0.5).astype("int32")
    
    print("\n\nLR prediction: {} \nDT Prediction: {} \nGBC prediction: {} \nRFC prediction: {} \nSVM prediction: {} \nNB prediction: {} \nNN prediction: {}".format(
        output_label(pred_LR[0]),
        output_label(pred_DT[0]),
        output_label(pred_GB[0]),
        output_label(pred_RF[0]),
        output_label(pred_SVM[0]),
        output_label(pred_NB[0]),
        output_label(pred_NN[0])
    ))

In [None]:
# Manual testing
news = input("Enter news text: ")
manual_testing(news)