# Fake and real news classification
This notebook builds an NLP pipeline that classifies English-language news articles as fake or real.


# Connecting to Google Drive

In [None]:
# Colab-only step: attach Google Drive to this runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# NLTK settings

In [None]:
import nltk
# Fetch the NLTK resources used by the cleaning step further down:
# the stop-word lists and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [100]:
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

## Dataset loading

In [None]:
# Locations of the two source CSV files on the mounted Drive.
path_fake = "/content/drive/MyDrive/.../Fake.csv"
path_true = "/content/drive/MyDrive/.../True.csv"

# Read both files in one pass; each becomes its own DataFrame.
dataset_fake, dataset_true = (
    pd.read_csv(csv_path) for csv_path in (path_fake, path_true)
)

In [None]:
# Schema check for the real-news frame: column names, non-null counts, dtypes.
dataset_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [None]:
# Per-column summary (count / unique / top / freq) for the real-news frame.
dataset_true.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [None]:
# Distinct subject labels that occur in the real-news data.
dataset_true['subject'].unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [None]:
# Schema check for the fake-news frame: column names, non-null counts, dtypes.
dataset_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [None]:
# Per-column summary (count / unique / top / freq) for the fake-news frame.
# NOTE(review): the saved output shows a blank `top` value for `text` with
# freq 626, which suggests some fake articles have an empty body - confirm
# and decide whether those rows should be filtered out.
dataset_fake.describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [None]:
# Distinct subject labels that occur in the fake-news data.
dataset_fake['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

## Dataset preparation

In [None]:
# Keep only the article body and attach the binary target
# (Fake = 1 for fabricated articles, Fake = 0 for real ones).
# Selecting the wanted column explicitly, rather than dropping title / subject /
# date one by one, means re-running this cell no longer raises a KeyError for
# columns that were already removed on a previous run.
dataset_fake = dataset_fake[['text']].assign(Fake=1)
dataset_true = dataset_true[['text']].assign(Fake=0)

## Dataset creation

In [None]:
# Stack fake and real articles into a single frame, then shuffle the rows.
# A fixed random_state pins the row order, so the later train / validation /
# test split (and every reported metric) is reproducible after a kernel restart.
dataset = pd.concat([dataset_fake, dataset_true], ignore_index=True)
dataset = shuffle(dataset, random_state=42)
dataset

Unnamed: 0,text,Fake
2816,Donald Trump has placed gag orders on multiple...,1
21558,Nothing says tolerance like putting a loaded g...,1
30481,BEIJING (Reuters) - China said on Wednesday it...,0
5325,A Detroit man has been sentenced to 25-50 year...,1
7258,California Democrats are taking a page from Or...,1
...,...,...
35324,BEIJING/TAIPEI (Reuters) - China has condemned...,0
19549,Google Maps was alerted to a mysterious chan...,1
9009,We all rely on the media every day to know wha...,1
38894,BEIJING (Reuters) - China s foreign ministry o...,0


## Data cleaning

In [None]:
wn = WordNetLemmatizer()
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so evaluating it inside the per-token filter rebuilt that list and scanned it
# linearly for every single word of every article.
_STOP_WORDS = frozenset(stopwords.words('english'))


def text_preprocessing(text):
    """Normalise one raw article body into a space-separated string of lemmas.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the result, split on whitespace, drop English stop words,
    lemmatize each remaining token with WordNet (using the lemmatizer's
    default part of speech), and join the tokens back with single spaces.

    Args:
        text: Raw article text as a string.

    Returns:
        The cleaned text as one string; empty when no token survives.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    lemmas = [wn.lemmatize(word) for word in tokens if word not in _STOP_WORDS]
    return ' '.join(lemmas)

In [None]:
# Clean every article body. This overwrites the raw `text` column in place,
# so the original strings are only recoverable by rebuilding `dataset`.
dataset['text'] = dataset['text'].apply(text_preprocessing)

In [81]:
# Inspect the frame after cleaning (pandas truncates the display).
dataset

Unnamed: 0,text,Fake
2816,donald trump placed gag order multiple federal...,1
21558,nothing say tolerance like putting loaded gun ...,1
30481,beijing reuters china said wednesday wanted de...,0
5325,detroit man sentenced year prison year old son...,1
7258,california democrat taking page oregon book ap...,1
...,...,...
35324,beijing taipei reuters china condemned taiwan ...,0
19549,google map alerted mysterious change name trum...,1
9009,rely medium every day know going world many me...,1
38894,beijing reuters china foreign ministry wednesd...,0


# Train - validation - test split

In [83]:
# Features are the article texts; the target is the 0/1 `Fake` flag.
X, y = dataset['text'], dataset['Fake']

In [86]:
# Two-stage split. First hold out 20% of the data, then carve 20% of that
# hold-out off as the test set. Net proportions: 80% train, 16% validation,
# 4% test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=123
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.2, random_state=123
)

# Report the size of each partition.
for label, part in (
    ('Training Data : ', X_train),
    ('Validation Data : ', X_validation),
    ('Test Data : ', X_test),
):
    print(label, part.shape)

Training Data :  (35918,)
Validation Data :  (7184,)
Test Data :  (1796,)


# Feature Extraction - TFIDF - SGD

In [98]:
# Token counts -> TF-IDF weighting -> linear classifier trained with SGD on
# hinge loss with an L2 penalty.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
]

# fit() returns the fitted pipeline itself, so build and train in one statement.
text_clf_svm = Pipeline(pipeline_steps).fit(X_train, y_train)

# Validation set performance

In [101]:
# Score the fitted pipeline on the validation partition.
y_pred = text_clf_svm.predict(X_validation)

# Precision and recall are computed for the positive class (Fake == 1).
accuracy, precision, recall = (
    metric(y_validation, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.9721603563474388
Precision: 0.9780725500812127
Recall: 0.9681136120042872


# Test set performance

In [103]:
# Final check on the held-out test partition, which was not used for fitting.
y_pred = text_clf_svm.predict(X_test)

# Same three metrics as for validation, computed against the test labels.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.9693763919821826
Precision: 0.9739130434782609
Recall: 0.9665587918015103
