**Part 1: Data Engineering**

In [35]:
# mounting Google drive so that we can access our dataset

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# setting filepath that leads to our data
filename = '/content/drive/My Drive/news.csv'

In [37]:
# reading the .csv dataset using pandas

import pandas as pd
df = pd.read_csv(filename)

In [38]:
# analyzing our dataset 
df.shape, type(df), df.columns

((6335, 4),
 pandas.core.frame.DataFrame,
 Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object'))

In [39]:
# removing our serial number column from the dataset because it is unnecessary
frame = df.drop(columns = ['Unnamed: 0'])

In [40]:
# verifying our operation
frame.shape, frame.columns

((6335, 3), Index(['title', 'text', 'label'], dtype='object'))

In [41]:
# using numpy for array operations
import numpy as np

In [42]:
# merging two columns and using this as our x axis
x = np.array(df['title'])+np.array(df['text'])

In [43]:
x.shape

(6335,)

In [44]:
# setting the y axis equal to the labels (either 'FAKE' or 'REAL')
y = np.array(df['label'])

In [45]:
y.shape

(6335,)

In [48]:
# splitting our dataset into train and test data
from sklearn.model_selection import train_test_split

# 80% of data for training, 20% for testing
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2

In [49]:
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((5068,), (1267,), (5068,), (1267,))

In [50]:
# importing our necessary class that will allow us to convert our text to a suitable format for the model
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# instantiating object of class
vc = TfidfVectorizer()

In [52]:
# converting text to vectorized format

# only train vectors need to be fitted, test vectors only need to be transformed
train_x_vectors = vc.fit_transform(train_x)

test_x_vectors = vc.transform(test_x)

In [53]:
train_x_vectors.shape, test_x_vectors.shape

((5068, 65067), (1267, 65067))

**Part 2: Data Science**

In [55]:
# Using a stochastic gradient descent classifier as it trains very quickly
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier()

In [58]:
# Training our model
clf_sgd.fit(train_x_vectors, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [57]:
# Implementing another classifier: support vector machine
from sklearn import svm
clf_svm = svm.SVC(kernel = 'linear')

In [59]:
# Training our second model
clf_svm.fit(train_x_vectors, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

**Part 3: Evaluation**

In [60]:
# Let's check our accuracy for each algorithm

# Stochastic Gradient Descent Classifier
print("Accuracy of Stochastic Gradient Descent Classifier is",clf_sgd.score(test_x_vectors, test_y)*100)

# Support Vector Classifier
print("Accuracy of Support Vector Classifier is",clf_svm.score(test_x_vectors, test_y)*100)

Accuracy of Stochastic Gradient Descent Classifier is 94.31728492501973
Accuracy of Support Vector Classifier is 94.1594317284925


In [61]:
# Quite impressive! Both models are around the 94% mark, and this is without optimization and fine tuning
# Let's check the accuracy for each category using F1

from sklearn.metrics import f1_score

In [64]:
# F1 score for SGDc:
f1_score(test_y, clf_sgd.predict(test_x_vectors), average=None,  labels=['REAL', 'FAKE'])

array([0.94230769, 0.94401244])

In [65]:
# F1 score for SVC:
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None,  labels=['REAL', 'FAKE'])

array([0.94051447, 0.94263566])

In [1]:
# As you can see, the models have very high accuracies for both labels, which means they are pretty good

**Part 4: Summary**

We see that it's very easy to build a model that can detect fake news with good accuracy. Since the accuracy is already in the mid-90s, I won't bother with any optimization and fine-tuning. If you would like a more accurate model, though, I recommend using the GridSearchCV method.