### Fake News Detection using SVM

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [6]:
data = pd.read_csv("fake_or_real_news.csv")

In [9]:
data.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [13]:
# Applying custom lambda expression to make a fake column which classifies news articles:

data["fake"] = data["label"].apply(lambda x: 0 if x == "REAL" else 1)

In [14]:
data.head()

Unnamed: 0,id,title,text,label,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0


In [17]:
# splitting the data from the dataframe into x and y where x contains the text and y contains the fake value.

x, y = data["text"], data["fake"]

In [19]:
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [20]:
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

In [22]:
# performing train_test_split, allocating 20% of the data as test data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [25]:
x_train

364     0 comments \nFamilies united in prayer on Than...
5846    . \nYour Most Honorableness, October 30th, 201...
3859    Federal laws already allow people across the c...
4812    ISIS, the world understands, is a violent jiha...
2724    Posted on October 26, 2016 by Michael DePinto ...
                              ...                        
110     Whether an October surprise may have come a mo...
2831    (CNN) After he shot two journalists on live TV...
3070    It came two days after the announcement of the...
2475    Why NATO is put on war footing against Russia ...
4516    Six weeks before he was set to open Florida’s ...
Name: text, Length: 5068, dtype: object

In [26]:
len(x_train)

5068

In [27]:
len(x_test)

1267

In [28]:
# now we are going to vectorize the training data:

vectorizer = TfidfVectorizer(stop_words = "english", max_df=0.7)

In [30]:
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [32]:
# creating a classifier:

clf = LinearSVC()
clf.fit(x_train_vectorized, y_train)

LinearSVC()

In [34]:
clf.score(x_test_vectorized, y_test)

0.9400157853196527

In [35]:
len(y_test)

1267

In [38]:
# checking how many articles were classified correctly:

len(y_test) * 0.94

# pretty good i would say :)

1190.98