# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Dataset

In [3]:
# Read the dataset from the notebook's working directory and preview the
# first rows to check that the columns were parsed as expected.
DATA_PATH = 'FakeNewsNet.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


# Text Preprocessing

In [8]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

port_stem = PorterStemmer()

# Build the stopword lookup once, as a set. The previous version evaluated
# `stopwords.words('english')` inside the per-word filter, i.e. once for every
# token of every title, and then scanned the resulting list linearly.
english_stopwords = frozenset(stopwords.words('english'))

# Every character that is not an ASCII letter is treated as a separator.
non_letter_re = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise a text string into a space-separated string of word stems.

    The text is processed in this order:
      1. every non ASCII-letter character is replaced by a space,
      2. the result is lower-cased and split on whitespace,
      3. English stopwords are dropped,
      4. each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text (here: a news headline).

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string if no
        word survives the filtering.
    """
    words = non_letter_re.sub(' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in english_stopwords]
    return ' '.join(stems)


# NOTE: the raw titles are overwritten in place. Re-running this cell would
# stem the already-stemmed text again, so reload `df` before re-executing it.
df['title'] = df['title'].apply(stemming)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Splitting the Data into Features and Target

In [21]:
# Select the columns by name instead of by position, so the selection does not
# silently pick up a different column if the CSV gains, loses or reorders
# columns. In the frame previewed by df.head(), position 0 is 'title' and
# position 4 is 'real'.
X = df['title'].values  # headline text, used as the model input
y = df['real'].values   # 0/1 label; presumably 1 = real news, 0 = fake — confirm against the dataset docs

In [23]:
# Show the feature array to confirm that it holds the title strings.
print(X)

['kandi burruss explod rape accus real housew atlanta reunion video'
 'peopl choic award best red carpet look'
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva'
 ... 'jessica chastain recal moment mother boyfriend slap kick genit'
 'tristan thompson feel dump khlo kardashian refus let move la home exclus'
 'kelli clarkson perform medley kendrick lamar humbl hit billboard music award']


# Converting the Textual Data to Numerical Data

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# lowercase=False switches off the vectorizer's own lower-casing step.
# NOTE(review): presumably because the titles were already lower-cased during
# preprocessing — confirm.
vectorizer = TfidfVectorizer(lowercase=False)
# X is rebound here from the array of text strings to the TF-IDF matrix, so
# re-running this cell on its own would feed the matrix back into the
# vectorizer; re-create X from df first.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so the learned vocabulary and IDF weights
# include the test rows. Consider splitting first and fitting on the training
# portion only.
X = vectorizer.fit_transform(X)

In [27]:
# Show the vectorized features produced by the TF-IDF step.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 182971 stored elements and shape (23196, 12546)>
  Coords	Values
  (0, 5895)	0.40600285234371225
  (0, 1524)	0.40600285234371225
  (0, 3739)	0.3720213547026605
  (0, 8975)	0.31659115198843385
  (0, 53)	0.2721149519149911
  (0, 9020)	0.2424928018856143
  (0, 5246)	0.2737247638213213
  (0, 576)	0.3220576151506555
  (0, 9287)	0.2664785410247072
  (0, 11959)	0.22178153496429145
  (1, 8315)	0.4067948402953306
  (1, 1975)	0.43378159796649224
  (1, 643)	0.31690546637483147
  (1, 986)	0.34762560771735407
  (1, 9079)	0.387076508000102
  (1, 1710)	0.3978429114527965
  (1, 6573)	0.3420111696667716
  (2, 10407)	0.28363832407671646
  (2, 1529)	0.2679606830015344
  (2, 9880)	0.2644920245799396
  (2, 10939)	0.23249317565272812
  (2, 1069)	0.19438975915734968
  (2, 7117)	0.23353803309354326
  (2, 7939)	0.202150964798257
  (2, 11479)	0.30365636268224055
  :	:
  (23193, 10239)	0.3970768905861499
  (23193, 4411)	0.4463434677864564
  (23194, 33

In [28]:
# Show the target array.
print(y)

[1 1 1 ... 1 0 1]


# Splitting the Data into Training and Test Sets

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Building and Training Model

In [31]:
from sklearn.linear_model import LogisticRegression
# Logistic regression constructed with no arguments, i.e. the library's
# default hyperparameters.
classifier = LogisticRegression()
# Fit on the training portion only; the test portion is kept for evaluation.
classifier.fit(X_train, y_train)

# Accuracy score on the training data

In [36]:
from sklearn.metrics import accuracy_score

# Predict on the same rows the classifier was fitted on. Despite its name,
# X_pred holds predicted labels for the training set, not features.
X_pred = classifier.predict(X_train)
accuracy_score(y_true=y_train, y_pred=X_pred)

0.8599913774520371

# Accuracy score on the test data

In [37]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows and report the fraction classified correctly.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8359913793103448

In [39]:
# Side-by-side view of the test labels: column 0 is the true label,
# column 1 is the classifier's prediction.
print(np.column_stack((y_test, y_pred)))

[[0 1]
 [1 1]
 [1 1]
 ...
 [1 1]
 [1 1]
 [1 1]]
