# **Aided Differentiation of Real and Fake News**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amamo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Load Datasets**

In [3]:
true_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv')


**Create Target Columns For True and Fake Data**

In [4]:
true_data['Target'] = 0
fake_data['Target'] = 1


**Concatenate True and Fake Data Into Common Dataframe**

In [5]:
data = pd.concat([true_data,fake_data]).sample(frac=1, random_state = 1).reset_index(drop=True)
data


Unnamed: 0,title,text,subject,date,Target
0,EPA chief says Paris climate agreement 'bad de...,WASHINGTON (Reuters) - The United States shoul...,politicsNews,"April 2, 2017",0
1,BREAKING NEWS: President Trump Announces Major...,President Trump just tweeted out a new policy ...,politics,"Jul 26, 2017",1
2,Trump says New Hampshire win not necessary to ...,WASHINGTON (Reuters) - U.S. Republican preside...,politicsNews,"February 7, 2016",0
3,Kremlin: U.S. sanctions aimed at turning busin...,MOSCOW (Reuters) - The Kremlin said on Thursda...,worldnews,"November 30, 2017",0
4,MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...,Kellyanne Conway s response to Williams criti...,left-news,"Dec 27, 2016",1
...,...,...,...,...,...
44893,THIS YEAR: Let’s Make Christmas Great Again…,"This year, let s try something a little differ...",US_News,"December 25, 2016",1
44894,DEMOCRATS SELL Promo T-Shirt: “Democrats give ...,"Yes, the Democrats think it s a good thing to ...",politics,"Apr 20, 2017",1
44895,White House aides told to preserve materials i...,WASHINGTON (Reuters) - The White House counsel...,politicsNews,"March 2, 2017",0
44896,'Congratulations': EU moves to Brexit phase tw...,BRUSSELS (Reuters) - The European Union agreed...,worldnews,"December 15, 2017",0


**Preprocess Dataset**

In [6]:
data.shape


(44898, 5)

In [7]:
data.head()


Unnamed: 0,title,text,subject,date,Target
0,EPA chief says Paris climate agreement 'bad de...,WASHINGTON (Reuters) - The United States shoul...,politicsNews,"April 2, 2017",0
1,BREAKING NEWS: President Trump Announces Major...,President Trump just tweeted out a new policy ...,politics,"Jul 26, 2017",1
2,Trump says New Hampshire win not necessary to ...,WASHINGTON (Reuters) - U.S. Republican preside...,politicsNews,"February 7, 2016",0
3,Kremlin: U.S. sanctions aimed at turning busin...,MOSCOW (Reuters) - The Kremlin said on Thursda...,worldnews,"November 30, 2017",0
4,MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...,Kellyanne Conway s response to Williams criti...,left-news,"Dec 27, 2016",1


In [9]:
data['content'] = data['title'] + ' ' + data['text']
print(data['content'])


0        EPA chief says Paris climate agreement 'bad de...
1        BREAKING NEWS: President Trump Announces Major...
2        Trump says New Hampshire win not necessary to ...
3        Kremlin: U.S. sanctions aimed at turning busin...
4        MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...
                               ...                        
44893    THIS YEAR: Let’s Make Christmas Great Again… T...
44894    DEMOCRATS SELL Promo T-Shirt: “Democrats give ...
44895    White House aides told to preserve materials i...
44896    'Congratulations': EU moves to Brexit phase tw...
44897    STANDING OVATION! NIGEL FARAGE TROLLS CNN Duri...
Name: content, Length: 44898, dtype: object


In [10]:
X = data.drop(columns='Target', axis=1)
Y = data['Target']

print(X)
print(Y)


                                                   title  \
0      EPA chief says Paris climate agreement 'bad de...   
1      BREAKING NEWS: President Trump Announces Major...   
2      Trump says New Hampshire win not necessary to ...   
3      Kremlin: U.S. sanctions aimed at turning busin...   
4      MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...   
...                                                  ...   
44893       THIS YEAR: Let’s Make Christmas Great Again…   
44894  DEMOCRATS SELL Promo T-Shirt: “Democrats give ...   
44895  White House aides told to preserve materials i...   
44896  'Congratulations': EU moves to Brexit phase tw...   
44897  STANDING OVATION! NIGEL FARAGE TROLLS CNN Duri...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The United States shoul...  politicsNews   
1      President Trump just tweeted out a new policy ...      politics   
2      WASHINGTON (Reuters) - U.S. Republican preside... 

**Perform Lemmatization to Convert Word To Base Form**

In [11]:
def clean(doc): 
        stop = stopwords.words('english') 
        punct = string.punctuation
        wnl = WordNetLemmatizer()
        stopwords_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punctuations_free = "".join(ch for ch in stopwords_free if ch not in punct)
        normalized = " ".join(wnl.lemmatize(word) for word in punctuations_free.split())
        return normalized


In [12]:
print(data['content'])


0        EPA chief says Paris climate agreement 'bad de...
1        BREAKING NEWS: President Trump Announces Major...
2        Trump says New Hampshire win not necessary to ...
3        Kremlin: U.S. sanctions aimed at turning busin...
4        MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...
                               ...                        
44893    THIS YEAR: Let’s Make Christmas Great Again… T...
44894    DEMOCRATS SELL Promo T-Shirt: “Democrats give ...
44895    White House aides told to preserve materials i...
44896    'Congratulations': EU moves to Brexit phase tw...
44897    STANDING OVATION! NIGEL FARAGE TROLLS CNN Duri...
Name: content, Length: 44898, dtype: object


In [13]:
X = data['content'].values
Y = data['Target'].values


In [14]:
print(X)


["EPA chief says Paris climate agreement 'bad deal' for U.S. WASHINGTON (Reuters) - The United States should continue to be “engaged” in international climate change discussions but the Paris climate change agreement is a “bad deal” for the country, the head of the Environmental Protection Agency said Sunday. EPA Administrator Scott Pruitt did not confirm whether the United States would remain in the global climate change pact, under which nearly all countries agreed in 2015 to halt or curb their greenhouse gas emissions, even as the world’s biggest emitter China reaffirmed its commitment to the agreement.     Chinese President Xi Jinping is due to have his first meeting with President Donald Trump on April 6-7. Xi and other Chinese officials have pledged to remain in the agreement. “To demonstrate the leadership that we have shown on this issue with China and India and other nations is very important and discussions should ensue,” Pruitt said on Fox News Sunday, “but what Paris repres

In [15]:
print(Y)


[0 1 0 ... 0 0 1]


**Convert Textual Data Into Numerical Data**

In [16]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

print(X)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9561245 stored elements and shape (44898, 122513)>
  Coords	Values
  (0, 120222)	0.10268133916290739
  (0, 119630)	0.04777642349643156
  (0, 119557)	0.026208329593618084
  (0, 119143)	0.028590585353315455
  (0, 119114)	0.04720041114904256
  (0, 118399)	0.046035398657429255
  (0, 118321)	0.016839390713892732
  (0, 118316)	0.05612037098298098
  (0, 118245)	0.018123426718480738
  (0, 117784)	0.07231850149486758
  (0, 117638)	0.032030481088513404
  (0, 117335)	0.0218718865218626
  (0, 115624)	0.02454499692372913
  (0, 113697)	0.08823681124312757
  (0, 113146)	0.048038442872451674
  (0, 111256)	0.03205146721131842
  (0, 109758)	0.01984721602728008
  (0, 109626)	0.14873460939861854
  (0, 109254)	0.030013851992945723
  (0, 108730)	0.04078714282995528
  (0, 108385)	0.03342433021774011
  (0, 108256)	0.2432290513826749
  (0, 108220)	0.06563578774375863
  (0, 106676)	0.024395112331356918
  (0, 105352)	0.096352669357546
  :	:
  (44897, 

**Split Dataset Into Training And Testing**

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify= Y, random_state=5)


**Build Model**

In [18]:
model = LogisticRegression()
model.fit(X_train, Y_train)


**Evaluate Model**

In [19]:
X_train_prediction= model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print('Accuracy score of the training data : ', training_data_accuracy)


Accuracy score of the training data :  0.9922601481151512


In [20]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Accuracy score of the test data : ', test_data_accuracy)


Accuracy score of the test data :  0.9891982182628062


In [21]:
print(confusion_matrix(X_test_prediction,Y_test))


[[4240   53]
 [  44 4643]]
