#### The columns present in the dataset are:-

#### 1) Title -> Title of the News

#### 2) Text -> Text or Content of the News

#### 3) Label -> Labelling the news as Fake or Real

In [2]:
# importing the dependencies
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Import the dataset. Ensure that a copy of the dataset is available at this path when running in Google Colab.
df = pd.read_csv('/content/news.csv.zip')


In [10]:
# Fetch the NLTK stopword corpus so that stopwords.words('english') can be used below
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# printing the stopwords in english
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [16]:
df.shape

(6335, 4)

In [17]:
# Give the unnamed leading column a meaningful name ('id');
# rebinding df keeps the cell free of in-place mutation.
df = df.rename(columns={'Unnamed: 0': 'id'})

In [18]:
# printing the first five rows
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [19]:
#print the last five rows
df.tail()

Unnamed: 0,id,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [20]:
# checking the number of missing values in dataset
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
text,0
label,0


In [21]:
# Split the frame into the target column (Y) and everything else (X)
Y = df['label']
X = df.drop(columns=['label'])

In [22]:
print(X)
print(Y)

         id                                              title  \
0      8476                       You Can Smell Hillary’s Fear   
1     10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2      3608        Kerry to go to Paris in gesture of sympathy   
3     10142  Bernie supporters on Twitter erupt in anger ag...   
4       875   The Battle of New York: Why This Primary Matters   
...     ...                                                ...   
6330   4490  State Department says it can't find emails fro...   
6331   8062  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...   
6332   8622  Anti-Trump Protesters Are Tools of the Oligarc...   
6333   4021  In Ethiopia, Obama seeks progress on peace, se...   
6334   4330  Jeb Bush Is Suddenly Attacking Trump. Here's W...   

                                                   text  
0     Daniel Greenfield, a Shillman Journalism Fello...  
1     Google Pinterest Digg Linkedin Reddit Stumbleu...  
2     U.S. Secretary of State Joh

Stemming

In [23]:
# Porter stemmer instance used by stemming() to reduce each token to its stem
port_stemming = PorterStemmer()

In [24]:
def stemming(text):
  """Normalize a text into a space-joined string of stemmed tokens.

  Every character that is not an ASCII letter is replaced by a space, the
  result is lower-cased and split on whitespace, English stopwords are
  dropped, and each remaining token is stemmed with ``port_stemming``.

  Parameters
  ----------
  text : str
      Raw text to clean.

  Returns
  -------
  str
      The surviving stemmed tokens joined by single spaces.
  """
  # Build the stopword lookup once per call. The previous version called
  # stopwords.words('english') inside the comprehension, rebuilding the list
  # and scanning it linearly for every single token.
  english_stopwords = set(stopwords.words('english'))
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  stemmed_tokens = [port_stemming.stem(word) for word in tokens if word not in english_stopwords]
  return ' '.join(stemmed_tokens)

In [25]:
# Stem every article body. The result replaces the original 'text' column,
# so re-running this cell feeds already-stemmed text back into stemming().
stemmed_text = df['text'].apply(stemming)
df['text'] = stemmed_text

In [26]:
print(df['text'])

0       daniel greenfield shillman journal fellow free...
1       googl pinterest digg linkedin reddit stumbleup...
2       u secretari state john f kerri said monday sto...
3       kayde king kaydeek novemb lesson tonight dem l...
4       primari day new york front runner hillari clin...
                              ...                        
6330    state depart told republican nation committe c...
6331    p pb stand plutocrat pentagon post oct wikimed...
6332    anti trump protest tool oligarchi reform alway...
6333    addi ababa ethiopia presid obama conven meet l...
6334    jeb bush suddenli attack trump matter jeb bush...
Name: text, Length: 6335, dtype: object


In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [13]:
# Pull the article text (features) and the labels out as NumPy arrays.
# NOTE(review): the saved output of print(X) below shows raw, unstemmed text and
# the execution counters restart around this cell, so it appears to have been run
# on a freshly re-loaded frame -- confirm with Restart & Run All.
X, Y = df['text'].values, df['label'].values

In [14]:
print(X)

 'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @SpeakerRyan says he voted for @realDonaldTrump : “Republicans, it is time to come home” h

In [16]:
print(Y)

['FAKE' 'FAKE' 'REAL' ... 'FAKE' 'REAL' 'REAL']


In [18]:
Y.shape

(6335,)

In [19]:
# Convert the textual data to a TF-IDF weighted sparse feature matrix.
# fit_transform learns the vocabulary and IDF weights and builds the matrix in
# one pass; it is equivalent to fit(X) followed by transform(X).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF statistics also see the test documents --
# consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [20]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2158282 stored elements and shape (6335, 67659)>
  Coords	Values
  (0, 1621)	0.016229531067969236
  (0, 2306)	0.021002759070217546
  (0, 2399)	0.01241685144719205
  (0, 2486)	0.024110528944251836
  (0, 2636)	0.016094233592590813
  (0, 2640)	0.043376791257086454
  (0, 2749)	0.05135437224587028
  (0, 2978)	0.023340051773888364
  (0, 3033)	0.02038313224006218
  (0, 3214)	0.06004278866572469
  (0, 3234)	0.007165842283234525
  (0, 3256)	0.03381419923018935
  (0, 3265)	0.01624011518858776
  (0, 3274)	0.01540403684145884
  (0, 3278)	0.03637152475123043
  (0, 3329)	0.011766645990727378
  (0, 3375)	0.014600249496516973
  (0, 3749)	0.020301409842698546
  (0, 3761)	0.03255183170443468
  (0, 3773)	0.019337300942003526
  (0, 3812)	0.030843112273660794
  (0, 3834)	0.015249463170958514
  (0, 3900)	0.02188364387038962
  (0, 3905)	0.013747932717198952
  (0, 4066)	0.018136295662544426
  :	:
  (6334, 64842)	0.0211113566867202
  (6334, 64851)	0

Splitting the dataset into training and test data

In [21]:
# Hold out 20% of the samples for testing. stratify=Y keeps the label
# proportions equal in both splits; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the model : Logistic Regression model

In [22]:
# Logistic regression classifier created with its default settings
model = LogisticRegression()

In [23]:
# Fit the classifier on the training split
model.fit(X_train, Y_train)

Evaluation

Accuracy score

In [24]:
# Accuracy score on the training data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [25]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9530386740331491


In [26]:
# Accuracy score on the test data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [27]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9123914759273876


Building a Predictive system

In [46]:
Y_train.shape

(5068,)

In [47]:
Y_test.shape

(1267,)

In [52]:
# Classify a single sample taken from the training split
X_new = X_train[2126]  # one row sliced out of the sparse feature matrix

# Hand the model an explicit (1, n_features) input
prediction = model.predict(X_new.reshape(1, -1))

print(prediction)

# Translate the predicted label into a readable message
print('The news is Real' if prediction[0] == 'REAL' else 'The news is Fake')


['FAKE']
The news is Fake


In [53]:
# Verify whether the predicted answer is correct by printing the true label of the same training sample.
print(Y_train[2126])

FAKE


In [62]:
# Classify a single sample taken from the test split
X_new = X_test[21]  # one row sliced out of the sparse feature matrix

# Hand the model an explicit (1, n_features) input
prediction = model.predict(X_new.reshape(1, -1))

print(prediction)

# Translate the predicted label into a readable message
print('The news is Real' if prediction[0] == 'REAL' else 'The news is Fake')

['FAKE']
The news is Fake


In [61]:
# Verify whether the predicted answer is correct by printing the true label of the same test sample.
print(Y_test[21])

FAKE
