### Problem Statement 

In [1]:
# Text Processing in the Problem 

In [2]:
# import dependencies/packages

import numpy as np
import pandas as pd
import re  # regular expressions

from nltk.corpus import stopwords 
#natrural language tool kit for removing stopwords

from nltk.stem.porter import PorterStemmer  # stemming removes the suffixes & prefixes

from sklearn.feature_extraction.text import TfidfVectorizer # we are converting text into numeric feature vectors 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
 # download all the stop words present
import nltk  
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navjo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english')) # language is english 

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# we will remove all these words from our dataset

In [6]:
# Data Preprocessing 

In [7]:
# loading the dataset to a pandas dataframe
news_dataset = pd.read_csv('FakeNewsNet.csv')

In [8]:
news_dataset.shape # so we have 23196 news articles in the corpus

(23196, 5)

In [9]:
news_dataset.columns # data column names

Index(['title', 'news_url', 'source_domain', 'tweet_num', 'real'], dtype='object')

In [10]:
news_dataset.rename(columns={'real':'label'}, inplace = True )# for my better understanding

In [11]:
news_dataset.head() # first five rows of the data

Unnamed: 0,title,news_url,source_domain,tweet_num,label
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [12]:
news_dataset.sample(5)# random 5 samples

Unnamed: 0,title,news_url,source_domain,tweet_num,label
6327,Robert Pattinson Is Really Crushing On Margot ...,okmagazine.com/photos/robert-pattinson-crush-o...,okmagazine.com,10,0
17802,Law & Order: Special Victims Unit (season 1),https://en.wikipedia.org/wiki/Law_%26_Order:_S...,en.wikipedia.org,32,1
20965,19 Years In The Life Of Jessica Simpson,www.businessinsider.com/jessica-simpson-2011-11,www.businessinsider.com,34,0
21124,Jennifer Aniston And Katy Perry Not Fighting O...,www.inquisitr.com/4826781/jennifer-aniston-and...,www.inquisitr.com,14,0
19156,Kim Kardashian shares adorable video of Saint ...,www.mirror.co.uk/3am/celebrity-news/kim-kardas...,www.mirror.co.uk,2,0


In [13]:
# check for the missing values
news_dataset.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
label              0
dtype: int64

In [14]:
# 330 news_url & source_domains are missing here, but we have enough dataset to train our model we can either drop it or impute it
# we can impute it with null strings

In [15]:
# replacing null values with empty/null strings 
news_dataset = news_dataset.fillna(' ') # this is a null string

In [16]:
news_dataset['label'].value_counts(normalize = True)# so we have the balanced, proportionate data

label
1    0.751897
0    0.248103
Name: proportion, dtype: float64

In [17]:
# Stemming is a process of reducing a word to base form
# example:
# actor, actress, acting--> act
# in vectorization we will convert these words into the numeric feature vectores which will be fed into the machine

In [18]:
port_stem  = PorterStemmer() # created the instance 

In [19]:
# creating a user defined function for stemming
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)# regex, removed everythiung other than letters A-Za-z
    stemmed_content = stemmed_content.lower() # lowercase
    stemmed_content = stemmed_content.split() # splited text into list of words strings
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [20]:
news = news_dataset['title'].apply(stemming) # applying the stemming to the title column, because that is the content which we will be using

In [21]:
news.head()

0    kandi burruss explod rape accus real housew at...
1               peopl choic award best red carpet look
2    sophia bush send sweet birthday messag one tre...
3    colombian singer maluma spark rumour inappropr...
4    gossip girl year later upper east sider shock ...
Name: title, dtype: object

In [22]:
# separating the data # we have text in X variable & label in y variable
X = news.values
y = news_dataset['label'].values
print(X)
print(y)

['kandi burruss explod rape accus real housew atlanta reunion video'
 'peopl choic award best red carpet look'
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva'
 ... 'jessica chastain recal moment mother boyfriend slap kick genit'
 'tristan thompson feel dump khlo kardashian refus let move la home exclus'
 'kelli clarkson perform medley kendrick lamar humbl hit billboard music award']
[1 1 1 ... 1 0 1]


In [23]:
print(X.shape, y.shape) 

(23196,) (23196,)


In [24]:
# converting textual data into numerical data
vectorizer = TfidfVectorizer()# defined object 
vectorizer.fit(X)
X = vectorizer.transform(X) # to vectorize the text data into numeric feature vectors, first it will learn than transform

In [25]:
# Tfidf
# term frequency: no of time a word is repeating in a document
# inverse document frequency: it provides heighest weightage is given to less occuring but more meaningful/significant word

In [26]:
print(X) # numeric feature vectors easily machine can understand text

  (0, 11959)	0.22178153496429145
  (0, 9287)	0.2664785410247072
  (0, 9020)	0.2424928018856143
  (0, 8975)	0.31659115198843385
  (0, 5895)	0.40600285234371225
  (0, 5246)	0.2737247638213213
  (0, 3739)	0.3720213547026605
  (0, 1524)	0.40600285234371225
  (0, 576)	0.3220576151506555
  (0, 53)	0.2721149519149911
  (1, 9079)	0.387076508000102
  (1, 8315)	0.4067948402953306
  (1, 6573)	0.3420111696667716
  (1, 1975)	0.43378159796649224
  (1, 1710)	0.3978429114527965
  (1, 986)	0.34762560771735407
  (1, 643)	0.31690546637483147
  (2, 11479)	0.3036563626822405
  (2, 10939)	0.2324931756527281
  (2, 10595)	0.148591495027414
  (2, 10407)	0.2836383240767164
  (2, 9880)	0.2644920245799396
  (2, 7939)	0.20215096479825698
  (2, 7117)	0.23353803309354323
  (2, 5104)	0.24183607544147448
  :	:
  (23193, 1894)	0.3741692367316018
  (23193, 1278)	0.2589163123449227
  (23194, 11508)	0.2849153309895167
  (23194, 11212)	0.2829354407102931
  (23194, 9115)	0.334043323633429
  (23194, 7432)	0.29588976326311933

In [27]:
# spliting data set to training & test data for model traing & evaluation
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1, stratify = y)

In [28]:
# training the model 
logreg = LogisticRegression()
logreg.fit(X_train,y_train) # it will plot the sigmoid graph

In [29]:
# evaluation on training data
y_pred_train= logreg.predict(X_train)
print('Accuracyscore :', round(accuracy_score(y_pred_train,y_train),2)*100) 
print('Classification Report :\n', classification_report(y_pred_train,y_train)) 
print('Confusion Matrix: \n',confusion_matrix(y_pred_train,y_train)) 

Accuracyscore : 86.0
Classification Report :
               precision    recall  f1-score   support

           0       0.52      0.87      0.65      2740
           1       0.97      0.86      0.91     15816

    accuracy                           0.86     18556
   macro avg       0.75      0.86      0.78     18556
weighted avg       0.91      0.86      0.87     18556

Confusion Matrix: 
 [[ 2384   356]
 [ 2220 13596]]


In [30]:
# evaluation on testing data
y_pred_test= logreg.predict(X_test)
print('Accuracyscore :', round(accuracy_score(y_pred_test,y_test),2)*100) 
print('Classification Report :\n', classification_report(y_pred_test,y_test)) 
print('Confusion Matrix: \n',confusion_matrix(y_pred_test,y_test)) 

Accuracyscore : 83.0
Classification Report :
               precision    recall  f1-score   support

           0       0.44      0.77      0.56       660
           1       0.96      0.84      0.89      3980

    accuracy                           0.83      4640
   macro avg       0.70      0.80      0.73      4640
weighted avg       0.88      0.83      0.85      4640

Confusion Matrix: 
 [[ 507  153]
 [ 644 3336]]


In [31]:
# though we dont need regularization as there is just reduction of 3% from training to testing accuracy 
# but just to check what happens when we are doing regularization & using grid serach cv

In [32]:
parameters = {'C':[1,2,3,4,10], 'penalty': ['l1','l2','elasticnet'], 'max_iter':[100,200,300]}

In [33]:
from sklearn.model_selection import GridSearchCV
Model2 = GridSearchCV(estimator = logreg, param_grid = parameters, cv = 5, scoring = 'accuracy' )

In [34]:
Model2.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
150 fits failed out of a total of 225.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the f

In [35]:
# evaluation on testing data
y_pred_test= Model2.predict(X_test)
print('Accuracyscore :', round(accuracy_score(y_pred_test,y_test),2)*100) 
print('Classification Report :\n', classification_report(y_pred_test,y_test)) 
print('Confusion Matrix: \n',confusion_matrix(y_pred_test,y_test)) 

Accuracyscore : 83.0
Classification Report :
               precision    recall  f1-score   support

           0       0.52      0.73      0.61       818
           1       0.94      0.86      0.89      3822

    accuracy                           0.83      4640
   macro avg       0.73      0.79      0.75      4640
weighted avg       0.86      0.83      0.84      4640

Confusion Matrix: 
 [[ 599  219]
 [ 552 3270]]


In [36]:
# so we can see evening using Hyper parameters our accuracy is same 83%

## Making Predictive System

In [37]:
X_new = X_test[0]
# we are just taking one instance from the X-test data & saving it as new data to make the prediction on ot whether its a real/fake news

prediction = logreg.predict(X_new)
print(prediction)

if (prediction[0] == 0):
    print('The news is real')
else:
    print('The news is fake')

[1]
The news is fake


In [38]:
# to reconfirm the label we can see it 
print(y_test[0])

1


In [39]:
print(y_test[100])

0


## Another model, Naive Bayes

In [40]:
from sklearn.naive_bayes import BernoulliNB

In [41]:
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

In [42]:
# Evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8189655172413793

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      1151
           1       0.87      0.90      0.88      3489

    accuracy                           0.82      4640
   macro avg       0.76      0.74      0.75      4640
weighted avg       0.81      0.82      0.81      4640



## Making Predictive Model with Naive Bayes

In [43]:
X_new = X_test[0]

prediction = classifier.predict(X_new)
print(prediction)

if (prediction[0] == 0):
    print('The news is real')
else:
    print('The news is fake')

[1]
The news is fake


In [None]:
# both the models are predicting correctly