In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is used in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Print the English stopword list provided by the NLTK stopwords corpus.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Combining the fake and real news datasets

In [10]:
# Load both source files and tag every row with its class: 1 = real, 0 = fake.
real_news = pd.read_csv("True.csv").assign(label=1)
fake_news = pd.read_csv("Fake.csv").assign(label=0)

# Stack the two frames, shuffle the rows reproducibly, and renumber the index.
combined_data = (
    pd.concat([real_news, fake_news], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the shuffled dataset without writing the index as a column.
combined_data.to_csv("combined_news.csv", index=False)

print("Combined Dataset created successfully")
print("Shape: ", combined_data.shape)
print(combined_data.head())

Combined Dataset created successfully
Shape:  (44898, 5)
                                               title  ... label
0   BREAKING: GOP Chairman Grassley Has Had Enoug...  ...     0
1   Failed GOP Candidates Remembered In Hilarious...  ...     0
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...  ...     0
3  California AG pledges to defend birth control ...  ...     1
4  AZ RANCHERS Living On US-Mexico Border Destroy...  ...     0

[5 rows x 5 columns]


Data Pre-Processing

In [11]:
# Load the combined dataset into a pandas DataFrame.
# Use the same relative path the file was written to ("combined_news.csv")
# instead of the Colab-specific absolute path '/content/...', so the notebook
# also runs when the working directory is not /content.
news_dataset = pd.read_csv("combined_news.csv")


In [12]:
# (rows, columns) of the loaded DataFrame
news_dataset.shape

(44898, 5)

In [13]:
# Per-column count of missing values in the dataset
news_dataset.isna().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [14]:
# Substitute an empty string for every missing value so the text columns
# can be concatenated safely.
news_dataset = news_dataset.fillna(value='')

In [17]:
# Build one text field per article: the title, a single space, then the body.
news_dataset['content'] = (
    news_dataset['title'] + ' ' + news_dataset['text']
)

In [18]:
# Preview the merged title + text column.
print(news_dataset['content'])

0         BREAKING: GOP Chairman Grassley Has Had Enoug...
1         Failed GOP Candidates Remembered In Hilarious...
2         Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3        California AG pledges to defend birth control ...
4        AZ RANCHERS Living On US-Mexico Border Destroy...
                               ...                        
44893    Nigeria says U.S. agrees delayed $593 million ...
44894    Boiler Room #62 – Fatal Illusions Tune in to t...
44895    ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...
44896    Republican tax plan would deal financial hit t...
44897    U.N. refugee commissioner says Australia must ...
Name: content, Length: 44898, dtype: object


In [19]:
# Features: every column except the target. Labels: the 'label' column alone.
X = news_dataset.drop(columns=['label'])
Y = news_dataset['label']

In [20]:
# Show the feature frame followed by the label series, one after the other.
print(X, Y, sep='\n')

                                                   title  ...                                            content
0       BREAKING: GOP Chairman Grassley Has Had Enoug...  ...   BREAKING: GOP Chairman Grassley Has Had Enoug...
1       Failed GOP Candidates Remembered In Hilarious...  ...   Failed GOP Candidates Remembered In Hilarious...
2       Mike Pence’s New DC Neighbors Are HILARIOUSLY...  ...   Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3      California AG pledges to defend birth control ...  ...  California AG pledges to defend birth control ...
4      AZ RANCHERS Living On US-Mexico Border Destroy...  ...  AZ RANCHERS Living On US-Mexico Border Destroy...
...                                                  ...  ...                                                ...
44893  Nigeria says U.S. agrees delayed $593 million ...  ...  Nigeria says U.S. agrees delayed $593 million ...
44894                  Boiler Room #62 – Fatal Illusions  ...  Boiler Room #62 – Fatal Illusions

Stemming (the process of reducing a word to its root word)

e.g. acting, acted, acts --> act

In [21]:
# Single PorterStemmer instance, used by stemming() to stem each word.
port_stem = PorterStemmer()

In [22]:
# Build the stopword set once. Evaluating stopwords.words('english') inside the
# comprehension repeats the corpus lookup for every single word, and membership
# in a list is a linear scan; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
  """Normalise raw article text into a string of stemmed tokens.

  Every non-letter character is replaced by a space, the text is lowercased
  and split on whitespace, English stopwords are dropped, and each remaining
  word is reduced to its stem with the Porter stemmer.

  Args:
    content: Raw text of one article (a string).

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no
    token survives the filtering.
  """
  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  return ' '.join(
      port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
  )

In [24]:
# Run stemming() over every article; this overwrites the raw 'content' column.
news_dataset['content'] = news_dataset['content'].map(stemming)

In [25]:
# Preview the 'content' column after stemming has been applied.
print(news_dataset['content'])

0        break gop chairman grassley enough demand trum...
1        fail gop candid rememb hilari mock eulog video...
2        mike penc new dc neighbor hilari troll homopho...
3        california ag pledg defend birth control insur...
4        az rancher live us mexico border destroy nanci...
                               ...                        
44893    nigeria say u agre delay million fighter plane...
44894    boiler room fatal illus tune altern current ra...
44895    atheist sue governor texa display capitol grou...
44896    republican tax plan would deal financi hit u u...
44897    u n refuge commission say australia must stop ...
Name: content, Length: 44898, dtype: object


In [34]:
# Pull the stemmed text (features) and the labels out of the DataFrame
# as plain arrays.
X, Y = news_dataset['content'].values, news_dataset['label'].values

In [35]:
# Show the array of stemmed article texts.
print(X)

['break gop chairman grassley enough demand trump jr testimoni donald trump white hous chao tri cover russia problem mount hour refus acknowledg problem surround fake news hoax howev fact bear thing differ seem crack congression public leadership chuck grassley r iowa head senat judiciari committe fed demand donald trump jr former trump campaign manag paul manafort testifi committe regard infam shadi meet donald trump shadi russian lawyer promis dirt democrat presidenti nomine hillari clinton fact inform due well demand send signal team trump notabl fire special counsel robert mueller circumst despit fact seem seem trump white hous lay groundwork speak speak tweet regard grassley warn also anyon think senat grassley rest senat seriou need look warn alreadi given trump jr manafort either follow order serv subpoena forc compli refus held contempt congress carri seriou jail time even cruel craven creatur within gop sick donald trump corrupt scandal ridden white hous angri stage hostil tak

In [36]:
# Show the label array (1 = real, 0 = fake).
print(Y)

[0 0 0 ... 0 1 1]


In [37]:
# Convert the stemmed text into TF-IDF feature vectors.
# fit_transform learns the vocabulary and IDF weights and vectorises the corpus
# in one pass, instead of tokenising everything twice with fit() + transform().
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the IDF statistics include test documents. Consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [38]:
# Show the sparse matrix returned by the vectorizer.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6904081 stored elements and shape (44898, 89868)>
  Coords	Values
  (0, 570)	0.06055486453372135
  (0, 2286)	0.04253552689623891
  (0, 2301)	0.025666202113816185
  (0, 3031)	0.063093661763644
  (0, 3388)	0.047259753130124114
  (0, 6574)	0.07041583382179394
  (0, 7853)	0.0898677958851911
  (0, 7886)	0.059913473329118644
  (0, 9454)	0.048097596645320734
  (0, 10324)	0.08231690448262517
  (0, 11174)	0.03296114815015033
  (0, 11631)	0.04919300578107092
  (0, 12522)	0.05138750652591612
  (0, 12642)	0.06680808165549404
  (0, 13496)	0.06811316396970238
  (0, 13713)	0.07112856852592032
  (0, 13991)	0.053000278097355684
  (0, 14098)	0.037972261733148126
  (0, 14836)	0.0869210456407381
  (0, 14928)	0.06842646468851935
  (0, 15149)	0.04165264736324387
  (0, 15153)	0.051876038877662234
  (0, 15333)	0.08494945735141231
  (0, 15663)	0.056535937666043194
  (0, 15765)	0.029199968119368196
  :	:
  (44897, 77773)	0.03757368913834899
  (44897,

Splitting the dataset into train and test data

In [39]:
# Hold out 20% of the rows for testing; stratify on Y and fix the seed so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the Model: Logistic Regression

In [40]:
# Logistic regression classifier with all default settings.
model=LogisticRegression()

In [41]:
# Train the classifier on the training split only.
model.fit(X_train,Y_train)

Evaluation of our model (accuracy score)

In [49]:
# Accuracy on the training data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [50]:
# Report the accuracy measured on the training split.
print("Accuracy score of training data: ",training_data_accuracy)

Accuracy score of training data:  0.9919260537891865


In [51]:
# Accuracy on the held-out test data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [52]:
# Report the accuracy measured on the test split.
print("Accuracy score of test data: ",test_data_accuracy)

Accuracy score of test data:  0.9865256124721603


Making a predictive system

In [58]:
# Classify a single held-out article (row 363 of the test matrix).
X_new = X_test[363]

prediction = model.predict(X_new)
print(prediction)

# Label 0 means fake; anything else is reported as real.
print("The news is fake" if prediction[0] == 0 else "The news is real")

[0]
The news is fake


In [59]:
# True label of the same test row, for comparison with the prediction above.
print(Y_test[363])

0
