In [None]:
# importing the necessary dependencies
import numpy as np
import pandas as pd
# for searching text in a document
import re
# importing natural language toolkit to remove non-value adding words
from nltk.corpus import stopwords
# for getting the root word and removing the prefix/suffix
from nltk.stem.porter import PorterStemmer
# for converting text into feature vectors or numbers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# fetch the NLTK stop-word corpus so that stopwords.words('english') can be read in the cells below
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# printing the English stop words; these are the words the stemming step filters out of the text
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Loading the fake news dataset into pandas data frame
# NOTE(review): the columns reported for this file are id, title, author and text; there is no
# 'label' column, yet later cells read news_dataset['label']. Presumably a labelled file should
# be loaded here instead - TODO confirm which CSV is intended.
news_dataset = pd.read_csv('/content/test.csv')



In [None]:
news_dataset.shape

(5200, 4)

In [None]:
news_dataset.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [None]:
news_dataset.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [None]:
# Rows with missing values are kept rather than dropped: every missing entry is replaced
# with an empty string, so the text columns contain no NaN afterwards
news_dataset = news_dataset.fillna('')

In [None]:
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [None]:
# The model's text feature is "<author> <title>"; this combination is used because it
# returns a good accuracy score.
news_dataset['content'] = (
    news_dataset['author'] + ' ' + news_dataset['title']
)
print(news_dataset['content'])

0       David Streitfeld Specter of Trump Loosens Tong...
1        Russian warships ready to strike terrorists n...
2       Common Dreams #NoDAPL: Native American Leaders...
3       Daniel Victor Tim Tebow Will Attempt Another C...
4       Truth Broadcast Network Keiser Report: Meme Wa...
                              ...                        
5195    Jody Rosen The Bangladeshi Traffic Jam That Ne...
5196    Sheryl Gay Stolberg John Kasich Signs One Abor...
5197    Mike McPhate California Today: What, Exactly, ...
5198     300 US Marines To Be Deployed To Russian Bord...
5199    Teddy Wayne Awkward Sex, Onscreen and Off - Th...
Name: content, Length: 5200, dtype: object


In [None]:
# Separating the data and the label.
# The frame loaded earlier in this notebook shows no 'label' column, so check for it
# explicitly and stop with an actionable message instead of an opaque pandas KeyError.
if 'label' not in news_dataset.columns:
  raise KeyError("news_dataset has no 'label' column; load a labelled dataset before separating X and Y")
# columns= already names the axis, so the redundant axis=1 is not passed
X = news_dataset.drop(columns='label')
Y = news_dataset['label']

In [None]:
# Stemming :
# Stemming is the process of reducing a word to a root word by stripping its suffix
# eg. : acting, acted, acts ----> act is the root word
# one PorterStemmer instance is created here and reused by the stemming() function
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once: stopwords.words('english') returns a list, so calling it
# for every token and scanning it linearly is wasted work. A frozenset gives fast membership tests.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def stemming(content):
  """Normalise a piece of text into a space-separated string of stemmed words.

  Steps: replace every character that is not a-z/A-Z with a space, lower-case the
  result, split on whitespace, drop English stop words, stem each remaining word
  with the shared ``port_stem`` stemmer, and join the stems with single spaces.

  Parameters
  ----------
  content : str
      Raw text to clean.

  Returns
  -------
  str
      The stemmed words joined by spaces (empty string if nothing remains).
  """
  # digits and punctuation become spaces so they act as word separators
  letters_only = re.sub('[^a-zA-Z]',' ',content)
  # lower-casing so that capitalisation is not treated as significant
  tokens = letters_only.lower().split()
  # remove stop words and stem the rest of the words
  stems = [port_stem.stem(word) for word in tokens if word not in ENGLISH_STOP_WORDS]
  return ' '.join(stems)


In [None]:
# applying the stemming to news_dataset
# note: the 'content' column is overwritten in place, so re-running this cell
# passes the already-stemmed text through stemming() again
news_dataset['content'] = news_dataset['content'].apply(stemming)


In [None]:
print(news_dataset['content'])

In [None]:
# pull the cleaned text (features) and the labels out of the frame as NumPy arrays
X = news_dataset['content'].values
Y = news_dataset['label'].values
# show each array followed by its shape
print(X, X.shape, sep='\n')
print(Y, Y.shape, sep='\n')

In [None]:
# converting the textual data to numerical data: the model cannot work on raw strings,
# so each document is turned into a TF-IDF weighted feature vector
# NOTE(review): the vectorizer is fitted on every row before the train/test split that
# follows, so the rows later used for testing also shape the vocabulary and IDF weights
vectorizer = TfidfVectorizer()
# learn the vocabulary/IDF and produce the transformed matrix in one call
X = vectorizer.fit_transform(X)


In [None]:
print(X)
# Now we'll feed this data into our machine learning model

In [None]:
# Hold out 20% of the rows for testing.
# stratify=Y keeps the proportion of 0 and 1 labels the same in the train and test parts;
# random_state=2 fixes the shuffle so different users get the same split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
print(X_train.shape)
print(Y_test.shape)

In [None]:
# making and training our logistic regression model
# logistic regression turns a linear score z into a probability with the sigmoid 1/(1+e^-z)
model = LogisticRegression()
model.fit(X_train,Y_train)

# accuracy on the same data the model was fitted on
training_data_prediction = model.predict(X_train)
# accuracy_score takes the true labels first and the predictions second
training_accuracy_score = accuracy_score(Y_train, training_data_prediction)

print('training = ' , training_accuracy_score)

In [None]:
# accuracy on the held-out test split, which the model was not fitted on
test_data_prediction = model.predict(X_test)
# accuracy_score takes the true labels first and the predictions second
test_accuracy_score = accuracy_score(Y_test, test_data_prediction)

print('test = ' , test_accuracy_score)

In [None]:
# a small predictive system: classify the first row of the test split
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

# a predicted 0 is reported as real news, anything else as fake news
print("This is a real news :)" if prediction[0] == 0 else "This is a fake news :(")

# the stored label for the same row, for comparison
print('Correct answer is : ', Y_test[0])