In [4]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re 
import joblib
import string 

In [5]:
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

In [6]:
fake['class'] = 0
true['class'] = 1

In [7]:
data = pd.concat([fake,true],axis=0)

In [8]:
data.sample(10)

Unnamed: 0,title,text,subject,date,class
11481,"MAXINE WATERS: Obama’s Left A “very, very powe...","At around the 4:00 mark in the video, Maxine p...",politics,"Mar 6, 2017",0
18654,MEDIA IGNORES Time That Bill Clinton FIRED His...,"In its 109-year history, only one F.B.I. direc...",left-news,"May 10, 2017",0
13654,Pentagon confirms 'probable' North Korean miss...,WASHINGTON (Reuters) - The Pentagon said on Tu...,worldnews,"November 28, 2017",1
12936,CHELSEA CLINTON Lashes Out At TRUMP “Hate Spee...,Has she seen what s going on with her mom s bu...,politics,"Sep 23, 2016",0
8576,Clinton: 'I will be there' for presidential de...,"ASHLAND, Ohio (Reuters) - Democratic president...",politicsNews,"July 31, 2016",1
88,Trump Admin. Tosses Out Another Obama Rule – ...,President Donald Trump has overturned another ...,News,"November 16, 2017",0
7387,"How the polls, including ours, missed Trump's ...","NEW YORK/LONDON (Reuters) - Two days ago, poll...",politicsNews,"November 9, 2016",1
11540,Somalia releases jailed ex-minister and govern...,MOGADISHU (Reuters) - A court in Somalia on Th...,worldnews,"December 21, 2017",1
7140,White SC Cops Sexually Assault Black Couple D...,A video published by The Washington Post shows...,News,"April 2, 2016",0
1635,China urges North Korea not to go further in a...,UNITED NATIONS (Reuters) - China’s foreign min...,politicsNews,"September 21, 2017",1


In [9]:
data = data.drop(['title','subject','date'],axis=1)

In [10]:
data.reset_index(inplace=True)

In [11]:
data.drop(['index'],axis=1,inplace=True)

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]',"", text) 
    text = re.sub("\\W", " ", text) 
    text = re.sub("https?:://\S+|www\.\S+", "", text) 
    text = re.sub("<.*?>+", "", text) 
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text) 
    text = re.sub("\n", "", text) 
    text = re.sub("\w*\d\w*", "", text) 
    return text 

In [13]:
# apply cleaning 
data["text"] = data["text"].apply(clean_text) #split 
x = data["text"] 
y = data["class"] 
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

In [14]:
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(xtrain)
xv_test = vectorizer.transform(xtest)



In [15]:
lr = LogisticRegression(max_iter=1000)
lr.fit(xv_train, ytrain)

In [16]:
prediction = lr.predict(xv_test)
accuracy = lr.score(xv_test, ytest)
print("Accuracy:", accuracy)
print("Accuracy in % :", accuracy*100)

Accuracy: 0.9858351893095768
Accuracy in % : 98.58351893095768


In [17]:
print(classification_report(ytest,prediction))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5895
           1       0.98      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [18]:
joblib.dump(vectorizer, 'vectorizer.jb')
joblib.dump(lr,"lr.jb")

['lr.jb']