### Classify the News into Fake or Real using Logistic Regression

In [2]:
! kaggle competitions download -c fake-news

Downloading fake-news.zip to d:\git_repo\Machine_Learning_Projects\FakeNews_Prediction




  0%|          | 0.00/46.5M [00:00<?, ?B/s]
  2%|▏         | 1.00M/46.5M [00:01<00:53, 896kB/s]
  4%|▍         | 2.00M/46.5M [00:01<00:28, 1.62MB/s]
  6%|▋         | 3.00M/46.5M [00:01<00:19, 2.39MB/s]
  9%|▊         | 4.00M/46.5M [00:01<00:15, 2.93MB/s]
 11%|█         | 5.00M/46.5M [00:02<00:13, 3.23MB/s]
 13%|█▎        | 6.00M/46.5M [00:02<00:12, 3.49MB/s]
 15%|█▌        | 7.00M/46.5M [00:02<00:10, 3.78MB/s]
 17%|█▋        | 8.00M/46.5M [00:02<00:10, 3.89MB/s]
 19%|█▉        | 9.00M/46.5M [00:03<00:10, 3.85MB/s]
 22%|██▏       | 10.0M/46.5M [00:03<00:10, 3.69MB/s]
 24%|██▎       | 11.0M/46.5M [00:03<00:09, 3.98MB/s]
 26%|██▌       | 12.0M/46.5M [00:03<00:08, 4.23MB/s]
 28%|██▊       | 13.0M/46.5M [00:04<00:08, 4.33MB/s]
 30%|███       | 14.0M/46.5M [00:04<00:08, 4.25MB/s]
 32%|███▏      | 15.0M/46.5M [00:04<00:07, 4.43MB/s]
 34%|███▍      | 16.0M/46.5M [00:04<00:07, 4.41MB/s]
 37%|███▋      | 17.0M/46.5M [00:05<00:07, 4.32MB/s]
 39%|███▊      | 18.0M/46.5M [00:05<00:06, 4.40MB/s]
 4

In [4]:
import numpy as np
import pandas as pd
import re # used for text searching
from nltk.corpus import stopwords # remove a words that do have a weight
from nltk.stem.porter import PorterStemmer # remove prefix
from sklearn.feature_extraction.text import TfidfVectorizer # convert text to number
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
news_train_ds = pd.read_csv('./train.csv')
news_train_ds.shape

(20800, 5)

In [6]:
news_train_ds.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
news_train_ds['label'].value_counts() # 1 --> Fake news. 0 --> Real news.

label
1    10413
0    10387
Name: count, dtype: int64

In [10]:
news_train_ds.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# replacing the null values with empty string
news_train_ds = news_train_ds.fillna('')

In [13]:
news_train_ds['content'] = news_train_ds['author']+' '+news_train_ds['title']

In [14]:
news_train_ds

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [15]:
news_train_ds['content']

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object

In [16]:
X = news_train_ds.drop(columns='label', axis=1)
y = news_train_ds['label']
X.shape, y.shape

((20800, 5), (20800,))

In [17]:
X

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [18]:
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

In [23]:
# stem --> process of reducing words to its root word
# actor, actress, acting --root word --> act

port_stem = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word is stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [24]:
news_train_ds['content'] = news_train_ds['content'].apply(stemming)

In [25]:
news_train_ds['content']

0        darrel lucu hous dem aid we didn t even see co...
1        daniel j flynn flynn hillari clinton big woman...
2        consortiumnew com whi the truth might get you ...
3        jessica purkiss civilian kill in singl us airs...
4        howard portnoy iranian woman jail for fiction ...
                               ...                        
20795    jerom hudson rapper t i trump a poster child f...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc and rachel abram maci s i...
20798    alex ansari nato russia to hold parallel exerc...
20799                   david swanson what keep the f aliv
Name: content, Length: 20800, dtype: object

In [27]:
X = news_train_ds['content'].values
y = news_train_ds['label'].values

In [28]:
X, y

(array(['darrel lucu hous dem aid we didn t even see comey s letter until jason chaffetz tweet it',
        'daniel j flynn flynn hillari clinton big woman on campu breitbart',
        'consortiumnew com whi the truth might get you fire', ...,
        'michael j de la merc and rachel abram maci s is said to receiv takeov approach by hudson s bay the new york time',
        'alex ansari nato russia to hold parallel exercis in balkan',
        'david swanson what keep the f aliv'], dtype=object),
 array([1, 0, 1, ..., 0, 1, 1], dtype=int64))

In [29]:
X.shape, y.shape

((20800,), (20800,))

In [35]:
# converting the textual data to numerical data

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
print(X)

  (0, 7642)	0.15585882712957794
  (0, 15767)	0.2530073032987084
  (0, 2501)	0.32655231027754433
  (0, 7734)	0.22014490291665442
  (0, 16060)	0.28379135534554073
  (0, 8673)	0.25946859467111033
  (0, 2977)	0.21925031305348408
  (0, 13533)	0.22790561725543848
  (0, 4997)	0.20723456165570228
  (0, 4028)	0.26352190272448234
  (0, 16591)	0.19185657544008947
  (0, 272)	0.2399067450673633
  (0, 3811)	0.2402904504663692
  (0, 7041)	0.1942886495938918
  (0, 8953)	0.3229500947064574
  (0, 3619)	0.31966152954272303
  (1, 1910)	0.15325078862748648
  (1, 2241)	0.3778771044604833
  (1, 10754)	0.15877335700396866
  (1, 16897)	0.29690287265706966
  (1, 1512)	0.29025992042009807
  (1, 2831)	0.1885236047225887
  (1, 6850)	0.1880499691597717
  (1, 5528)	0.7052687008120976
  (1, 3587)	0.2603921904305678
  :	:
  (20797, 7614)	0.12723270154056254
  (20797, 7079)	0.21135740479289314
  (20797, 9634)	0.1692421112371834
  (20797, 534)	0.10034413229326822
  (20797, 15374)	0.07910988599881484
  (20797, 17096)	0.0

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [38]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16640, 17232), (4160, 17232), (16640,), (4160,))

In [40]:
model = LogisticRegression()
model

In [41]:
model.fit(X=X_train, y=y_train)

In [42]:
train_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred=train_pred, y_true=y_train)

y_pred = model.predict(X=X_test)
test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)

print(f'Train accuracy: {train_acc}, and Test accuracy: {test_acc}')

Train accuracy: 0.9865985576923076, and Test accuracy: 0.973798076923077


In [52]:
in_data = X_test[344]

In [53]:
prediction = model.predict(in_data)

if (prediction[0] == 0):
    print("Real News")
else:
    print("Fake News")

print(y_test[0])

Real News
0
