<a href="https://colab.research.google.com/github/fiifidawson/Google-Colab/blob/main/Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Selection & Preprocessing

In [10]:
# Importing Libraries
import pandas as pd
import numpy as np

In [11]:
file_path_true = 'sample_data/True.csv'
true = pd.read_csv(file_path_true)

# New section

In [12]:
file_path_fake = 'sample_data/Fake.csv'
fake = pd.read_csv(file_path_fake)


In [23]:
# Checking out the content in each table
# true.head()
#fake.head()

In [13]:
# Assigning labels
true['label'] = 1
fake['label'] = 0

In [15]:
# Merging the table row wise
news = pd.concat([fake, true], axis=0)

In [16]:
news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [17]:
news.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [18]:
news.isnull()

Unnamed: 0,title,text,subject,date,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
21412,False,False,False,False,False
21413,False,False,False,False,False
21414,False,False,False,False,False
21415,False,False,False,False,False


In [19]:
# Using relevant info
news = news.drop(['title', 'subject', 'date'], axis=1)



In [20]:
news.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [21]:
# Orgainzing the news(Shuffle)
news = news.sample(frac=1)


In [22]:
news.head()

Unnamed: 0,text,label
7342,It s clear Ted Cruz isn t even trying anymore....,0
10927,(Reuters) - Employees of U.S. companies seekin...,1
8897,Worst governor in the United States Paul LePag...,0
21684,But what about that whole unfair student debt...,0
5761,The following bullet points are from the U.S. ...,1


In [23]:
news.reset_index(inplace=True)


In [24]:
news.head()

Unnamed: 0,index,text,label
0,7342,It s clear Ted Cruz isn t even trying anymore....,0
1,10927,(Reuters) - Employees of U.S. companies seekin...,1
2,8897,Worst governor in the United States Paul LePag...,0
3,21684,But what about that whole unfair student debt...,0
4,5761,The following bullet points are from the U.S. ...,1


In [25]:
news.drop(['index'], axis=1, inplace = True)

In [26]:
news.head()

Unnamed: 0,text,label
0,It s clear Ted Cruz isn t even trying anymore....,0
1,(Reuters) - Employees of U.S. companies seekin...,1
2,Worst governor in the United States Paul LePag...,0
3,But what about that whole unfair student debt...,0
4,The following bullet points are from the U.S. ...,1


## Feature Extraction

In [27]:
# Importing regular expression model in python
import re

In [29]:
def wordopt(text):
  # Converting text to lower case
  text = text.lower()

  # Removing URLs
  text = re.sub(r'https?://\S+|www\.\S+', '', text)

  # Removing HTML Tags
  text = re.sub(r'<.*?>', '', text)

  # Removing punctuations
  text = re.sub(r'[^\w\s]', '', text)

  # Removing digits
  text = re.sub(r'\d', '', text)

  # Removing newline characters
  text = re.sub(r'\n', ' ', text)

  return text

In [30]:
news['text'] = news['text'].apply(wordopt)


In [31]:
news['text']

0        it s clear ted cruz isn t even trying anymore ...
1        reuters  employees of us companies seeking to ...
2        worst governor in the united states paul lepag...
3        but what about that whole unfair  student debt...
4        the following bullet points are from the us pr...
                               ...                        
44893    tokyotaipei reuters  women in the asiapacific ...
44894    washingtonnew york reuters  the frequent trave...
44895    new york reuters  us presidential candidate hi...
44896    washington reuters  two white house officials ...
44897    it s hard to imagine and very sad that a hospi...
Name: text, Length: 44898, dtype: object

In [32]:
# Converting into vectors
x = news['text']
y = news['label']

In [33]:
# Grouping into training and test data
from sklearn.model_selection import train_test_split

In [50]:
x_train, x_test, y_train ,y_test = train_test_split(x, y, test_size=0.3)


In [51]:
x_train.shape

(31428,)

In [54]:
x_test.shape

(13470,)

In [55]:
# converting textual data to numerical
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
vectorization = TfidfVectorizer()

In [57]:
xv_train = vectorization.fit_transform(x_train)

In [58]:
xv_test = vectorization.transform(x_test)

In [59]:
xv_train

<31428x174861 sparse matrix of type '<class 'numpy.float64'>'
	with 6452698 stored elements in Compressed Sparse Row format>

In [60]:
xv_test

<13470x174861 sparse matrix of type '<class 'numpy.float64'>'
	with 2727873 stored elements in Compressed Sparse Row format>

In [61]:
# Creating ML Model

# Logistic Regression
from sklearn.linear_model import LogisticRegression

In [62]:
LR = LogisticRegression()

In [63]:
# Training model with data
LR.fit(xv_train, y_train)

In [64]:
pred_lr = LR.predict(xv_test)

In [65]:
LR.score(xv_test, y_test)

0.9874536005939124

In [67]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7033
           1       0.98      0.99      0.99      6437

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470

