<a href="https://colab.research.google.com/github/fiifidawson/Google-Colab/blob/main/Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Selection & Preprocessing

In [17]:
# Importing Libraries
import pandas as pd
import numpy as np

In [21]:
# Read the genuine-news articles from the CSV (path is relative to the working directory)
file_path_true = 'sample_data/True.csv'
true = pd.read_csv(filepath_or_buffer=file_path_true)

### Loading the fake-news dataset

In [20]:
# Read the fake-news articles from the CSV (path is relative to the working directory)
file_path_fake = 'sample_data/Fake.csv'
fake = pd.read_csv(filepath_or_buffer=file_path_fake)


In [22]:
# Checking out the content in each table
# true.head()
#fake.head()

In [23]:
# Add the target column to both frames: 1 for rows from `true`, 0 for rows from `fake`
true['label'], fake['label'] = 1, 0

In [24]:
# Stack the fake rows on top of the true rows into a single frame
news = pd.concat(objs=[fake, true], axis='index')

In [None]:
news.head()

In [None]:
news.tail()

In [None]:
# Count missing values per column; the raw boolean frame is too large to read at a glance
news.isnull().sum()

In [25]:
# Discard the title, subject and date columns; they are not used further down
news = news.drop(columns=['title', 'subject', 'date'])



In [None]:
news.head()

In [26]:
# Organizing the news (shuffle); the fixed seed makes the row order reproducible across runs
news = news.sample(frac=1, random_state=42)


In [None]:
news.head()

In [27]:
# Renumber the shuffled rows; the previous index values are moved into an 'index' column
news = news.reset_index()


In [None]:
news.head()

In [28]:
# Remove the 'index' column that holds the pre-shuffle row labels
news = news.drop(columns=['index'])

In [None]:
news.head()

## Feature Extraction

In [29]:
# Importing Python's regular expression module
import re

In [30]:
def wordopt(text):
  """Normalize one article's raw text before vectorization.

  Steps, in order: lowercase, strip URLs, strip HTML tags, strip
  punctuation (including underscores), strip digits, and turn newline
  characters into spaces.

  Args:
    text: The raw text. Anything that is not a ``str`` (for example ``None``
      or a NaN float) is treated as missing.

  Returns:
    The cleaned, lower-cased string, or ``''`` for missing input.
  """
  # A non-string cell has nothing to clean; without this guard .lower() raises AttributeError
  if not isinstance(text, str):
    return ''

  # Converting text to lower case
  text = text.lower()

  # Removing URLs (must run before punctuation stripping, which would break the "://" match)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)

  # Removing HTML tags
  text = re.sub(r'<.*?>', '', text)

  # Removing punctuation; \w treats "_" as a word character, so it is listed explicitly
  text = re.sub(r'[^\w\s]|_', '', text)

  # Removing digits
  text = re.sub(r'\d+', '', text)

  # Replacing newline characters with spaces
  text = text.replace('\n', ' ')

  return text

In [31]:
# Run the text cleaner over every article body, element by element
news['text'] = news['text'].map(wordopt)


In [None]:
news['text']

In [33]:
# Separating the input feature (the cleaned 'text' column) from the target ('label' column);
# no vectorization happens here yet
x = news['text']
y = news['label']

In [34]:
# Grouping into training and test data
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


In [36]:
x_train.shape

(31428,)

In [37]:
x_test.shape

(13470,)

In [38]:
# converting textual data to numerical
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# TF-IDF vectorizer created with its default settings
vectorization = TfidfVectorizer()

In [40]:
# Fit the vectorizer on the training text only, then transform that text
xv_train = vectorization.fit_transform(x_train)

In [41]:
# Transform only (no refit): the test text is encoded with what was fitted on the training text
xv_test = vectorization.transform(x_test)

In [42]:
xv_train

<31428x174194 sparse matrix of type '<class 'numpy.float64'>'
	with 6452567 stored elements in Compressed Sparse Row format>

In [43]:
xv_test

<13470x174194 sparse matrix of type '<class 'numpy.float64'>'
	with 2727217 stored elements in Compressed Sparse Row format>

In [44]:
# Creating ML Model

# Logistic Regression
from sklearn.linear_model import LogisticRegression

In [45]:
# Logistic regression classifier with default hyperparameters
LR = LogisticRegression()

In [46]:
# Training the logistic regression model on the vectorized training text and its labels
LR.fit(xv_train, y_train)

In [47]:
# Predicted labels for the held-out test matrix
pred_lr = LR.predict(xv_test)

In [48]:
# Score of the logistic regression model on the held-out test data
LR.score(xv_test, y_test)

0.9893838158871566

In [49]:
# Per-class precision / recall / F1 for the logistic regression predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7048
           1       0.99      0.99      0.99      6422

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [50]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

In [51]:
# Fixed seed so that the fitted tree, and therefore its score, is the same on every run
DTC = DecisionTreeClassifier(random_state=42)

In [52]:
# Train the decision tree on the vectorized training text and its labels
DTC.fit(xv_train, y_train)

In [54]:
# Predicted labels for the held-out test matrix
pred_dtc = DTC.predict(xv_test)


In [55]:
# Score of the decision tree on the held-out test data
DTC.score(xv_test, y_test)

0.995916852264291

In [57]:
# Per-class precision / recall / F1 for the decision tree predictions
print(classification_report(y_test, pred_dtc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7048
           1       1.00      1.00      1.00      6422

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [59]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

In [60]:
# Fixed seed so that the forest, and therefore its score, is reproducible across runs
rfc = RandomForestClassifier(random_state=42)

In [61]:
# Train the random forest on the vectorized training text and its labels
rfc.fit(xv_train, y_train)

In [62]:
# Predicted labels for the held-out test matrix
predict_rfc = rfc.predict(xv_test)

In [63]:
# Score of the random forest on the held-out test data
rfc.score(xv_test, y_test)

0.9896807720861173

In [64]:
# Per-class precision / recall / F1 for the random forest predictions
print(classification_report(y_test, predict_rfc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7048
           1       0.99      0.99      0.99      6422

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [65]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [66]:
# Fixed seed so that the boosted ensemble, and therefore its score, is reproducible across runs
gbc = GradientBoostingClassifier(random_state=42)

In [67]:
# Train the gradient boosting model on the vectorized training text and its labels
gbc.fit(xv_train, y_train)

In [68]:
# Predicted labels for the held-out test matrix
predict_gbc = gbc.predict(xv_test)

In [69]:
# Score of the gradient boosting model on the held-out test data
gbc.score(xv_test, y_test)

0.9958426132145508

In [70]:
# Per-class precision / recall / F1 for the gradient boosting predictions
print(classification_report(y_test, predict_gbc))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7048
           1       0.99      1.00      1.00      6422

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [71]:
# Creating a predicting model
def output_label(n):
  """Translate a numeric class label into a human-readable verdict.

  Returns "Fake news." when n equals 0 and "Genuine News" when n equals 1.
  """
  if n == 0:
    return "Fake news."
  if n == 1:
    return "Genuine News"