<a href="https://colab.research.google.com/github/charlie-paterson/CognoRise-Infotech/blob/main/Fake%20News/Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
import re
# Fetch the NLTK stop-word corpus that stopwords.words() reads below
nltk.download('stopwords')
# English Snowball stemmer, used by the text-cleaning function
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
# English stop words as a set, used for membership tests during cleaning
stopword=set(stopwords.words('english'))
from nltk.sentiment import vader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Loading the Dataset

In [None]:
# Read the labelled articles from news.csv in the working directory
df = pd.read_csv('news.csv')
# Display the frame as the cell output
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


## Cleaning

In [None]:
# Keep only the article body and its label.
# errors='ignore' makes the cell safe to re-run: because `df` is reassigned,
# a second execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=['Unnamed: 0', 'title'], errors='ignore')

In [None]:
# Number of missing values in each column
df.isna().sum()

text     0
label    0
dtype: int64

In [None]:
# Summary statistics of the frame
df.describe()

Unnamed: 0,label
count,6335.0
mean,0.500552
std,0.500039
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6335 non-null   object
 1   label   6335 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.1+ KB


In [None]:
label_counts = df['label'].value_counts()


def _label_count(*keys):
    """Sum the counts stored under whichever of ``keys`` are present in ``label_counts``."""
    return int(sum(label_counts.loc[key] for key in keys if key in label_counts.index))


# Run top to bottom, this cell executes before the "Encoding Data" step, so the
# labels are still the strings 'FAKE'/'REAL'; if it is re-run after encoding
# they are 0/1 (FAKE -> 0, REAL -> 1). Looking up both spellings keeps the
# counts correct in either state instead of relying on execution order.
fake_count = _label_count('FAKE', 0)
print(f"Number of fake news articles: {fake_count}")
real_count = _label_count('REAL', 1)
print(f"Number of real news articles: {real_count}")
total_count = fake_count + real_count
print(f"Total number of news articles: {total_count}")

Number of fake news articles: 3164
Number of real news articles: 3171
Total number of news articles: 6335


In [None]:
def clean(text):
    """Normalise raw article text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; remove bracketed spans, URLs and HTML-like
    tags; remove ASCII punctuation; turn line breaks into spaces; remove any
    token containing a digit; drop English stop words; stem what is left.

    Relies on the module-level ``stopword`` set and ``stemmer`` object.

    Parameters
    ----------
    text : any
        Value to clean; it is coerced with ``str()`` first.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        if nothing survives).
    """
    text = str(text).lower()
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escapes in a normal
    # string literal and trigger warnings on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space rather than deleting them; deleting
    # glued the last word of one line to the first word of the next and
    # produced fused tokens such as "terrorismkerri".
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument discards the empty tokens left behind by the
    # removals above, so the result never contains doubled spaces.
    tokens = [word for word in text.split() if word not in stopword]
    return " ".join(stemmer.stem(word) for word in tokens)

In [None]:
# Run every article body through the cleaning function
df['text'] = df['text'].map(clean)

## Encoding Data

In [None]:
# Encoder used in the next cell to turn the label column into integer codes
le = preprocessing.LabelEncoder()

In [None]:
# Learn the label classes and replace the column with their integer codes in one step
df['label'] = le.fit_transform(df['label'])

In [None]:
# Inspect the frame after text cleaning and label encoding
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,daniel greenfield shillman journal fellow free...,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,googl pinterest digg linkedin reddit stumbleup...,0
2,3608,Kerry to go to Paris in gesture of sympathy,us secretari state john f kerri said monday st...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,— kayde king kaydeek novemb lesson tonight d...,0
4,875,The Battle of New York: Why This Primary Matters,primari day new york frontrunn hillari clinton...,1
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,state depart told republican nation committe c...,1
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,p pbs stand plutocrat pentagon post oct wik...,0
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,antitrump protest tool oligarchi reform alwa...,0
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...",addi ababa ethiopia —presid obama conven meet ...,1


## Training

In [None]:
# Features are the cleaned article text; the target is the encoded label
X, y = df['text'], df['label']

In [None]:
# Hold out 25% of the articles for testing. The seed is fixed so that the
# split, and therefore every metric computed from it, is identical after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with default settings
vectorization = TfidfVectorizer()
# The vocabulary and IDF weights are learned from the training texts only...
xv_train = vectorization.fit_transform(X_train)
# ...and the test texts are transformed with that already-fitted vectoriser
xv_test = vectorization.transform(X_test)

### Logistic Regression Model

In [None]:
# Baseline fit with default hyperparameters. The evaluation cells use the
# grid-searched `best_log_reg`, not this estimator.
log_reg = LogisticRegression()
log_reg.fit(xv_train, y_train)

In [None]:
# Hyperparameter candidates: three regularisation strengths and both penalty
# types, all with the liblinear solver
param_grid = dict(
    C=[0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# 5-fold cross-validated search over the grid; keep the refitted best model
log_reg = LogisticRegression()
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5).fit(xv_train, y_train)
best_log_reg = grid_search.best_estimator_

In [None]:
# Class predictions of the tuned logistic regression on the test split
y_pred_lr = best_log_reg.predict(xv_test)

In [None]:
# Confusion matrix of the tuned logistic regression on the test split
cm = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix: ", " ", cm, " ", sep="\n")

# Per-class precision / recall / F1
cr_lr = classification_report(y_test, y_pred_lr)
print("Classification Report: ", cr_lr, " ", sep="\n")

# Overall accuracy, rounded to two decimals for display
acc_score_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {np.round(acc_score_lr, 2)}")

# ROC AUC is computed from the second predict_proba column (scores for label 1)
y_pred_proba = best_log_reg.predict_proba(xv_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {np.round(roc_auc, 2)}")

Confusion Matrix: 
 
[[732  56]
 [ 42 754]]
 
Classification Report: 
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       788
           1       0.93      0.95      0.94       796

    accuracy                           0.94      1584
   macro avg       0.94      0.94      0.94      1584
weighted avg       0.94      0.94      0.94      1584

 
Accuracy: 0.94
ROC AUC: 0.98


### Random Forest Model

In [None]:
# Random forest with default hyperparameters. The seed is fixed so the fitted
# forest, and the metrics derived from it, are the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(xv_train, y_train)

In [None]:
# Class predictions of the random forest on the test split
y_pred_rf = rf.predict(xv_test)

In [None]:
# Confusion matrix of the random forest on the test split
cm = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix: ", " ", cm, " ", sep="\n")

# Per-class precision / recall / F1
cr_rf = classification_report(y_test, y_pred_rf)
print("Classification Report: ", cr_rf, " ", sep="\n")

# Overall accuracy, rounded to two decimals for display
acc_score_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {np.round(acc_score_rf, 2)}")

# ROC AUC is computed from the second predict_proba column (scores for label 1)
y_pred_proba = rf.predict_proba(xv_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {np.round(roc_auc, 2)}")

Confusion Matrix: 
 
[[732  56]
 [ 86 710]]
 
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       788
           1       0.93      0.89      0.91       796

    accuracy                           0.91      1584
   macro avg       0.91      0.91      0.91      1584
weighted avg       0.91      0.91      0.91      1584

 
Accuracy: 0.91
ROC AUC: 0.97


### Gradient Boosting Model

In [None]:
# Gradient boosting with default hyperparameters. The seed is fixed so the
# fitted model, and the metrics derived from it, are reproducible.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(xv_train, y_train)

In [None]:
# Class predictions of the gradient boosting model on the test split
y_pred = gb.predict(xv_test)

In [None]:
# Confusion matrix of the gradient boosting model on the test split
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", " ", cm, " ", sep="\n")

# Per-class precision / recall / F1
cr_gb = classification_report(y_test, y_pred)
print("Classification Report: ", cr_gb, " ", sep="\n")

# Overall accuracy, rounded to two decimals for display
acc_score_gb = accuracy_score(y_test, y_pred)
print(f"Accuracy: {np.round(acc_score_gb, 2)}")

# ROC AUC is computed from the second predict_proba column (scores for label 1)
y_pred_proba = gb.predict_proba(xv_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {np.round(roc_auc, 2)}")

Confusion Matrix: 
 
[[722  66]
 [ 88 708]]
 
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       788
           1       0.91      0.89      0.90       796

    accuracy                           0.90      1584
   macro avg       0.90      0.90      0.90      1584
weighted avg       0.90      0.90      0.90      1584

 
Accuracy: 0.9
ROC AUC: 0.97


## Summary

Of the three models evaluated on the held-out 25% test split, the tuned Logistic Regression model performed best, reaching an accuracy of 94% in determining whether news articles were real or fake, compared with 91% for Random Forest and 90% for Gradient Boosting. Additionally, it attained an Area Under the ROC Curve (AUC) score of 0.98. An AUC this close to 1 indicates that the model ranks real articles above fake ones almost perfectly, significantly outperforming random guessing (AUC of 0.5). These metrics demonstrate the model's effectiveness in distinguishing between real and fake news on this dataset, showcasing its potential as a reliable tool for this critical task.

## News Prediction

In [None]:
# Show the cleaned text of the third article (position 2)
df['text'].iloc[2]

"us secretari state john f kerri said monday stop pari later week amid critic top american offici attend sunday uniti march terrorismkerri said expect arriv pari thursday even head home week abroad said fli franc conclus seri meet schedul thursday sofia bulgaria plan meet next day foreign minist laurent fabius presid francoi holland return washingtonth visit kerri famili childhood tie countri speak fluent french could address critic unit state snub franc darkest hour mani yearsth french press monday fill question neither presid obama kerri attend sunday march  leader nation obama said stay away secur need tax countri kerri prior commitmentsamong rough  leader attend isra prime minist benjamin netanyahu stranger intens secur march besid holland citi street highest rank us offici attend march jane hartley ambassador franc victoria nuland assist secretari state european affair attorney general eric h holder jr pari meet law enforc offici particip marchkerri spent sunday busi summit host i

In [None]:
def output_label(n):
  """Translate an encoded label into display text.

  Returns "Real news" when ``n`` equals 1, "Fake news" when it equals 0,
  and ``None`` for any other value.
  """
  return "Real news" if n == 1 else ("Fake news" if n == 0 else None)

In [None]:
def testing(news):
  """Print the verdict of each fitted model for one piece of news text.

  The text is passed through ``clean``, vectorised with the already-fitted
  ``vectorization`` object, and classified by the tuned logistic regression,
  the gradient boosting model and the random forest. Nothing is returned;
  the three labels are printed.
  """
  features = vectorization.transform([clean(news)])
  pred_log = best_log_reg.predict(features)
  pred_gb = gb.predict(features)
  pred_rf = rf.predict(features)
  print(f"LR Prediction: {output_label(pred_log[0])}\nGB Prediction: {output_label(pred_gb[0])}\nRF Prediction: {output_label(pred_rf[0])} ")
# Read one article from the user (input() already returns a str) and classify it
news = input()
testing(news)

us secretari state john f kerri said monday stop pari later week amid critic top american offici attend sunday uniti march terrorismkerri said expect arriv pari thursday even head home week abroad said fli franc conclus seri meet schedul thursday sofia bulgaria plan meet next day foreign minist laurent fabius presid francoi holland return washingtonth visit kerri famili childhood tie countri speak fluent french could address critic unit state snub franc darkest hour mani yearsth french press monday fill question neither presid obama kerri attend sunday march  leader nation obama said stay away secur need tax countri kerri prior commitmentsamong rough  leader attend isra prime minist benjamin netanyahu stranger intens secur march besid holland citi street highest rank us offici attend march jane hartley ambassador franc victoria nuland assist secretari state european affair attorney general eric h holder jr pari meet law enforc offici particip marchkerri spent sunday busi summit host in