In [48]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split as ts
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix as cm, classification_report as cr
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd
from datetime import datetime
import time

Data Gathering

In [2]:
data = pd.read_csv("WELFake_Dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [4]:
data.describe()

Unnamed: 0.1,Unnamed: 0,label
count,72134.0,72134.0
mean,36066.5,0.514404
std,20823.436496,0.499796
min,0.0,0.0
25%,18033.25,0.0
50%,36066.5,1.0
75%,54099.75,1.0
max,72133.0,1.0


In [5]:
data.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [6]:
data['label'].value_counts()

label
1    37106
0    35028
Name: count, dtype: int64

In [7]:
data.shape

(72134, 4)

In [8]:
data = data.dropna()

In [9]:
data.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [10]:
data.shape

(71537, 4)

In [11]:
data.reset_index(inplace=True)
data.head()

Unnamed: 0.1,index,Unnamed: 0,title,text,label
0,0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [12]:
data['title'][0]

'LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]'

In [13]:
data = data.drop(['Unnamed: 0', 'text'], axis=1)
data.head()

Unnamed: 0,index,title,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
2,3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
3,4,SATAN 2: Russia unvelis an image of its terrif...,1
4,5,About Time! Christian Group Sues Amazon and SP...,1


# Data Preprocessing

1. Tokenization

In [14]:
sample_data = 'The quick brown fox jumps over the lazy dog.'
sample_data = sample_data.split()
sample_data

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']

2. Make Lowercase

In [15]:
sample_data = [data.lower() for data in sample_data]
sample_data

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']

3. Remove Stopwords

In [16]:
stop_words = stopwords.words('english')
print(stop_words)
print(len(stop_words))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [17]:
sample_data = [data for data in sample_data if data not in stop_words]
print(sample_data)
len(sample_data)

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog.']


6

4. Stemming

In [18]:
ps = PorterStemmer()
sample_data_stemming = [ps.stem(data) for data in sample_data]
sample_data_stemming

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog.']

5. Lemmatization

In [19]:
lm = WordNetLemmatizer()
sample_data_lemma = [lm.lemmatize(data) for data in sample_data]
sample_data_lemma

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog.']

Implementation of Data-Preprocessing

In [20]:
corpus = []
for i in range(len(data)):
    review = re.sub('^a-zA-Z0-9',' ', data['title'][i])
    review = review.lower() #cpnverts to lowercase
    review = review.split() #toeknize
    review = [lm.lemmatize(x) for x in review if x not in (stopwords.words('english'))] #lemmatize
    review = " ".join(review) #join words back to single string
    corpus.append(review)

In [21]:
len(corpus)

71537

In [22]:
data['title'][0]

'LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]'

In [23]:
corpus[0]

'law enforcement high alert following threat cop white 9-11by #blacklivesmatter #fyf911 terrorist [video]'

# Vectorization (Convert Text data into the Vector)

In [24]:
tf = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.8, ngram_range=(1,2))
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(71537, 5000))

In [25]:
y = data['label']
y.head()

0    1
1    1
2    0
3    1
4    1
Name: label, dtype: int64

# Data splitting into train and test

In [26]:
x_train, x_test, y_train, y_test = ts(x,y,test_size=0.3, random_state = 10, stratify = y)

In [27]:
len(x_train), len(y_train)

(50075, 50075)

In [28]:
len(x_test), len(y_test)

(21462, 21462)

# Model Building

In [29]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Model Evaluation

In [30]:
y_pred = rf.predict(x_test)
accuracy_score_ = accuracy_score(y_test,y_pred)
accuracy_score_

0.8954431087503495

In [31]:
class Evaluation:
    
    def __init__(self, model, x_train, x_test, y_train, y_test):
        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
    
    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)
        
        accuracy_score_train = accuracy_score(self.y_train, y_pred_train)
        print("Accuracy score on training data set: ", accuracy_score_train)
        print()
        
        confusion_matrix_train = cm(self.y_train, y_pred_train)
        print("Confusion matrix on training data set: \n", confusion_matrix_train)
        print()
        
        classification_report_train = cr(self.y_train, y_pred_train)
        print("Classification report on training data set: \n", classification_report_train)
        
    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)
        
        accuracy_score_test = accuracy_score(self.y_test, y_pred_test)
        print("Accuracy score on testing data set: ", accuracy_score_test)
        print()
        
        confusion_matrix_test = cm(self.y_test, y_pred_test)
        print("Confusion matrix on testing data set: \n", confusion_matrix_test)
        print()
        
        classification_report_test = cr(self.y_test, y_pred_test)
        print("Classification report on testing data set: \n", classification_report_test)
        
        


In [32]:
Evaluation(rf, x_train, x_test, y_train, y_test).train_evaluation()

Accuracy score on training data set:  0.9997403894158762

Confusion matrix on training data set: 
 [[24510     9]
 [    4 25552]]

Classification report on training data set: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     24519
           1       1.00      1.00      1.00     25556

    accuracy                           1.00     50075
   macro avg       1.00      1.00      1.00     50075
weighted avg       1.00      1.00      1.00     50075



In [33]:
Evaluation(rf, x_train, x_test, y_train, y_test).test_evaluation()

Accuracy score on testing data set:  0.8954431087503495

Confusion matrix on testing data set: 
 [[9279 1230]
 [1014 9939]]

Classification report on testing data set: 
               precision    recall  f1-score   support

           0       0.90      0.88      0.89     10509
           1       0.89      0.91      0.90     10953

    accuracy                           0.90     21462
   macro avg       0.90      0.90      0.90     21462
weighted avg       0.90      0.90      0.90     21462



Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train, y_train)

# Predict on the test set
y_pred_lr = lr.predict(x_test)

# Evaluate accuracy
accuracy_score_lr = accuracy_score(y_test, y_pred_lr)
accuracy_score_lr

0.8948839809896562

XGB Classifier

In [35]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=10)
xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)
print("XGBoost accuracy:", accuracy_score(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost accuracy: 0.8610567514677103


Classification Report

In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     10509
           1       0.89      0.91      0.90     10953

    accuracy                           0.89     21462
   macro avg       0.90      0.89      0.89     21462
weighted avg       0.90      0.89      0.89     21462



# Prediction Pipeline

In [37]:
class Preprocessing:
    
    def __init__(self,data2):
        self.data2 = data2
    
    def text_preprocessing_user(self):
        lm = WordNetLemmatizer()
        pred_data = [self.data2]
        preproccess_data = []

        
        for data2 in pred_data:
            review = re.sub('^a-zA-Z0-9',' ', data2)
            review = review.lower() #converts to lowercase
            review = review.split() #toeknize
            review = [lm.lemmatize(x) for x in review if x not in stopwords.words('english')] #lemmatize
            review = " ".join(review) #join words back to single string
            preproccess_data.append(review)
        return preproccess_data

In [38]:
data['title'][1]

'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'

In [39]:
data2 = 'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'

Preprocessing(data2).text_preprocessing_user()   

['unbelievable! obama’s attorney general say charlotte rioter “peaceful” protesters…in home state north carolina [video]']

In [40]:
class Prediction:
    
    def __init__(self, pred_data, model):
        self.pred_data = pred_data
        self.model = model

    def prediction_model(self):
        preprocess_data = Preprocessing(self.pred_data).text_preprocessing_user()
        data = tf.transform(preprocess_data)
        prediction = self.model.predict(data)
        
        if prediction [0] == 0:
            return "The News is Fake"
        else:
            return "The News is Real"

In [41]:
data2 = 'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'
Prediction(data2, rf).prediction_model()

'The News is Real'

# Fetch API

In [49]:
class NewsAPIFetcher:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://newsapi.org/v2"
        self.headers = {
            'X-API-Key': api_key,
            'User-Agent': 'FakeNewsDetector/1.0'
        }
    
    def fetch_top_headlines(self, country='us', category=None, sources=None, page_size=20):
        url = f"{self.base_url}/top-headlines"
        params = {
            'pageSize': page_size,
            'apiKey': self.api_key
        }
        
        if country:
            params['country'] = country
        if category:
            params['category'] = category
        if sources:
            params['sources'] = sources
            params.pop('country', None)
        
        try:
            response = requests.get(url, params=params, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            return data.get('articles', [])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching news: {e}")
            return []
    
    def fetch_everything(self, query, sources=None, domains=None, from_date=None, to_date=None, page_size=20):
        url = f"{self.base_url}/everything"
        params = {
            'q': query,
            'pageSize': page_size,
            'sortBy': 'publishedAt',
            'apiKey': self.api_key
        }
        
        if sources:
            params['sources'] = sources
        if domains:
            params['domains'] = domains
        if from_date:
            params['from'] = from_date
        if to_date:
            params['to'] = to_date
        
        try:
            response = requests.get(url, params=params, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            return data.get('articles', [])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching news: {e}")
            return []
    
    def process_articles_for_prediction(self, articles):
        processed_articles = []
        
        for article in articles:
            processed_article = {
                'title': article.get('title', ''),
                'content': article.get('content', ''),
                'description': article.get('description', ''),
                'author': article.get('author', ''),
                'source': article.get('source', {}).get('name', ''),
                'url': article.get('url', ''),
                'publishedAt': article.get('publishedAt', ''),
                'urlToImage': article.get('urlToImage', '')
            }
            
            text_content = f"{processed_article['title']} {processed_article['description']} {processed_article['content']}"
            processed_article['full_text'] = text_content.strip()
            processed_articles.append(processed_article)
        
        return pd.DataFrame(processed_articles)
    
    def get_news_for_prediction(self, method='headlines', **kwargs):
        if method == 'headlines':
            articles = self.fetch_top_headlines(**kwargs)
        elif method == 'search':
            articles = self.fetch_everything(**kwargs)
        else:
            raise ValueError("Method must be 'headlines' or 'search'")
        
        if not articles:
            print("No articles fetched")
            return pd.DataFrame()
        
        df = self.process_articles_for_prediction(articles)
        print(f"Fetched {len(df)} articles for prediction")
        return df

class Prediction:
    def __init__(self, pred_data, model, vectorizer):
        self.pred_data = pred_data
        self.model = model
        self.vectorizer = vectorizer

    def prediction_model(self):
        try:
            preprocess_data = Preprocessing(self.pred_data).text_preprocessing_user()
            data = self.vectorizer.transform(preprocess_data)
            prediction = self.model.predict(data)
            if prediction[0] == 0:
                return "The News is Fake"
            else:
                return "The News is Real"
        except Exception as e:
            return f"Error in prediction: {str(e)}"

class NewsPredictor:
    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer
    
    def predict_single_news(self, news_text):
        predictor = Prediction(news_text, self.model, self.vectorizer)
        return predictor.prediction_model()
    
    def predict_multiple_news(self, news_df):
        if news_df.empty:
            print("No news data to predict")
            return pd.DataFrame()
        
        results = []
        
        print("Making predictions for each article...")
        for idx, row in news_df.iterrows():
            try:
                news_text = row['full_text']
                prediction_result = self.predict_single_news(news_text)
                result = {
                    'article_id': idx + 1,
                    'title': row['title'],
                    'source': row['source'],
                    'url': row['url'],
                    'publishedAt': row['publishedAt'],
                    'author': row['author'],
                    'full_text': news_text,
                    'prediction_result': prediction_result
                }
                results.append(result)
                print(f"Article {idx + 1}: {prediction_result}")
            except Exception as e:
                print(f"Error predicting article {idx + 1}: {e}")
                result = {
                    'article_id': idx + 1,
                    'title': row['title'],
                    'source': row['source'],
                    'url': row['url'],
                    'publishedAt': row['publishedAt'],
                    'author': row['author'],
                    'full_text': row['full_text'],
                    'prediction_result': f"Error: {str(e)}"
                }
                results.append(result)
        
        return pd.DataFrame(results)

class Preprocessing:
    def __init__(self, data):
        self.data = data
    
    def text_preprocessing_user(self):
        if isinstance(self.data, str):
            return [self.data]
        else:
            return [str(self.data)]

def run_fake_news_detection(api_key, rf_model, tf_vectorizer):
    print("="*80)
    print("FAKE NEWS DETECTION PIPELINE")
    print("="*80)
    
    print("Initializing news fetcher...")
    fetcher = NewsAPIFetcher(api_key)
    
    print("Fetching latest news...")
    news_df = fetcher.get_news_for_prediction(
        method='headlines',
        country='us',
        page_size=10
    )
    
    if news_df.empty:
        print("❌ No articles fetched. Check API key and connection.")
        return pd.DataFrame()
    
    print(f"✓ Successfully fetched {len(news_df)} articles")
    
    print("Initializing predictor...")
    predictor = NewsPredictor(rf_model, tf_vectorizer)
    
    print("Making predictions...")
    results_df = predictor.predict_multiple_news(news_df)
    
    print("\n" + "="*80)
    print("PREDICTION RESULTS")
    print("="*80)
    
    if not results_df.empty:
        for idx, row in results_df.iterrows():
            print(f"\nArticle {row['article_id']}:")
            print(f"Title: {row['title'][:80]}...")
            print(f"Source: {row['source']}")
            print(f"Prediction: {row['prediction_result']}")
            print("-" * 60)
        
        fake_count = len(results_df[results_df['prediction_result'] == 'The News is Fake'])
        real_count = len(results_df[results_df['prediction_result'] == 'The News is Real'])
        error_count = len(results_df[results_df['prediction_result'].str.contains('Error', na=False)])
        
        print(f"\nSUMMARY:")
        print(f"Total articles: {len(results_df)}")
        print(f"Predicted as Real: {real_count}")
        print(f"Predicted as Fake: {fake_count}")
        print(f"Errors: {error_count}")
        
        results_df.to_csv('fake_news_predictions.csv', index=False)
        print(f"\n✓ Results saved to 'fake_news_predictions.csv'")
    
    return results_df

def predict_custom_news(news_text, rf_model, tf_vectorizer):
    predictor = NewsPredictor(rf_model, tf_vectorizer)
    result = predictor.predict_single_news(news_text)
    print(f"News: {news_text[:100]}...")
    print(f"Prediction: {result}")
    return result

if __name__ == "__main__":
    load_dotenv()
    api_key = os.getenv('NEWSAPI_KEY')
    if not api_key:
        raise ValueError("NEWSAPI_KEY not found in .env file")




In [None]:
api_key = "Your API key"
results_df = run_fake_news_detection(api_key, rf, tf)

FAKE NEWS DETECTION PIPELINE
Initializing news fetcher...
Fetching latest news...
Fetched 10 articles for prediction
✓ Successfully fetched 10 articles
Initializing predictor...
Making predictions...
Making predictions for each article...
Article 1: The News is Real
Article 2: The News is Real
Article 3: The News is Fake
Article 4: The News is Real
Article 5: The News is Real
Article 6: The News is Real
Article 7: The News is Real
Article 8: The News is Real
Article 9: The News is Real
Article 10: The News is Real

PREDICTION RESULTS

Article 1:
Title: Sonic Racing: CrossWorlds Officially Reveals Nickelodeon Collaboration - Nintend...
Source: Nintendo Life
Prediction: The News is Real
------------------------------------------------------------

Article 2:
Title: Israeli strikes on Iran cap dramatic shift in Mideast strategic balance - The Wa...
Source: The Washington Post
Prediction: The News is Real
------------------------------------------------------------

Article 3:
Title: Trump

In [44]:
custom_news = "Breaking: Scientists discover new method for renewable energy"
result = predict_custom_news(custom_news, rf, tf)


News: Breaking: Scientists discover new method for renewable energy...
Prediction: The News is Real
