<a href="https://colab.research.google.com/github/ganesh0706patil/Fake-News-Detection-Using-NLP-based-Transformers/blob/main/ISR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=101e861af762cee6761f6158c1b07e2de808d6bceac49b76bf74fee4b9179eb7
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
!pip install wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.7.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.7.1-py3-none-any.whl size=14346 sha256=a3d890665ac31449df35c0704c2684c0149514430cd74e922d980b8078c7e12e
  Stored in directory: /root/.cache/pip/wheels/4c/96/18/b9201cc3e8b47b02b510460210cfd832ccf10c0c4dd0522962
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.7.1


In [3]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


In [None]:
import os
import json
import requests
import hashlib
from bs4 import BeautifulSoup
import wikipedia
import nltk
from sentence_transformers import SentenceTransformer, util

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the BERT model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # A lightweight BERT variant

# Step 1: Setup Environment
def create_topic_folder(topic):

    main_folder = 'scraped_news'
    topic_path = os.path.join(main_folder, topic)
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)
    if not os.path.exists(topic_path):
        os.makedirs(topic_path)

# Step 2: Fetch News Articles Using News API
def fetch_news_articles_newsapi(topic, search_query, api_key, page_size=15):

    base_url = 'https://newsapi.org/v2/everything'
    params = {
        'q': search_query,
        'apiKey': api_key,
        'pageSize': page_size,
        'language': 'en',
        'sortBy': 'relevancy'
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()

        data = response.json()
        articles = data.get("articles", [])

        if not articles:
            print("No articles found for the given query.")
            return

        for article in articles:
            title = article.get('title', 'No Title')
            description = article.get('description', 'No Description')
            content = article.get('content', 'No Content')
            url = article.get('url', 'No URL')

            # Fetch and use full content from the article URL
            full_content = fetch_full_content(url)

            if full_content == "Content not available":
                print(f"Skipping article due to content retrieval failure: {url}")
                continue

            article_text = f"Title: {title}\nDescription: {description}\nContent: {full_content}\nURL: {url}\n"

            save_article(topic, article_text, url)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching articles from News API: {e}")

def fetch_news_articles_newsdata(topic, search_query, api_key, page_size=15):

    base_url = 'https://newsdata.io/api/1/news'
    params = {
        'q': search_query,
        'apikey': api_key,
        'language': 'en',
        'page': 1,
        'pageSize': page_size,
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()

        data = response.json()
        articles = data.get("results", [])

        if not articles:
            print("No articles found for the given query.")
            return

        for article in articles:
            title = article.get('title', 'No Title')
            description = article.get('description', 'No Description')
            content = article.get('content', 'No Content')
            url = article.get('link', 'No URL')

            # Fetch and use full content from the article URL
            full_content = fetch_full_content(url)

            if full_content == "Content not available":
                print(f"Skipping article due to content retrieval failure: {url}")
                continue

            article_text = f"Title: {title}\nDescription: {description}\nContent: {full_content}\nURL: {url}\n"

            save_article(topic, article_text, url)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching articles from NewsData.io API: {e}")

def fetch_full_content(url):

    try:
        response = requests.get(url)
        if response.status_code == 403:
            print(f"Access forbidden (403): {url}")
            return "Content not available"
        elif response.status_code != 200:
            print(f"Failed to retrieve article content: {response.status_code}")
            return "Content not available"

        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        full_text = '\n'.join([p.get_text(strip=True) for p in paragraphs])

        return full_text if full_text else "Content not available"
    except Exception as e:
        print(f"Error fetching article content: {e}")
        return "Content not available"

def save_article(topic, content, url):

    file_name = hashlib.md5(url.encode()).hexdigest() + '.txt'
    path = os.path.join('scraped_news', topic)
    with open(os.path.join(path, file_name), 'w', encoding='utf-8') as f:
        f.write(content)

# Step 3: Retrieve Full Wikipedia Page Content
def fetch_wikipedia_page(query):

    try:
        page = wikipedia.page(query)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Disambiguation Error: {e.options}"
    except wikipedia.exceptions.PageError:
        return "No Wikipedia page available for the given query."
    except Exception as e:
        return f"Error retrieving Wikipedia page: {e}"

def save_wikipedia_page(content, topic):

    file_name = 'wiki.txt'
    path = os.path.join('scraped_news', topic)
    with open(os.path.join(path, file_name), 'w', encoding='utf-8') as f:
        f.write(content)

# Step 4: Preprocess Text
def preprocess_text(text):
    return text.lower()

# Step 5: Calculate Similarity
def calculate_similarity(document, query):
    # Encode the document and query
    document_embedding = model.encode(document, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity
    similarity = util.pytorch_cos_sim(document_embedding, query_embedding)
    return similarity.item()

# Step 6: Main Function to Coordinate Fetching, Saving, Ranking, and Retrieving
def main():
    search_query = "October 2024 Iranian strikes against Israel"
    # "Modi appeals for safety of Hindus in crisis-hit Bangladesh"
    # "October 2024 Iranian strikes against Israel"
    # "Maharashtra Assembly Elections on 20 November"
    # "Chandrayaan-3 Landed on the Moon"
    # "By next year we will able live on sun’"
    topic = "space"
    create_topic_folder(topic)

    # Replace with your News API keys
    news_api_key = 'News API Key'
    newsdata_io_api_key = 'NewsDataio API Key'

    # Fetch news using News API
    fetch_news_articles_newsapi(topic, search_query, news_api_key)

    # Fetch news using NewsData.io API
    fetch_news_articles_newsdata(topic, search_query, newsdata_io_api_key)

    # Retrieve Wikipedia page content and save it
    wiki_content = fetch_wikipedia_page(search_query)
    save_wikipedia_page(wiki_content, topic)
    print(f"Wikipedia page content for '{search_query}' has been saved to wiki.txt")

    # Calculate similarity for each article in the folder
    articles_path = os.path.join('scraped_news', topic)
    similarity_scores = []

    for file_name in os.listdir(articles_path):
        if file_name.endswith('.txt') and file_name != 'wiki.txt':
            file_path = os.path.join(articles_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                article_content = f.read()

            # Preprocess text
            preprocessed_article = preprocess_text(article_content)
            preprocessed_wiki = preprocess_text(wiki_content)

            # Calculate similarity
            similarity_score = calculate_similarity(preprocessed_article, preprocessed_wiki)
            similarity_scores.append((file_name, similarity_score))

    # Rank articles by similarity score (highest to lowest)
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    threshold = 0.4  # Define a similarity threshold for deciding truthfulness
    a = 0
    b = 0

    print("\nRanked Articles by Similarity Scores:")
    for file_name, score in similarity_scores:
        if score > threshold:
            print(f"Article {file_name} is likely true (Similarity Score: {score:.6f}).")
            a += 1
        else:
            print(f"Article {file_name} is likely fake (Similarity Score: {score:.6f}).")
            b += 1
    if a > b:
      print("The news is true")
    elif a < b:
      print("The news is fake")
    else:
      print("Cannot be determined")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Error fetching articles from NewsData.io API: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?q=October+2024+Iranian+strikes+against+Israel&apikey=pub_5234266b89bfb262b75fa558ee28b80433d18&language=en&page=1&pageSize=15
Wikipedia page content for 'October 2024 Iranian strikes against Israel' has been saved to wiki.txt

Ranked Articles by Similarity Scores:
Article df1f6fd2eb25fac0e8618d969c006b31.txt is likely true (Similarity Score: 0.773613).
Article 5ec34d409120e3b8437baedf3ae37d5e.txt is likely true (Similarity Score: 0.763380).
Article 2fdeb6d252a6120df97c5e669f6cb147.txt is likely true (Similarity Score: 0.761404).
Article 747a6232e30087c6a14c1a571442f1c7.txt is likely true (Similarity Score: 0.723257).
Article ea5d31c62467d3ba6a1cdbc5dec88f9c.txt is likely true (Similarity Score: 0.701456).
Article 63a3c4988db34a23e2776db23fed5a6d.txt is likely true (Similarity Score: 0.693264).
Article 582cbdfbce7616f031239bf99121d399.txt is likely true (Similarit