In [1]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources (run this once)
#nltk.download('punkt')
#nltk.download('stopwords')

PURPLE = "\033[95m"
RESET = "\033[0m"

def display_colored_link(link, is_reputed):
    color = PURPLE if is_reputed else ""
    reset_color = RESET if is_reputed else ""
    print(f"{color}{link}{reset_color}")

# List of trusted news sources
trusted_news_sources = [
    "bbc.co.uk",
    "cnn.com",
    "nytimes.com",
    "washingtonpost.com",
    "guardian.com",
    "reuters.com",
    "aljazeera.com",
    "apnews.com",
    "wsj.com",
    "bbc.com/news/world",
    "timesofindia.indiatimes.com/",
    "m.timesofindia.com",
    "uk.news.yahoo.com",
    "https://pressgazette.co.uk",
    "https://abcnews.go.com/",
    "independent.co.uk",
    
    
    # Add more trusted news sources here
]

def is_reputed_news_source(url):
    return any(source in url for source in trusted_news_sources)

def clean_text(text):
    # Remove special characters and numbers
    cleaned_text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Lowercase the text
    cleaned_text = cleaned_text.lower()

    return cleaned_text

def is_url_valid(url):
    try:
        response = requests.head(url)
        return response.status_code in range(200, 300)  # Check if the status code is in the success range
    except requests.exceptions.RequestException:
        return False

def get_related_news(search_keyword):
    # Perform Google search
    query = f"{search_keyword} news"
    search_results = list(search(query, num_results=5, lang="en"))

    related_news = []

    # Retrieve and scrape news articles
    for url in search_results:
        if is_url_valid(url):
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract the Google snippet for the news article
                snippet = soup.find('meta', attrs={'name': 'description'})
                if snippet:
                    snippet = snippet['content']

                    # Extract the news article content
                    article = soup.find('article')  # Adjust based on the website's HTML structure
                    if article:
                        news_title_element = article.find('h1')
                        news_content_elements = article.find_all('p')

                        if news_title_element and news_content_elements:
                            news_title = news_title_element.text.strip()
                            news_content = '\n'.join([p.text.strip() for p in news_content_elements])

                            # Clean the title and content
                            cleaned_title = clean_text(news_title)
                            cleaned_content = clean_text(news_content)

                            # Tokenize the content and remove stopwords
                            stop_words = set(stopwords.words('english'))
                            title_tokens = word_tokenize(cleaned_title)
                            content_tokens = word_tokenize(cleaned_content)

                            title_tokens = [token for token in title_tokens if token not in stop_words]
                            content_tokens = [token for token in content_tokens if token not in stop_words]

                            # Check if the URL is from a reputed news source
                            is_reputed = is_reputed_news_source(url)

                            related_news.append({'source': url, 'title': title_tokens, 'content': content_tokens, 'snippet': snippet, 'is_reputed': is_reputed})

            except requests.exceptions.RequestException as e:
                print(f"Error retrieving article from {url}: {str(e)}")
        else:
            print(f"Skipping invalid URL: {url}")

    return related_news

# Example usage
def calculate_relevance(claim, article_content):
    claim_tokens = set(claim)
    content_tokens = set(article_content)
    common_tokens = claim_tokens.intersection(content_tokens)
    relevance = len(common_tokens) / len(claim_tokens)
    if not claim_tokens:
        return 0.0
    return relevance

# Function to calculate accuracy of summarized content
def calculate_accuracy(original_text, summarized_text):
    original_tokens = set(original_text)
    summarized_tokens = set(summarized_text)
    common_tokens = original_tokens.intersection(summarized_tokens)
    accuracy = len(common_tokens) / len(original_tokens)
    return accuracy

# Example usage
search_keyword = input("Enter the news keyword you want to search: ")

related_news = get_related_news(search_keyword)

# Display the related news articles
for article in related_news:
    source_prefix = "TRUSTED: " if article['is_reputed'] else "Source: "
    print(f"{source_prefix}", end="")
    display_colored_link(article['source'], article['is_reputed'])
    print(f"Title: {' '.join(article['title'])}")
    #print(f"Content: {' '.join(article['content'])}")
    print(f"Summary: {article['snippet']}")
    print("------")

    # Calculate relevance and accuracy
    claim_tokens = word_tokenize(clean_text(search_keyword))
    content_tokens = word_tokenize(clean_text(' '.join(article['content'])))
    title_tokens = word_tokenize(clean_text(' '.join(article['title'])))
    snippet_tokens = word_tokenize(clean_text(article['snippet']))
    
    relevance = calculate_relevance(claim_tokens, content_tokens)
    accuracy = calculate_accuracy(title_tokens + snippet_tokens, content_tokens)

    print(f"Relevance: {relevance:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print("------")

Enter the news keyword you want to search: Chandrayaan ISRO
Skipping invalid URL: https://www.nytimes.com/live/2023/08/23/science/india-moon-landing-chandrayaan-3
Source: https://www.independent.co.uk/space/chandrayaan-3-isro-live-moon-landing-time-b2397874.html
Title: chandrayaan live indian space agency achieves historic moon mission landing
Summary: chandrayaan 3 update: Indian moon landing
------
Relevance: 1.00
Accuracy: 0.82
------
Source: https://www.independent.co.uk/space/chandrayaan-3-isro-live-moon-landing-time-b2397874.html
Title: chandrayaan live indian space agency achieves historic moon mission landing
Summary: chandrayaan 3 update: Indian moon landing
------
Relevance: 1.00
Accuracy: 0.82
------


In [3]:
import tkinter as tk
from tkinter import scrolledtext
from tkinter import messagebox
from io import StringIO
import sys

# Create a Tkinter window
window = tk.Tk()
window.title("News Scrapper")

# Create a Text widget for user input
search_keyword_entry = tk.Entry(window, width=50)
search_keyword_entry.pack(pady=10)

# Create a ScrolledText widget to display the results
result_text = scrolledtext.ScrolledText(window, wrap=tk.WORD, width=80, height=20)
result_text.pack(pady=10)

def run_news_scrapper():
    # Redirect standard output to capture the print statements
    output_buffer = StringIO()
    sys.stdout = output_buffer

    # Get the search keyword from the user input
    search_keyword = search_keyword_entry.get().strip()

    # Get related news articles
    related_news = get_related_news(search_keyword)

    # Prepare the result to be displayed in the ScrolledText widget
    result = ""
    for article in related_news:
        source_prefix = "TRUSTED: " if article['is_reputed'] else "Source: "
        result += f"{source_prefix}{article['source']}\n"
        result += f"Title: {' '.join(article['title'])}\n"
        #result += f"Content: {' '.join(article['content'])}\n"
        result += f"Summary: {article['snippet']}\n"
        result += "------\n"

    # Display the result in the ScrolledText widget
    result_text.delete(1.0, tk.END)
    result_text.insert(tk.END, result)

    # Restore standard output
    sys.stdout = sys.__stdout__

def on_exit():
    if messagebox.askyesno("Exit", "Do you want to exit the application?"):
        window.destroy()

# Create a button to run the news scrapper
run_button = tk.Button(window, text="Run News Scrapper", command=run_news_scrapper)
run_button.pack(pady=10)

# Create an Exit button
exit_button = tk.Button(window, text="Exit", command=on_exit)
exit_button.pack(pady=5)

# Start the Tkinter event loop
window.mainloop()


In [None]:
#Handling Ethical issue

import requests
from urllib.parse import urlparse, urljoin

# Set the user-agent header to identify your tool
HEADERS = {
    'User-Agent': 'NewsRetrievalTool/1.0)'
}

def is_allowed_by_robots(url):
    parsed_url = urlparse(url)
    robots_url = urljoin(parsed_url.geturl(), "/robots.txt")
    try:
        response = requests.get(robots_url, headers=HEADERS)
        return "/news" in response.text  # Check if /news is allowed, adjust as needed
    except requests.exceptions.RequestException:
        return True  # Assume allowed if unable to fetch robots.txt

def is_url_valid(url):
    try:
        response = requests.head(url, headers=HEADERS)
        return response.status_code in range(200, 300) and is_allowed_by_robots(url)
    except requests.exceptions.RequestException:
        return False