In [3]:
!pip install GoogleNews selenium

Collecting GoogleNews
  Downloading GoogleNews-1.6.15-py3-none-any.whl.metadata (4.5 kB)
Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting dateparser (from GoogleNews)
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading GoogleNews-1.6.15-py3-none-any.whl (8.8 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py

In [4]:
from GoogleNews import GoogleNews
import pandas as pd

user_request = """("Alphabet" OR "Google" OR "Alphabet Inc.") AND
("financial anomaly" OR "financial irregularity" OR "regulatory filing" OR "legal action" OR
"financial fraud" OR "accounting fraud" OR "sanctions" OR "SEC investigation" OR "audit failure" OR
"class action lawsuit" OR "insider trading" OR "whistleblower complaint" OR "penalty" OR "fine" OR
"restatement" OR "regulatory scrutiny" OR "compliance violation" OR "litigation" OR
"shareholder lawsuit" OR "corporate governance violation" OR "misconduct" OR "stock manipulation")"""

googlenews = GoogleNews(period='7d')
googlenews.search(user_request)

all_results = []
for i in range(1, 10):
    googlenews.getpage(i)
    result = googlenews.result()
    if result:
        for item in result:
            all_results.append(item)
            if len(all_results) >= 10:
                break

df = pd.DataFrame(all_results)
df = df.drop_duplicates(subset=['title'], keep='last')
df.reset_index(drop=True, inplace=True)

In [5]:
import re
data = df.drop(columns = ['media', 'date', 'datetime', 'desc', 'img'])
latest_links = [re.split("&ved", link)[0] for link in df['link']]
print(latest_links)
print(data.shape)
print(data.columns)
for i in latest_links:
    print(i)

['https://www.socialmediatoday.com/news/meta-google-call-for-trump-assistance-fight-australian-fees/743154/', 'https://www.blackenterprise.com/google-settles-28-million-racial-pay-disparity-class-action-lawsuit/', 'https://www.ruetir.com/2025/03/20/europe-accuses-google-of-violating-the-law-and-could-impose-a-historical-fine/', 'https://www.moneycontrol.com/technology/google-apple-hit-by-eu-regulatory-crackdown-article-12969849.html', 'https://www.channelfutures.com/mergers-acquisitions/google-wiz-acquisition-regulatory-challenges', 'https://www.devdiscourse.com/article/technology/3313789-eu-antitrust-crackdown-google-and-apple-face-regulatory-heat', 'https://www.ndtv.com/world-news/google-accused-of-breaching-european-union-rules-risks-fine-7961831', 'https://www.digitalinformationworld.com/2025/03/google-maps-hit-by-10000-fake-listings.html']
(8, 2)
Index(['title', 'link'], dtype='object')
https://www.socialmediatoday.com/news/meta-google-call-for-trump-assistance-fight-australian-fe

In [6]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ProcessPoolExecutor, as_completed
import random
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import tempfile

def get_selenium_page_source(url):
    """Retrieve page content using Selenium in case of 401/403 errors."""
    temp_profile_dir = tempfile.mkdtemp()
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument(f"--user-data-dir={temp_profile_dir}")

    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(random.uniform(1, 3))  # Random delay to avoid detection
        html_content = driver.page_source
    except Exception as e:
        # print(f"Error using Selenium for {url}: {e}")
        html_content = None
    finally:
        driver.quit()
    return html_content

# Function to fetch and process a URL
def fetch_description(url):
    try:
        # Attempt HTTP GET request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
            "Referer": "https://www.google.com/",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive"
        }
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code == 200:
            html_content = response.text

        elif response.status_code in [401, 403]:
            print(f"Using Selenium for {url} due to status {response.status_code}")
            html_content = get_selenium_page_source(url)

        else:
            print(f"Failed to retrieve: {url} (Status code: {response.status_code})")
            return "Failed to retrieve the webpage."

        # Parse HTML content if available
        if html_content:
            soup = BeautifulSoup(html_content, "html.parser")
            paragraphs = soup.find_all("p")
            page_description = " ".join([p.get_text() for p in paragraphs])
            return page_description
        else:
            return "Failed to retrieve the webpage."

    except requests.exceptions.RequestException as e:
        print(f"Error retrieving {url}: {e}")
        return "Failed to retrieve the webpage."


# Fetch all descriptions concurrently using ProcessPoolExecutor
def fetch_all_descriptions(urls, max_workers):
    descriptions = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(fetch_description, url): url for url in urls}

        # Process results as they complete
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                descriptions.append(result)
            except Exception as e:
                # print(f"Error processing {url}: {e}")
                descriptions.append("Failed to retrieve the webpage.")

    return descriptions

# Number of parallel processes to use
num_workers = max(1, os.cpu_count() - 1)

# Start the parallel fetching process
start_time = time.time()
descriptions = fetch_all_descriptions(latest_links, max_workers=num_workers)
end_time = time.time()

# Store results in the data dictionary
# data = {"description": descriptions}

print(f"Fetched {len(descriptions)} descriptions in {end_time - start_time:.2f} seconds.")
data["description"] = descriptions

Using Selenium for https://www.channelfutures.com/mergers-acquisitions/google-wiz-acquisition-regulatory-challenges due to status 403
Using Selenium for https://www.ndtv.com/world-news/google-accused-of-breaching-european-union-rules-risks-fine-7961831 due to status 403
Fetched 8 descriptions in 23.44 seconds.


In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load summarization model (BART or other models)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Summarize text with better handling of large inputs
def summarize_text(text, max_length=500, chunk_size=2000):
    # Split into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Summarize each chunk
    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=max_length, min_length=50, do_sample=False)[0]['summary_text']
        summaries.append(summary)

    # Combine all chunk summaries
    combined_summary = " ".join(summaries)

    # Optional: Hierarchical summarization to refine final summary
    if len(combined_summary) > 2000:  # Chunk limit
        combined_summary = summarizer(combined_summary, max_length=max_length, min_length=50, do_sample=False)[0]['summary_text']

    return combined_summary

input_text = " ".join(data['description'])
summary = summarize_text(input_text)
print(f"Summary: {summary}")

Device set to use cpu
Your max_length is set to 500, but your input_length is only 403. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=201)
Your max_length is set to 500, but your input_length is only 384. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=192)
Your max_length is set to 500, but your input_length is only 419. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=209)
Your max_length is set to 500, but your input_length is only 371. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summariz

Summary:  Meta is among a range of tech companies, including Google, Apple, and X, which co-signed a request for the U.S. Government to help them push back against what they’ve labeled “discriminatory” Australian media laws. Back in 2021, the Australian government implemented its “News Media Bargaining Code” which effectively forces social apps and search engines to pay local publishers.
