In [25]:
import nltk
from textblob import TextBlob
from newspaper import Article
import requests
from urllib.parse import urlparse

# Download required NLTK data (run once)
def setup_nltk():
    """
    Ensure the NLTK resources used by the summarizer are installed

    Each resource is looked up on its own and only the missing ones are
    downloaded. Besides the legacy ``punkt`` models and the ``stopwords``
    corpus, ``punkt_tab`` is ensured as well: this notebook had to fetch it
    manually in a separate cell, which indicates the installed NLTK loads
    its sentence tokenizer from that package.

    Returns:
        None
    """
    # (resource path for nltk.data.find, package id for nltk.download)
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )

    missing = []
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if missing:
        print("Downloading required NLTK data...")
        for package in missing:
            nltk.download(package)

def generate_news_summary(url, summary_sentences=3):
    """
    Generate a summary from a news article URL

    Args:
        url (str): The news article URL
        summary_sentences (int): Number of sentences in the TextBlob summary;
            must be a positive integer (default: 3)

    Returns:
        dict: On success contains url, title, authors, publish_date,
            newspaper_summary, textblob_summary, sentiment (polarity and
            subjectivity), word_count and top_image. On any failure the dict
            is ``{'error': <message>, 'url': url}``; exceptions are not
            propagated to the caller.
    """
    try:
        # Validate URL
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            raise ValueError("Invalid URL format")

        # Reject a non-positive count before any network access: zero used to
        # surface as a division-by-zero message and negatives as an empty summary.
        if not isinstance(summary_sentences, int) or summary_sentences < 1:
            raise ValueError("summary_sentences must be a positive integer")

        # Initialize and download article
        article = Article(url)
        article.download()
        article.parse()

        # Check if article was successfully parsed
        if not article.text:
            raise ValueError("Could not extract text from the article")

        # Use newspaper3k's built-in summarization
        article.nlp()
        newspaper_summary = article.summary

        # Alternative summary using TextBlob and NLTK
        blob = TextBlob(article.text)
        sentences = blob.sentences
        total = len(sentences)

        if total <= summary_sentences:
            # Short article: the whole text is the summary
            textblob_summary = str(blob)
        else:
            # Sample evenly spaced sentences (not ranked ones) so the summary
            # draws from the start, middle and end of the article.
            step = total // summary_sentences
            picked = range(0, total, step)[:summary_sentences]
            textblob_summary = ' '.join(str(sentences[i]) for i in picked)

        # Sentiment analysis using TextBlob
        sentiment = blob.sentiment

        return {
            'url': url,
            'title': article.title,
            'authors': article.authors,
            'publish_date': article.publish_date,
            'newspaper_summary': newspaper_summary,
            'textblob_summary': textblob_summary,
            'sentiment': {
                'polarity': sentiment.polarity,  # -1 (negative) to 1 (positive)
                'subjectivity': sentiment.subjectivity  # 0 (objective) to 1 (subjective)
            },
            'word_count': len(article.text.split()),
            'top_image': article.top_image
        }

    except Exception as e:
        # Boundary handler: callers (print_summary) test for the 'error' key
        # instead of catching exceptions.
        return {
            'error': str(e),
            'url': url
        }

def print_summary(url, summary_sentences=3):
    """
    Fetch, analyse and pretty-print one news article to stdout

    Args:
        url (str): The news article URL
        summary_sentences (int): Sentence count handed to generate_news_summary

    Returns:
        None: everything is written to stdout; when the result carries an
            'error' key only the error line is printed
    """
    # Make sure the tokenizer/corpus data is available before analysing
    setup_nltk()

    print(f"Analyzing: {url}")
    print("=" * 80)

    report = generate_news_summary(url, summary_sentences)
    if 'error' in report:
        print(f"Error: {report['error']}")
        return

    # Article metadata
    print(f"Title: {report['title']}")
    byline = ', '.join(report['authors']) if report['authors'] else 'Unknown'
    print(f"Authors: {byline}")
    print(f"Publish Date: {report['publish_date']}")
    print(f"Word Count: {report['word_count']}")
    banner = "=" * 50
    print("\n" + banner + " SUMMARY " + banner)

    # Both summaries share the same heading / rule / body layout
    rule = "-" * 30
    sections = (
        ("\n📰 Newspaper3k Summary:", 'newspaper_summary'),
        (f"\n🔤 TextBlob Summary ({summary_sentences} sentences):", 'textblob_summary'),
    )
    for heading, key in sections:
        print(heading)
        print(rule)
        print(report[key])

    # Sentiment section
    print("\n💭 Sentiment Analysis:")
    print(rule)
    polarity = report['sentiment']['polarity']
    subjectivity = report['sentiment']['subjectivity']

    # Polarity within [-0.1, 0.1] is treated as neutral
    if polarity > 0.1:
        sentiment_label = "Positive"
    elif polarity < -0.1:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    if subjectivity < 0.5:
        objectivity_label = "Objective"
    else:
        objectivity_label = "Subjective"

    print(f"Sentiment: {sentiment_label} (Polarity: {polarity:.2f})")
    print(f"Tone: {objectivity_label} (Subjectivity: {subjectivity:.2f})")

# Example usage
if __name__ == "__main__":
    # Placeholder article links kept for manual experiments; nothing below reads them
    test_urls = [
        "https://www.bbc.com/news/technology-12345678",  # Replace with real URL
        "https://www.cnn.com/2024/01/01/tech/example-news/index.html"  # Replace with real URL
    ]

    # Banner
    print("News Article Summarizer", "=" * 80, sep="\n")

    # Ask for the article to analyse; blank input means there is nothing to do
    url = input("Enter a news article URL: ")
    if not url.strip():
        print("No URL provided. Please run the script again with a valid news URL.")
    else:
        print_summary(url.strip())

News Article Summarizer
Analyzing: https://www.livemint.com/market/stock-market-news/msci-rejig-swiggy-mazagon-dock-two-others-among-likely-additions-to-india-standard-index-in-august-rebalancing-11749890229453.html
Title: MSCI rejig: Swiggy, Mazagon Dock, two others among likely additions to India Standard Index in August rebalancing
Authors: Ankit Gohel
Publish Date: 2025-06-14 14:17:40+05:30
Word Count: 296


📰 Newspaper3k Summary:
------------------------------
Swiggy, Mazagon Dock Shipbuilders, and two other stocks are expected to be added to the MSCI India Standard Index as part of the upcoming rebalancing scheduled for August 2025.
The last MSCI rebalancing was conducted on May 14, wherein the Coromandel International and FSN E-commerce Ventures, the parent company of the fashion and beauty e-tailer Nykaa, were included in the MSCI India Index, which is part of the MSCI Global Standard Index.
The MSCI India Standard Index captures the performance of the large- and mid-cap segmen

In [2]:
# Fetch the 'punkt_tab' tokenizer package; the setup_nltk() helpers in this
# notebook only download 'punkt' and 'stopwords'.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\faiza\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
import nltk
from textblob import TextBlob
from newspaper import Article
import json
from urllib.parse import urlparse
from datetime import datetime

# Download required NLTK data (run once)
def setup_nltk():
    """
    Ensure the NLTK resources used by the summarizer are installed

    Each resource is looked up on its own and only the missing ones are
    downloaded. Besides the legacy ``punkt`` models and the ``stopwords``
    corpus, ``punkt_tab`` is ensured as well: this notebook had to fetch it
    manually in a separate cell, which indicates the installed NLTK loads
    its sentence tokenizer from that package.

    Returns:
        None
    """
    # (resource path for nltk.data.find, package id for nltk.download)
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )

    missing = []
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if missing:
        print("Downloading required NLTK data...")
        for package in missing:
            nltk.download(package)

def get_news_summary(url, summary_sentences=3, return_json=True):
    """
    Generate a summary from a news article URL and return as JSON

    Args:
        url (str): The news article URL
        summary_sentences (int): Number of sentences in the TextBlob summary;
            must be a positive integer (default: 3)
        return_json (bool): If True, returns JSON string; if False, returns dict

    Returns:
        str or dict: JSON string or dictionary. Successful results carry
            'success': True plus 'url', 'timestamp', 'article', 'summaries',
            'sentiment_analysis' and 'keywords'. Failures carry
            'success': False plus 'error', 'url' and 'timestamp'; exceptions
            are not propagated to the caller.
    """
    def _emit(payload):
        # Single exit point so every path honours return_json the same way
        return json.dumps(payload, indent=2, default=str) if return_json else payload

    def _failure(message):
        return _emit({
            'success': False,
            'error': message,
            'url': url,
            'timestamp': datetime.now().isoformat()
        })

    try:
        # Cheap input checks first, before any download is attempted
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return _failure('Invalid URL format')

        # Zero used to surface as a division-by-zero message and negative
        # values as an empty summary; report the real problem instead.
        if not isinstance(summary_sentences, int) or summary_sentences < 1:
            return _failure('summary_sentences must be a positive integer')

        # Setup NLTK if needed
        setup_nltk()

        # Initialize and download article
        article = Article(url)
        article.download()
        article.parse()

        # Check if article was successfully parsed
        if not article.text:
            return _failure('Could not extract text from the article')

        # Use newspaper3k's built-in summarization
        article.nlp()
        newspaper_summary = article.summary

        # Alternative summary using TextBlob and NLTK
        blob = TextBlob(article.text)
        sentences = blob.sentences
        total = len(sentences)

        if total <= summary_sentences:
            # Short article: the whole text is the summary
            textblob_summary = str(blob)
        else:
            # Sample evenly spaced sentences (not ranked ones) so the summary
            # draws from the start, middle and end of the article.
            step = total // summary_sentences
            picked = range(0, total, step)[:summary_sentences]
            textblob_summary = ' '.join(str(sentences[i]) for i in picked)

        # Sentiment analysis using TextBlob
        sentiment = blob.sentiment

        # Polarity within [-0.1, 0.1] is treated as neutral
        sentiment_label = "neutral"
        if sentiment.polarity > 0.1:
            sentiment_label = "positive"
        elif sentiment.polarity < -0.1:
            sentiment_label = "negative"

        # Determine objectivity label
        objectivity_label = "objective" if sentiment.subjectivity < 0.5 else "subjective"

        # Prepare successful result
        result = {
            'success': True,
            'url': url,
            'timestamp': datetime.now().isoformat(),
            'article': {
                'title': article.title or 'No title found',
                'authors': article.authors or [],
                'publish_date': article.publish_date.isoformat() if article.publish_date else None,
                'word_count': len(article.text.split()),
                'top_image': article.top_image or None
            },
            'summaries': {
                'newspaper3k': newspaper_summary or 'No summary generated',
                'textblob': textblob_summary or 'No summary generated'
            },
            'sentiment_analysis': {
                'polarity': round(sentiment.polarity, 3),
                'subjectivity': round(sentiment.subjectivity, 3),
                'sentiment_label': sentiment_label,
                'objectivity_label': objectivity_label
            },
            'keywords': article.keywords[:10] if hasattr(article, 'keywords') and article.keywords else []
        }

        return _emit(result)

    except Exception as e:
        # Boundary handler: any download/parse/NLP failure becomes an error result
        return _failure(str(e))

# Example usage
if __name__ == "__main__":
    # Test the function
    test_url = "https://www.bbc.com/news/technology"  # Replace with a real news URL

    # Get JSON result
    json_result = get_news_summary(test_url)
    print("JSON Result:")
    print(json_result)

    # Get dict result (for programmatic use).
    # The argument is the plain name: backticks around it are a syntax error in Python 3.
    dict_result = get_news_summary(test_url, return_json=False)
    print("\nDict Result (success status):", dict_result.get('success'))

    # Interactive example
    print("\n" + "="*60)
    print("Interactive News Summarizer")
    print("="*60)

    # Blank input skips the interactive run
    url = input("Enter a news article URL (or press Enter to skip): ")
    if url.strip():
        result = get_news_summary(url.strip())
        print("\nResult:")
        print(result)


Interactive News Summarizer


In [8]:
json_result = get_news_summary("https://www.financialexpress.com/business/brandwagon-homeessentials-raises-2-2m-from-india-quotient-plans-omnichannel-expansion-3878977")
print(json_result)

{
  "success": true,
  "url": "https://www.financialexpress.com/business/brandwagon-homeessentials-raises-2-2m-from-india-quotient-plans-omnichannel-expansion-3878977",
  "timestamp": "2025-06-16T11:18:15.118398",
  "article": {
    "title": "HomeEssentials raises $2.2M from India Quotient, plans omnichannel expansion",
    "authors": [
      "Brandwagon Online"
    ],
    "publish_date": null,
    "word_count": 281,
    "top_image": "https://www.financialexpress.com/wp-content/uploads/2025/06/life-47.png"
  },
  "summaries": {
    "newspaper3k": "HomeEssentials, a home and kitchen brand known for its modern, multi-use products, has raised $2.2 million in funding from early-stage venture capital firm India Quotient.\nThe company plans to use the capital to expand into offline retail, aiming to merge its digital success with a physical footprint across metros and smaller cities.\nFounded by entrepreneurs Tanishq and Divyam, the brand offers more than 250 products designed with utility, 

In [16]:
# save this as redirect_resolver.py or in a cell
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

def resolve_redirected_url(url: str, wait_seconds: float = 5) -> str:
    """
    Given a Google News redirect URL, resolves and returns the final redirected URL.

    A headless Chrome session loads the page, pauses so the redirect can run,
    and then reports the address the browser ended up on.

    Args:
        url (str): The redirect URL to open
        wait_seconds (float): Pause after the page load before reading the
            browser's current URL (default: 5, the previously hard-coded value)

    Returns:
        str: The browser's current URL after the pause
    """
    # Setup Chrome options (headless)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    # Start WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        # Wait for the redirect to complete; a non-positive value skips the pause
        if wait_seconds > 0:
            time.sleep(wait_seconds)
        final_url = driver.current_url
    finally:
        # Always release the browser process, even if loading failed
        driver.quit()

    return final_url


In [20]:
import nltk
from textblob import TextBlob
from newspaper import Article
import json
from urllib.parse import urlparse
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

# Download required NLTK data (run once)
def setup_nltk():
    """
    Ensure the NLTK resources used by the summarizer are installed

    Each resource is looked up on its own and only the missing ones are
    downloaded. Besides the legacy ``punkt`` models and the ``stopwords``
    corpus, ``punkt_tab`` is ensured as well: this notebook had to fetch it
    manually in a separate cell, which indicates the installed NLTK loads
    its sentence tokenizer from that package.

    Returns:
        None
    """
    # (resource path for nltk.data.find, package id for nltk.download)
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )

    missing = []
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if missing:
        print("Downloading required NLTK data...")
        for package in missing:
            nltk.download(package)

def resolve_redirected_url(url: str, wait_seconds: float = 5) -> str:
    """
    Given a Google News redirect URL, resolves and returns the final redirected URL.

    A headless Chrome session loads the page, pauses so the redirect can run,
    and then reports the address the browser ended up on.

    Args:
        url (str): The redirect URL to open
        wait_seconds (float): Pause after the page load before reading the
            browser's current URL (default: 5, the previously hard-coded value)

    Returns:
        str: The browser's current URL after the pause
    """
    # Setup Chrome options (headless)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    # Start WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        # Wait for the redirect to complete; a non-positive value skips the pause
        if wait_seconds > 0:
            time.sleep(wait_seconds)
        final_url = driver.current_url
    finally:
        # Always release the browser process, even if loading failed
        driver.quit()

    return final_url


def get_news_summary(url, summary_sentences=3, return_json=True):
    """
    Generate a summary from a news article URL and return as JSON

    The URL is first opened in a headless browser so that redirect links
    (e.g. Google News) are followed to the real article page.

    Args:
        url (str): The news article URL (may be a redirect link)
        summary_sentences (int): Number of sentences in the TextBlob summary;
            must be a positive integer (default: 3)
        return_json (bool): If True, returns JSON string; if False, returns dict

    Returns:
        str or dict: JSON string or dictionary. Successful results carry
            'success': True plus 'url', 'timestamp', 'article', 'summaries',
            'sentiment_analysis' and 'keywords'. Failures carry
            'success': False plus 'error', 'url' and 'timestamp'; exceptions,
            including browser start-up failures, are not propagated.
            'url' is the resolved article URL once the redirect has been
            followed, otherwise the URL as given.
    """
    # Until the redirect has been followed, results report the URL as given
    final_url = url

    def _emit(payload):
        # Single exit point so every path honours return_json the same way
        return json.dumps(payload, indent=2, default=str) if return_json else payload

    def _failure(message):
        return _emit({
            'success': False,
            'error': message,
            'url': final_url,
            'timestamp': datetime.now().isoformat()
        })

    try:
        # Cheap input checks first, so a malformed URL never launches a browser
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return _failure('Invalid URL format')

        # Zero used to surface as a division-by-zero message and negative
        # values as an empty summary; report the real problem instead.
        if not isinstance(summary_sentences, int) or summary_sentences < 1:
            return _failure('summary_sentences must be a positive integer')

        # Setup NLTK if needed
        setup_nltk()

        # Resolved inside the try block so WebDriver errors become error results
        final_url = resolve_redirected_url(url)

        # Validate the address the browser actually landed on
        parsed_url = urlparse(final_url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return _failure('Invalid URL format')

        # Initialize and download article
        article = Article(final_url)
        article.download()
        article.parse()

        # Check if article was successfully parsed
        if not article.text:
            return _failure('Could not extract text from the article')

        # Use newspaper3k's built-in summarization
        article.nlp()
        newspaper_summary = article.summary

        # Alternative summary using TextBlob and NLTK
        blob = TextBlob(article.text)
        sentences = blob.sentences
        total = len(sentences)

        if total <= summary_sentences:
            # Short article: the whole text is the summary
            textblob_summary = str(blob)
        else:
            # Sample evenly spaced sentences (not ranked ones) so the summary
            # draws from the start, middle and end of the article.
            step = total // summary_sentences
            picked = range(0, total, step)[:summary_sentences]
            textblob_summary = ' '.join(str(sentences[i]) for i in picked)

        # Sentiment analysis using TextBlob
        sentiment = blob.sentiment

        # Polarity within [-0.1, 0.1] is treated as neutral
        sentiment_label = "neutral"
        if sentiment.polarity > 0.1:
            sentiment_label = "positive"
        elif sentiment.polarity < -0.1:
            sentiment_label = "negative"

        # Determine objectivity label
        objectivity_label = "objective" if sentiment.subjectivity < 0.5 else "subjective"

        # Prepare successful result; 'url' is the resolved address, matching
        # what the error results report
        result = {
            'success': True,
            'url': final_url,
            'timestamp': datetime.now().isoformat(),
            'article': {
                'title': article.title or 'No title found',
                'authors': article.authors or [],
                'publish_date': article.publish_date.isoformat() if article.publish_date else None,
                'word_count': len(article.text.split()),
                'top_image': article.top_image or None
            },
            'summaries': {
                'newspaper3k': newspaper_summary or 'No summary generated',
                'textblob': textblob_summary or 'No summary generated'
            },
            'sentiment_analysis': {
                'polarity': round(sentiment.polarity, 3),
                'subjectivity': round(sentiment.subjectivity, 3),
                'sentiment_label': sentiment_label,
                'objectivity_label': objectivity_label
            },
            'keywords': article.keywords[:10] if hasattr(article, 'keywords') and article.keywords else []
        }

        return _emit(result)

    except Exception as e:
        # Boundary handler: browser/download/parse/NLP failures become error results
        return _failure(str(e))

# Example usage
if __name__ == "__main__":
    # Google News redirect link used as the sample input
    test_url = "https://news.google.com/read/CBMi0gFBVV95cUxPZ21qcVdFamVObDZmbUZNQ1c2VXExazN4blNYQUh4WGUzX3ZSNGMwUDcxVlRXeDlQMnIyeEx2ZVVicGJmeE5IWEFCSFNpTFUtZDFXR1FRWVBGWFhHMm5hYjRSUXp1MzNfNUFPVWRaZjZaVTYyc2ZwMmcxcjd5TXlWWlpoNUF6RTQzaE9jcklLS0pacVhvWGFDNXRPS0I3YTZpMDNoRzdvQXhPdTl3RHRxN3Rfa0s4M0NGU3lKNVJDaG5QSjlxRHVtVmNqdFk5VUN6MGfSAdcBQVVfeXFMUGlVMnk2dm9lWklWYW0yMUpkbzZiakpSZmI4TjlqanVzMGR3Yl80QnhibjFLSWpjV3B4cm04VXpTMllWcDZBeTdNOUUzaWRjVzZTWXBaQ0FReGdxNkhBTElXb2VfWDU1VEhsSGpQQlJucmhsclpfR2IwR194bXQxSTdDM3EtYUo2MXlobTNpRTFubjJUSXlUVy1sbzJQVXVZMUZGT2p0Z1FwN2tjb3ZIMEFLSVZiYVhsMWdYQUZuT092VnJOSjFFY0dnT2tBVWc2dk5PT2Fpck0?hl=en-IN&gl=IN&ceid=IN%3Aen"  # Replace with a real news URL

    # First run: default mode, result comes back as a JSON string
    json_result = get_news_summary(test_url)
    print("JSON Result:", json_result, sep="\n")

    # Second run: dict mode, convenient for programmatic access
    dict_result = get_news_summary(test_url, return_json=False)
    print("\nDict Result (success status):", dict_result.get('success'))

    # Banner for the interactive part
    print()
    print("=" * 60)
    print("Interactive News Summarizer")
    print("=" * 60)

    # Blank input skips the interactive run
    url = input("Enter a news article URL (or press Enter to skip): ")
    if url.strip():
        result = get_news_summary(url.strip())
        print("\nResult:", result, sep="\n")

JSON Result:
{
  "success": false,
  "error": "Could not extract text from the article",
  "url": "https://www.business-standard.com/markets/news/msci-rejig-swiggy-mazagon-dock-among-4-entrants-850-mn-inflows-likely-125061600250_1.html",
  "timestamp": "2025-06-16T11:52:36.714758"
}

Dict Result (success status): False

Interactive News Summarizer

Result:
{
  "success": false,
  "error": "Could not extract text from the article",
  "url": "https://www.business-standard.com/markets/news/msci-rejig-swiggy-mazagon-dock-among-4-entrants-850-mn-inflows-likely-125061600250_1.html",
  "timestamp": "2025-06-16T11:53:03.413878"
}


In [None]:
from redirect_resolver import resolve_redirected_url

# Sample Google News redirect link to resolve
url = "https://news.google.com/read/CBMi0gFBVV95cUxPZ21qcVdFamVObDZmbUZNQ1c2VXExazN4blNYQUh4WGUzX3ZSNGMwUDcxVlRXeDlQMnIyeEx2ZVVicGJmeE5IWEFCSFNpTFUtZDFXR1FRWVBGWFhHMm5hYjRSUXp1MzNfNUFPVWRaZjZaVTYyc2ZwMmcxcjd5TXlWWlpoNUF6RTQzaE9jcklLS0pacVhvWGFDNXRPS0I3YTZpMDNoRzdvQXhPdTl3RHRxN3Rfa0s4M0NGU3lKNVJDaG5QSjlxRHVtVmNqdFk5VUN6MGfSAdcBQVVfeXFMUGlVMnk2dm9lWklWYW0yMUpkbzZiakpSZmI4TjlqanVzMGR3Yl80QnhibjFLSWpjV3B4cm04VXpTMllWcDZBeTdNOUUzaWRjVzZTWXBaQ0FReGdxNkhBTElXb2VfWDU1VEhsSGpQQlJucmhsclpfR2IwR194bXQxSTdDM3EtYUo2MXlobTNpRTFubjJUSXlUVy1sbzJQVXVZMUZGT2p0Z1FwN2tjb3ZIMEFLSVZiYVhsMWdYQUZuT092VnJOSjFFY0dnT2tBVWc2dk5PT2Fpck0?hl=en-IN&gl=IN&ceid=IN%3Aen"  # your Google News URL
# Follow the redirect with the helper imported from redirect_resolver and show where it leads
final_url = resolve_redirected_url(url)
print("Final URL:", final_url)


In [27]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import json
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import os

# Imports used by the news summary function
import nltk
from textblob import TextBlob
from newspaper import Article
from urllib.parse import urlparse

# NLTK setup, redirect resolver, and the get_news_summary function
def setup_nltk():
    """
    Ensure the NLTK resources used by the summarizer are installed

    Each resource is looked up on its own and only the missing ones are
    downloaded. Besides the legacy ``punkt`` models and the ``stopwords``
    corpus, ``punkt_tab`` is ensured as well: this notebook had to fetch it
    manually in a separate cell, which indicates the installed NLTK loads
    its sentence tokenizer from that package.

    Returns:
        None
    """
    # (resource path for nltk.data.find, package id for nltk.download)
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )

    missing = []
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if missing:
        print("Downloading required NLTK data...")
        for package in missing:
            nltk.download(package)

def resolve_redirected_url(url: str, wait_seconds: float = 5) -> str:
    """
    Given a Google News redirect URL, resolves and returns the final redirected URL.

    A headless Chrome session loads the page, pauses so the redirect can run,
    and then reports the address the browser ended up on.

    Args:
        url (str): The redirect URL to open
        wait_seconds (float): Pause after the page load before reading the
            browser's current URL (default: 5, the previously hard-coded value)

    Returns:
        str: The browser's current URL after the pause
    """
    # Setup Chrome options (headless)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    # Start WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        # Wait for the redirect to complete; a non-positive value skips the pause
        if wait_seconds > 0:
            time.sleep(wait_seconds)
        final_url = driver.current_url
    finally:
        # Always release the browser process, even if loading failed
        driver.quit()

    return final_url


def get_news_summary(url, summary_sentences=3, return_json=False):
    """
    Generate a summary from a news article URL and return as dict

    The URL is first opened in a headless browser so that redirect links
    (e.g. Google News) are followed to the real article page; the resolved
    address is printed to stdout.

    Args:
        url (str): The news article URL (may be a redirect link)
        summary_sentences (int): Number of sentences in the TextBlob summary;
            must be a positive integer (default: 3)
        return_json (bool): If True, returns JSON string; if False, returns dict

    Returns:
        dict or str: Dictionary by default, JSON string when return_json is
            True. Successful results carry 'success': True plus 'url',
            'timestamp', 'article', 'summaries', 'sentiment_analysis' and
            'keywords'. Failures carry 'success': False plus 'error', 'url'
            and 'timestamp'; exceptions, including browser start-up failures,
            are not propagated. 'url' is the resolved article URL once the
            redirect has been followed, otherwise the URL as given.
    """
    # Until the redirect has been followed, results report the URL as given
    final_url = url

    def _emit(payload):
        # Single exit point so every path honours return_json the same way
        return json.dumps(payload, indent=2, default=str) if return_json else payload

    def _failure(message):
        return _emit({
            'success': False,
            'error': message,
            'url': final_url,
            'timestamp': datetime.now().isoformat()
        })

    try:
        # Cheap input checks first, so a malformed URL never launches a browser
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return _failure('Invalid URL format')

        # Zero used to surface as a division-by-zero message and negative
        # values as an empty summary; report the real problem instead.
        if not isinstance(summary_sentences, int) or summary_sentences < 1:
            return _failure('summary_sentences must be a positive integer')

        # Setup NLTK if needed
        setup_nltk()

        # Resolved inside the try block so WebDriver errors become error results
        final_url = resolve_redirected_url(url)
        print(f"Resolved URL: {final_url}")

        # Validate the address the browser actually landed on
        parsed_url = urlparse(final_url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return _failure('Invalid URL format')

        # Initialize and download article
        article = Article(final_url)
        article.download()
        article.parse()

        # Check if article was successfully parsed
        if not article.text:
            return _failure('Could not extract text from the article')

        # Use newspaper3k's built-in summarization
        article.nlp()
        newspaper_summary = article.summary

        # Alternative summary using TextBlob and NLTK
        blob = TextBlob(article.text)
        sentences = blob.sentences
        total = len(sentences)

        if total <= summary_sentences:
            # Short article: the whole text is the summary
            textblob_summary = str(blob)
        else:
            # Sample evenly spaced sentences (not ranked ones) so the summary
            # draws from the start, middle and end of the article.
            step = total // summary_sentences
            picked = range(0, total, step)[:summary_sentences]
            textblob_summary = ' '.join(str(sentences[i]) for i in picked)

        # Sentiment analysis using TextBlob
        sentiment = blob.sentiment

        # Polarity within [-0.1, 0.1] is treated as neutral
        sentiment_label = "neutral"
        if sentiment.polarity > 0.1:
            sentiment_label = "positive"
        elif sentiment.polarity < -0.1:
            sentiment_label = "negative"

        # Determine objectivity label
        objectivity_label = "objective" if sentiment.subjectivity < 0.5 else "subjective"

        # Prepare successful result
        result = {
            'success': True,
            'url': final_url,
            'timestamp': datetime.now().isoformat(),
            'article': {
                'title': article.title or 'No title found',
                'authors': article.authors or [],
                'publish_date': article.publish_date.isoformat() if article.publish_date else None,
                'word_count': len(article.text.split()),
                'top_image': article.top_image or None
            },
            'summaries': {
                'newspaper3k': newspaper_summary or 'No summary generated',
                'textblob': textblob_summary or 'No summary generated'
            },
            'sentiment_analysis': {
                'polarity': round(sentiment.polarity, 3),
                'subjectivity': round(sentiment.subjectivity, 3),
                'sentiment_label': sentiment_label,
                'objectivity_label': objectivity_label
            },
            'keywords': article.keywords[:10] if hasattr(article, 'keywords') and article.keywords else []
        }

        return _emit(result)

    except Exception as e:
        # Boundary handler: browser/download/parse/NLP failures become error results
        return _failure(str(e))

class ComprehensiveNewsScraper:
    def __init__(self, headless=True):
        """Prepare the Chrome options and HTTP headers used by the scraper.

        Args:
            headless (bool): When True, the ``--headless`` flag is added to
                the Chrome options before the other flags.
        """
        browser_flags = ["--headless"] if headless else []
        browser_flags += [
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        ]

        self.chrome_options = Options()
        for flag in browser_flags:
            self.chrome_options.add_argument(flag)

        # Browser-like headers passed to every `requests.get` call in this class.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def scrape_google_news_articles(self, company_name, max_articles=5):
        """
        Scrape the Google News search page for articles about a company.

        The search page is fetched with ``requests`` and parsed with
        BeautifulSoup. Every field that cannot be located in a result card
        keeps its "... not found" placeholder string.

        Args:
            company_name (str): Name of the company to search for
            max_articles (int): Maximum number of articles to scrape

        Returns:
            list: One dict per article with the keys 'google_news_url',
            'date', 'author', 'source_title', 'source_image_url',
            'content_image_url', 'text_content' and 'article_title'.
            An empty list is returned when the request fails, parsing fails,
            or no article containers are found.
        """
        google_news_root = "https://news.google.com/"
        # The fallback text preview stops growing once it exceeds this length.
        preview_limit = 200

        # Construct the Google News search URL
        base_url = "https://news.google.com/search"
        params = {
            'q': company_name,
            'hl': 'en-IN',
            'gl': 'IN',
            'ceid': 'IN:en'
        }

        search_url = f"{base_url}?{urllib.parse.urlencode(params)}"

        try:
            print(f"Searching for news articles about: {company_name}")
            print(f"URL: {search_url}")
            print("-" * 50)

            # Make the request
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all article containers
            article_containers = soup.find_all('article') or soup.find_all(class_="xrnccd")

            if not article_containers:
                # Fallback: climb from each headline link to the nearest
                # article-like ancestor (at most 10 levels up). Links without
                # such an ancestor are skipped instead of contributing an
                # arbitrary outer element, and the walk stops at the document
                # root instead of calling find_parent() on None.
                article_containers = []
                for element in soup.find_all(class_="WwrzSb"):
                    container = element.find_parent()
                    for _ in range(10):
                        if container is None:
                            break
                        if container.name == 'article' or 'article' in str(container.get('class', [])):
                            article_containers.append(container)
                            break
                        container = container.find_parent()

            if not article_containers:
                print("No article containers found. The page structure might have changed.")
                return []

            # Extract basic data from articles
            articles_data = []
            for i, container in enumerate(article_containers[:max_articles]):

                article_data = {
                    'google_news_url': 'URL not found',
                    'date': 'Date not found',
                    'author': 'Author not found',
                    'source_title': 'Source not found',
                    'source_image_url': 'Image not found',
                    'content_image_url': 'Content image not found',
                    'text_content': 'Content not found',
                    'article_title': 'Title not found'
                }

                # Extract URL; relative links are resolved against the site root
                url_element = container.find(class_="WwrzSb") or container.find(class_="JtKRv")
                href = url_element.get('href') if url_element else None
                if href:
                    article_data['google_news_url'] = urllib.parse.urljoin(google_news_root, href)

                # Extract source information
                source_container = container.find(class_="oovtQ")
                if source_container:
                    img_element = source_container.find('img')
                    if img_element:
                        img_src = img_element.get('src') or img_element.get('data-src')
                        if img_src:
                            article_data['source_image_url'] = img_src

                    source_text = source_container.get_text(strip=True)
                    if source_text:
                        article_data['source_title'] = source_text

                # Extract content image. A CSS selector matches elements that
                # carry both classes regardless of order or extra classes,
                # whereas class_="Quavad vwBmvb" only matches that exact string.
                content_image_element = container.select_one('.Quavad.vwBmvb')
                if content_image_element:
                    img_tag = content_image_element.find('img')
                    if img_tag:
                        img_src = (img_tag.get('src') or
                                   img_tag.get('data-src') or
                                   img_tag.get('data-lazy-src') or
                                   img_tag.get('srcset', '').split(',')[0].strip().split(' ')[0])

                        if img_src:
                            if img_src.startswith('http'):
                                article_data['content_image_url'] = img_src
                            elif img_src.startswith('//'):
                                article_data['content_image_url'] = f"https:{img_src}"
                            elif img_src.startswith(('./', '/')):
                                article_data['content_image_url'] = urllib.parse.urljoin(google_news_root, img_src)

                # Extract date and author
                metadata_container = container.find(class_="UOVeFe")
                if metadata_container:
                    date_element = metadata_container.find(class_="hvbAAd")
                    if date_element:
                        article_data['date'] = date_element.get_text(strip=True)

                    author_element = metadata_container.find(class_="bInasb")
                    if author_element:
                        article_data['author'] = author_element.get_text(strip=True)

                # Extract article title
                title_element = (container.find('h3') or
                                 container.find('h4') or
                                 container.find(class_="JtKRv") or
                                 container.find(class_="mCBkyc"))
                if title_element:
                    article_data['article_title'] = title_element.get_text(strip=True)

                # Extract text content/summary
                content_selectors = [
                    'div[class*="snippet"]',
                    'div[class*="summary"]',
                    'div[class*="description"]',
                    '.st',
                    'span[class*="snippet"]'
                ]

                for selector in content_selectors:
                    content_element = container.select_one(selector)
                    if content_element:
                        article_data['text_content'] = content_element.get_text(strip=True)
                        break

                if article_data['text_content'] == 'Content not found':
                    # No snippet element: use the leading words of the card's
                    # text, adding words until the preview exceeds the limit.
                    all_text = container.get_text(separator=' ', strip=True)
                    content_parts = []
                    preview_length = 0
                    for part in all_text.split():
                        if preview_length > preview_limit:
                            break
                        # +1 accounts for the joining space after the first word
                        preview_length += len(part) + (1 if content_parts else 0)
                        content_parts.append(part)

                    if content_parts:
                        article_data['text_content'] = ' '.join(content_parts)

                articles_data.append(article_data)

                print(f"Found Article {i+1}: {article_data['article_title']}")

            return articles_data

        except requests.RequestException as e:
            print(f"Error making request: {e}")
            return []
        except Exception as e:
            # Best-effort scraper: any parsing problem yields an empty result.
            print(f"Error parsing content: {e}")
            return []

    def get_redirect_url(self, google_news_url):
        """Follow HTTP redirects from a Google News link and return the final URL.

        Args:
            google_news_url (str): The Google News article link.

        Returns:
            str: ``response.url`` after redirects, or the input URL unchanged
            when the request fails.
        """
        try:
            response = requests.get(google_news_url, headers=self.headers, allow_redirects=True, timeout=10)
            return response.url
        except requests.RequestException:
            # Best effort: keep the original link on network/URL errors, but
            # do not swallow KeyboardInterrupt/SystemExit like a bare except.
            return google_news_url

    def get_news_summary_from_external(self, url):
        """Delegate to the module-level ``get_news_summary`` function.

        Args:
            url (str): Article URL to summarize.

        Returns:
            Whatever ``get_news_summary(url, return_json=False)`` returns, or
            None if that call raised an exception.
        """
        try:
            print(f"Calling get_news_summary for: {url}")
            return get_news_summary(url, return_json=False)
        except Exception as exc:
            print(f"Error calling get_news_summary: {exc!s}")
            return None

    def extract_detailed_article_data(self, url, include_full_content=True):
        """Extract comprehensive article data from URL using external summary function"""
        try:
            print(f"Getting summary from external function for: {url}")
            
            # Call the external summary function
            summary_result = self.get_news_summary_from_external(url)
            
            if not summary_result or not summary_result.get('success'):
                print("✗ Failed to get summary from external function")
                print(summary_result)
                return None
            
            # Extract data from the external function result
            detailed_data = {
                'final_url': summary_result.get('url', url),
                'scraped_at': summary_result.get('timestamp', datetime.now().isoformat()),
                'detailed_title': summary_result.get('article', {}).get('title', 'Title not found'),
                'detailed_author': summary_result.get('article', {}).get('authors', ['Author not found']),
                'detailed_publish_date': summary_result.get('article', {}).get('publish_date', 'Date not found'),
                'word_count': summary_result.get('article', {}).get('word_count', 0),
                'main_image_url': summary_result.get('article', {}).get('top_image', 'Image not found'),
                'content_summary': summary_result.get('summaries', {}).get('newspaper3k', 'Summary not available'),
                'textblob_summary': summary_result.get('summaries', {}).get('textblob', 'TextBlob summary not available'),
                'sentiment_analysis': summary_result.get('sentiment_analysis', {}),
                'keywords': summary_result.get('keywords', []),
                'external_summary_success': True
            }
            
            # Add full content flag but don't include actual full content since we're using external summary
            if include_full_content:
                detailed_data['full_content_note'] = "Full content extraction skipped - using external summary function"
            
            return detailed_data
            
        except Exception as e:
            print(f"Error extracting detailed article data: {str(e)}")
            return None

    def scrape_comprehensive_news(self, company_name, max_articles=3, extract_full_content=True):
        """
        Main method to scrape comprehensive news data using external summary function
        
        Args:
            company_name (str): Company name to search for
            max_articles (int): Maximum number of articles to process
            extract_full_content (bool): Flag for compatibility (not used with external summary)
        
        Returns:
            dict: Comprehensive news data
        """
        print(f"Starting comprehensive news scraping for: {company_name}")
        print(f"Using external get_news_summary function")
        print("="*80)
        
        # Step 1: Get basic articles from Google News
        basic_articles = self.scrape_google_news_articles(company_name, max_articles)
        
        if not basic_articles:
            return {
                'search_query': company_name,
                'scraped_at': datetime.now().isoformat(),
                'total_articles_found': 0,
                'articles': []
            }
        
        # Step 2: Extract detailed content for each article using external function
        comprehensive_articles = []
        
        for i, basic_article in enumerate(basic_articles):
            print(f"\nProcessing article {i+1}/{len(basic_articles)}")
            print("-"*50)
            
            # Merge basic data
            comprehensive_article = {
                'article_id': i + 1,
                'google_news_data': basic_article,
                'detailed_data': None,
                'extraction_success': False
            }
            
            if basic_article['google_news_url'] != 'URL not found':
                # Get the actual article URL
                try:
                    actual_url = self.get_redirect_url(basic_article['google_news_url'])
                    print(f"Actual URL: {actual_url}")
                    
                    # Extract detailed content using external summary function
                    detailed_data = self.extract_detailed_article_data(actual_url, extract_full_content)
                    
                    if detailed_data:
                        comprehensive_article['detailed_data'] = detailed_data
                        comprehensive_article['extraction_success'] = True
                        print("✓ Successfully extracted summary using external function")
                        
                        # Print the newspaper3k summary
                        newspaper_summary = detailed_data.get('content_summary', 'No summary available')
                        print(f"✓ Newspaper3k Summary: {newspaper_summary[:100]}...")
                    else:
                        print("✗ Failed to extract content using external function")
                        
                except Exception as e:
                    print(f"✗ Error processing article: {str(e)}")
            
            comprehensive_articles.append(comprehensive_article)
            
            # Add delay between requests
            time.sleep(2)
        
        # Compile final result
        result = {
            'search_query': company_name,
            'scraped_at': datetime.now().isoformat(),
            'total_articles_found': len(comprehensive_articles),
            'successful_extractions': sum(1 for article in comprehensive_articles if article['extraction_success']),
            'summary_method': 'external_get_news_summary_function',
            'articles': comprehensive_articles
        }
        
        return result

    def save_comprehensive_data(self, data, filename=None):
        """Save comprehensive data to JSON file"""
        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"comprehensive_news_external_summary_{data['search_query'].replace(' ', '_')}_{timestamp}.json"
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        
        print(f"\nComprehensive news data saved to: {filename}")
        return filename

    def print_summary(self, data):
        """Print a summary of the scraped data"""
        print("\n" + "="*100)
        print("COMPREHENSIVE NEWS SCRAPING SUMMARY (Using External Summary Function)")
        print("="*100)
        print(f"Search Query: {data['search_query']}")
        print(f"Scraped At: {data['scraped_at']}")
        print(f"Total Articles Found: {data['total_articles_found']}")
        print(f"Successful External Extractions: {data['successful_extractions']}")
        print(f"Summary Method: {data.get('summary_method', 'external_function')}")
        print("-"*100)
        
        for article in data['articles']:
            print(f"\nArticle {article['article_id']}:")
            print(f"  Title: {article['google_news_data']['article_title']}")
            print(f"  Source: {article['google_news_data']['source_title']}")
            print(f"  Date: {article['google_news_data']['date']}")
            print(f"  External Summary Extraction: {'✓' if article['extraction_success'] else '✗'}")
            
            if article['detailed_data']:
                # Print newspaper3k summary
                newspaper_summary = article['detailed_data'].get('content_summary', 'No summary available')
                if newspaper_summary != 'No summary available':
                    summary_preview = newspaper_summary[:150]
                    print(f"  Newspaper3k Summary: {summary_preview}...")
                
                # Print additional extracted data
                word_count = article['detailed_data'].get('word_count', 0)
                if word_count > 0:
                    print(f"  Word Count: {word_count}")
                
                keywords = article['detailed_data'].get('keywords', [])
                if keywords:
                    print(f"  Keywords: {', '.join(keywords[:5])}")
                
                sentiment = article['detailed_data'].get('sentiment_analysis', {})
                if sentiment:
                    sentiment_label = sentiment.get('sentiment_label', 'unknown')
                    polarity = sentiment.get('polarity', 0)
                    print(f"  Sentiment: {sentiment_label} (polarity: {polarity})")
        
        print("="*100)

def main():
    """Run the interactive comprehensive news scraper with external summary.

    Prompts for a company name and an article limit, runs
    ``ComprehensiveNewsScraper.scrape_comprehensive_news``, prints the summary
    and optionally saves the result to a JSON file.
    """
    default_max_articles = 3

    print("Enhanced Comprehensive News Scraper (Using External Summary Function)")
    print("="*70)

    # Get user input
    company_name = input("Enter the company name to search for: ").strip()

    if not company_name:
        print("Please enter a valid company name.")
        return

    raw_limit = input("Enter max number of articles to process (default: 3): ").strip()
    try:
        max_articles = int(raw_limit or "3")
    except ValueError:
        max_articles = default_max_articles
    if max_articles < 1:
        # The limit is later used as a slice bound: zero would select nothing
        # and a negative value would silently drop articles from the end.
        max_articles = default_max_articles

    print("Note: Using external get_news_summary function for content extraction and summarization")

    # Initialize scraper
    scraper = ComprehensiveNewsScraper(headless=True)

    # Scrape comprehensive news data
    print("\nStarting comprehensive scraping...")
    comprehensive_data = scraper.scrape_comprehensive_news(
        company_name,
        max_articles=max_articles,
        extract_full_content=True  # This flag is maintained for compatibility but not used
    )

    # Print summary
    scraper.print_summary(comprehensive_data)

    # Save to file
    save_choice = input("\nSave results to JSON file? (y/n): ").strip().lower()
    if save_choice == 'y':
        # An empty answer means "use the generated default filename".
        filename = input("Enter filename (press Enter for default): ").strip() or None

        saved_file = scraper.save_comprehensive_data(comprehensive_data, filename)
        print(f"Data saved to: {saved_file}")

    print("\nScraping completed using external summary function!")

if __name__ == "__main__":
    main()

Enhanced Comprehensive News Scraper (Using External Summary Function)
Note: Using external get_news_summary function for content extraction and summarization

Starting comprehensive scraping...
Starting comprehensive news scraping for: swiggy
Using external get_news_summary function
Searching for news articles about: swiggy
URL: https://news.google.com/search?q=swiggy&hl=en-IN&gl=IN&ceid=IN%3Aen
--------------------------------------------------
Found Article 1: 'Not out of need, but to rebuild ties': Bengaluru techie uses Swiggy deliveries to meet people, find clients
Found Article 2: MSCI rejig: Swiggy, Mazagon Dock, two others among likely additions to India Standard Index in August rebalancing
Found Article 3: MSCI rejig: Swiggy, Mazagon Dock among 4 entrants; $850 mn inflows likely

Processing article 1/3
--------------------------------------------------
Actual URL: https://news.google.com/read/CBMijAJBVV95cUxNTnBWX19WVVUwSDBOQm10enpoMi1OUmpReXBGR2FtbGtmYkQxX2F1TkVwcl9iWXVuN3YyOS

In [28]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import json
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

class ComprehensiveNewsScraper:
    def __init__(self, headless=True):
        """Prepare the Chrome options and HTTP headers used by the scraper.

        Args:
            headless (bool): When True, the ``--headless`` flag is added to
                the Chrome options before the other flags.
        """
        browser_flags = ["--headless"] if headless else []
        browser_flags += [
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        ]

        self.chrome_options = Options()
        for flag in browser_flags:
            self.chrome_options.add_argument(flag)

        # Browser-like headers passed to every `requests.get` call in this class.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def scrape_google_news_articles(self, company_name, max_articles=5):
        """
        Scrape the Google News search page for articles about a company.

        The search page is fetched with ``requests`` and parsed with
        BeautifulSoup. Every field that cannot be located in a result card
        keeps its "... not found" placeholder string.

        Args:
            company_name (str): Name of the company to search for
            max_articles (int): Maximum number of articles to scrape

        Returns:
            list: One dict per article with the keys 'google_news_url',
            'date', 'author', 'source_title', 'source_image_url',
            'content_image_url', 'text_content' and 'article_title'.
            An empty list is returned when the request fails, parsing fails,
            or no article containers are found.
        """
        google_news_root = "https://news.google.com/"
        # The fallback text preview stops growing once it exceeds this length.
        preview_limit = 200

        # Construct the Google News search URL
        base_url = "https://news.google.com/search"
        params = {
            'q': company_name,
            'hl': 'en-IN',
            'gl': 'IN',
            'ceid': 'IN:en'
        }

        search_url = f"{base_url}?{urllib.parse.urlencode(params)}"

        try:
            print(f"Searching for news articles about: {company_name}")
            print(f"URL: {search_url}")
            print("-" * 50)

            # Make the request
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all article containers
            article_containers = soup.find_all('article') or soup.find_all(class_="xrnccd")

            if not article_containers:
                # Fallback: climb from each headline link to the nearest
                # article-like ancestor (at most 10 levels up). Links without
                # such an ancestor are skipped instead of contributing an
                # arbitrary outer element, and the walk stops at the document
                # root instead of calling find_parent() on None.
                article_containers = []
                for element in soup.find_all(class_="WwrzSb"):
                    container = element.find_parent()
                    for _ in range(10):
                        if container is None:
                            break
                        if container.name == 'article' or 'article' in str(container.get('class', [])):
                            article_containers.append(container)
                            break
                        container = container.find_parent()

            if not article_containers:
                print("No article containers found. The page structure might have changed.")
                return []

            # Extract basic data from articles
            articles_data = []
            for i, container in enumerate(article_containers[:max_articles]):

                article_data = {
                    'google_news_url': 'URL not found',
                    'date': 'Date not found',
                    'author': 'Author not found',
                    'source_title': 'Source not found',
                    'source_image_url': 'Image not found',
                    'content_image_url': 'Content image not found',
                    'text_content': 'Content not found',
                    'article_title': 'Title not found'
                }

                # Extract URL; relative links are resolved against the site root
                url_element = container.find(class_="WwrzSb") or container.find(class_="JtKRv")
                href = url_element.get('href') if url_element else None
                if href:
                    article_data['google_news_url'] = urllib.parse.urljoin(google_news_root, href)

                # Extract source information
                source_container = container.find(class_="oovtQ")
                if source_container:
                    img_element = source_container.find('img')
                    if img_element:
                        img_src = img_element.get('src') or img_element.get('data-src')
                        if img_src:
                            article_data['source_image_url'] = img_src

                    source_text = source_container.get_text(strip=True)
                    if source_text:
                        article_data['source_title'] = source_text

                # Extract content image. A CSS selector matches elements that
                # carry both classes regardless of order or extra classes,
                # whereas class_="Quavad vwBmvb" only matches that exact string.
                content_image_element = container.select_one('.Quavad.vwBmvb')
                if content_image_element:
                    img_tag = content_image_element.find('img')
                    if img_tag:
                        img_src = (img_tag.get('src') or
                                   img_tag.get('data-src') or
                                   img_tag.get('data-lazy-src') or
                                   img_tag.get('srcset', '').split(',')[0].strip().split(' ')[0])

                        if img_src:
                            if img_src.startswith('http'):
                                article_data['content_image_url'] = img_src
                            elif img_src.startswith('//'):
                                article_data['content_image_url'] = f"https:{img_src}"
                            elif img_src.startswith(('./', '/')):
                                article_data['content_image_url'] = urllib.parse.urljoin(google_news_root, img_src)

                # Extract date and author
                metadata_container = container.find(class_="UOVeFe")
                if metadata_container:
                    date_element = metadata_container.find(class_="hvbAAd")
                    if date_element:
                        article_data['date'] = date_element.get_text(strip=True)

                    author_element = metadata_container.find(class_="bInasb")
                    if author_element:
                        article_data['author'] = author_element.get_text(strip=True)

                # Extract article title
                title_element = (container.find('h3') or
                                 container.find('h4') or
                                 container.find(class_="JtKRv") or
                                 container.find(class_="mCBkyc"))
                if title_element:
                    article_data['article_title'] = title_element.get_text(strip=True)

                # Extract text content/summary
                content_selectors = [
                    'div[class*="snippet"]',
                    'div[class*="summary"]',
                    'div[class*="description"]',
                    '.st',
                    'span[class*="snippet"]'
                ]

                for selector in content_selectors:
                    content_element = container.select_one(selector)
                    if content_element:
                        article_data['text_content'] = content_element.get_text(strip=True)
                        break

                if article_data['text_content'] == 'Content not found':
                    # No snippet element: use the leading words of the card's
                    # text, adding words until the preview exceeds the limit.
                    all_text = container.get_text(separator=' ', strip=True)
                    content_parts = []
                    preview_length = 0
                    for part in all_text.split():
                        if preview_length > preview_limit:
                            break
                        # +1 accounts for the joining space after the first word
                        preview_length += len(part) + (1 if content_parts else 0)
                        content_parts.append(part)

                    if content_parts:
                        article_data['text_content'] = ' '.join(content_parts)

                articles_data.append(article_data)

                print(f"Found Article {i+1}: {article_data['article_title']}")

            return articles_data

        except requests.RequestException as e:
            print(f"Error making request: {e}")
            return []
        except Exception as e:
            # Best-effort scraper: any parsing problem yields an empty result.
            print(f"Error parsing content: {e}")
            return []

    def get_redirect_url(self, google_news_url):
        """Follow HTTP redirects from a Google News link and return the final URL.

        Args:
            google_news_url (str): The Google News article link.

        Returns:
            str: ``response.url`` after redirects, or the input URL unchanged
            when the request fails.
        """
        try:
            response = requests.get(google_news_url, headers=self.headers, allow_redirects=True, timeout=10)
            return response.url
        except requests.RequestException:
            # Best effort: keep the original link on network/URL errors, but
            # do not swallow KeyboardInterrupt/SystemExit like a bare except.
            return google_news_url

    def extract_detailed_article_data(self, url, include_full_content=True):
        """Load an article in Chrome via Selenium and extract its detailed data.

        Args:
            url (str): Address of the article page to open.
            include_full_content (bool): Forwarded to
                ``_extract_all_detailed_data``.

        Returns:
            The value produced by ``_extract_all_detailed_data``, or None when
            any step after the browser was started raised an exception.
        """
        browser = webdriver.Chrome(options=self.chrome_options)

        def page_is_loaded(active_driver):
            # The document reports "complete" once the initial load finished.
            return active_driver.execute_script("return document.readyState") == "complete"

        try:
            print(f"Extracting detailed content from: {url}")

            browser.get(url)
            WebDriverWait(browser, 10).until(page_is_loaded)
            # Fixed extra pause after the ready-state wait.
            time.sleep(3)

            # Address the browser ended up on, then the rendered markup.
            landed_url = browser.current_url
            page = BeautifulSoup(browser.page_source, 'html.parser')

            return self._extract_all_detailed_data(page, landed_url, include_full_content)

        except Exception as exc:
            print(f"Error extracting detailed article data: {exc!s}")
            return None
        finally:
            # Always shut the browser down, on success and on failure.
            browser.quit()

    def _extract_all_detailed_data(self, soup, url, include_full_content=True):
        """Collect every metadata field, a summary and optionally the body text.

        Args:
            soup: Parsed page (BeautifulSoup document).
            url (str): Final URL of the page, stored as ``final_url``.
            include_full_content (bool): When true, the extracted body text is
                added under ``full_content``.

        Returns:
            dict: Keys ``final_url``, ``scraped_at``, ``detailed_title``,
            ``meta_description``, ``detailed_author``,
            ``detailed_publish_date``, ``keywords``, ``category``,
            ``detailed_source``, ``main_image_url``, ``content_summary`` and,
            if requested, ``full_content``.
        """
        # Read all metadata BEFORE extracting the body text.
        # _extract_content() decomposes <header>, <nav>, <footer> and <aside>
        # elements from this same soup, so running it first would hide
        # headlines, bylines, dates or categories that live inside an article
        # <header> from the selector-based extractors below.
        data = {
            'final_url': url,
            'scraped_at': datetime.now().isoformat(),
            'detailed_title': self._extract_title(soup),
            'meta_description': self._extract_meta_description(soup),
            'detailed_author': self._extract_author(soup),
            'detailed_publish_date': self._extract_publish_date(soup),
            'keywords': self._extract_keywords(soup),
            'category': self._extract_category(soup),
            'detailed_source': self._extract_source(soup),
            'main_image_url': self._extract_main_image(soup),
            'content_summary': None
        }

        # The body text is always extracted because the summary is built from it.
        full_content = self._extract_content(soup)
        if full_content:
            data['content_summary'] = self._generate_summary(full_content)

        # Only include full content if requested
        if include_full_content:
            data['full_content'] = full_content

        return data

    def _extract_title(self, soup):
        """Return the article title from the first selector that yields text.

        Selectors are tried in priority order. ``meta`` selectors read the
        ``content`` attribute, all others read the element text. A match whose
        value is empty is skipped so later fallbacks (ultimately ``<title>``)
        still get a chance instead of an empty string being returned.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped title, or ``"Title not found"``.
        """
        selectors = [
            'h1',
            '[data-testid="headline"]',
            '.article-title',
            '.entry-title',
            'meta[property="og:title"]',
            'title'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content') or ''
            else:
                value = element.get_text()
            value = value.strip()
            if value:
                return value

        return "Title not found"

    def _extract_meta_description(self, soup):
        """Return the page description taken from its meta tags.

        The standard, Open Graph and Twitter description tags are checked in
        that order; the first one carrying a non-empty ``content`` attribute
        wins.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped description, or ``"Meta description not found"``.
        """
        candidates = (
            'meta[name="description"]',
            'meta[property="og:description"]',
            'meta[name="twitter:description"]',
        )

        for css in candidates:
            tag = soup.select_one(css)
            if not tag:
                continue
            description = tag.get('content')
            if description:
                return description.strip()

        return "Meta description not found"

    def _extract_author(self, soup):
        """Return the article author from the first selector that yields text.

        ``meta`` selectors read the ``content`` attribute, the remaining
        selectors read the element text. Matches with an empty value are
        skipped so the next selector is tried rather than returning ``''``.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped author string, or ``"Author not found"``.
        """
        author_selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            '.author',
            '.byline',
            '[rel="author"]',
            '.article-author'
        ]

        for selector in author_selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content') or ''
            else:
                value = element.get_text()
            value = value.strip()
            if value:
                return value

        return "Author not found"

    def _extract_publish_date(self, soup):
        """Return the publish date string from the first populated selector.

        ``meta`` selectors read ``content``; a ``<time>`` element prefers its
        ``datetime`` attribute and falls back to its text; anything else uses
        the element text. Empty values are skipped so later selectors (or the
        sentinel) are used instead of returning ``''``. The value is returned
        as found on the page; no date parsing is performed.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped date string, or ``"Publish date not found"``.
        """
        date_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publish_date"]',
            'time[datetime]',
            '.publish-date',
            '.article-date'
        ]

        for selector in date_selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content') or ''
            elif element.name == 'time':
                # An empty datetime attribute falls back to the visible text.
                value = element.get('datetime') or element.get_text()
            else:
                value = element.get_text()
            value = value.strip()
            if value:
                return value

        return "Publish date not found"

    def _extract_content(self, soup):
        """Extract the main article text as paragraphs joined by blank lines.

        NOTE: this mutates ``soup`` in place: script, style, nav, footer,
        header and aside elements are removed from the tree before the text
        is read.

        Candidate containers are tried in order; the first one that yields at
        least one text block longer than 50 characters wins. If none does,
        every ``<p>`` in the document is considered instead.

        Args:
            soup: Parsed page (BeautifulSoup document); modified by this call.

        Returns:
            str: The extracted text, or ``"Content not found"``.
        """
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        def is_advertisement(block):
            # Check the full class list and the ancestors: testing class values
            # one at a time lets multi-class ad blocks and the paragraphs
            # nested inside an ad container slip through.
            if 'advertisement' in (block.get('class') or []):
                return True
            return block.find_parent(class_='advertisement') is not None

        def collect_text(blocks):
            texts = []
            for block in blocks:
                if is_advertisement(block):
                    continue
                # A <div> that wraps further <p>/<div> blocks is only a
                # container; its text is picked up from those nested blocks.
                # Taking it as well would duplicate every paragraph.
                if block.name == 'div' and block.find(['p', 'div']) is not None:
                    continue
                text = block.get_text().strip()
                if len(text) > 50:
                    texts.append(text)
            return texts

        content_selectors = [
            'article',
            '.article-content',
            '.entry-content',
            '.post-content',
            '.content',
            'main',
            '[data-testid="article-content"]'
        ]

        for selector in content_selectors:
            container = soup.select_one(selector)
            if container is None:
                continue
            content_text = collect_text(container.find_all(['p', 'div']))
            if content_text:
                return '\n\n'.join(content_text)

        # Fallback: get all paragraphs
        content = collect_text(soup.find_all('p'))
        return '\n\n'.join(content) if content else "Content not found"

    def _extract_keywords(self, soup):
        """Return the article keywords/tags as a comma-separated string.

        For ``meta`` selectors the ``content`` of every matching tag is used,
        so pages that repeat ``article:tag`` once per tag contribute all of
        them. For the class selectors the text of nested ``<a>`` and
        ``<span>`` elements is used. Empty values are dropped, and a selector
        that yields nothing falls through to the next one.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: Values joined with ``", "``, or ``"Keywords not found"``.
        """
        keyword_selectors = [
            'meta[name="keywords"]',
            'meta[property="article:tag"]',
            '.tags',
            '.keywords'
        ]

        for selector in keyword_selectors:
            if selector.startswith('meta'):
                raw_values = [(el.get('content') or '') for el in soup.select(selector)]
            else:
                raw_values = [
                    el.get_text()
                    for el in soup.select(selector + ' a, ' + selector + ' span')
                ]
            values = [value.strip() for value in raw_values if value.strip()]
            if values:
                return ', '.join(values)

        return "Keywords not found"

    def _extract_category(self, soup):
        """Return the article category/section from the first populated selector.

        ``meta`` selectors read ``content``, ``[data-category]`` reads that
        attribute, and the class selectors read the element text. Empty values
        are skipped so the next selector is tried instead of returning ``''``.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped category, or ``"Category not found"``.
        """
        category_selectors = [
            'meta[property="article:section"]',
            '.category',
            '.section',
            '[data-category]'
        ]

        for selector in category_selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content') or ''
            elif selector == '[data-category]':
                value = element.get('data-category') or ''
            else:
                value = element.get_text()
            value = value.strip()
            if value:
                return value

        return "Category not found"

    def _extract_source(self, soup):
        """Return the source/publisher name from the first populated selector.

        ``meta`` selectors read the ``content`` attribute, the class selectors
        read the element text. Empty values are skipped so the next selector
        is tried instead of returning ``''``.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped source name, or ``"Source not found"``.
        """
        source_selectors = [
            'meta[property="og:site_name"]',
            'meta[name="publisher"]',
            '.source',
            '.publisher'
        ]

        for selector in source_selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content') or ''
            else:
                value = element.get_text()
            value = value.strip()
            if value:
                return value

        return "Source not found"

    def _extract_main_image(self, soup):
        """Return the URL of the main article image from the first populated selector.

        ``meta`` selectors read the ``content`` attribute, ``img`` selectors
        read ``src``. Matches without a usable value are skipped so the next
        selector is tried instead of returning ``''``. The URL is returned as
        written in the page; relative URLs are not resolved.

        Args:
            soup: Parsed page (BeautifulSoup document).

        Returns:
            str: The stripped image URL, or ``"Main image not found"``.
        """
        image_selectors = [
            'meta[property="og:image"]',
            'meta[name="twitter:image"]',
            'article img',
            '.article-image img',
            '.featured-image img'
        ]

        for selector in image_selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            attribute = 'content' if selector.startswith('meta') else 'src'
            value = (element.get(attribute) or '').strip()
            if value:
                return value

        return "Main image not found"

    def _generate_summary(self, content, max_sentences=3):
        """Generate a simple extractive summary"""
        if not content or len(content) < 100:
            return "Content too short for summary"
        
        # Split into sentences
        sentences = re.split(r'[.!?]+', content)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
        
        if len(sentences) <= max_sentences:
            return '. '.join(sentences[:max_sentences]) + '.'
        
        # Simple scoring: prefer sentences with more words
        scored_sentences = []
        for i, sentence in enumerate(sentences[:10]):
            score = len(sentence.split())
            if any(word in sentence.lower() for word in ['said', 'according', 'reported', 'announced']):
                score += 5
            scored_sentences.append((score, sentence, i))
        
        # Sort by score and take top sentences, maintaining original order
        top_sentences = sorted(scored_sentences, key=lambda x: x[0], reverse=True)[:max_sentences]
        top_sentences = sorted(top_sentences, key=lambda x: x[2])
        
        summary = '. '.join([s[1] for s in top_sentences])
        return summary + '.' if not summary.endswith('.') else summary

    def scrape_comprehensive_news(self, company_name, max_articles=3, extract_full_content=True):
        """
        Main method to scrape comprehensive news data

        Fetches the basic article list via ``scrape_google_news_articles``,
        then, for every article that has a Google News URL, resolves the
        redirect and extracts the detailed page data. Failures for a single
        article are reported and recorded (``extraction_success`` stays
        False) without aborting the run.

        Args:
            company_name (str): Company name to search for
            max_articles (int): Maximum number of articles to process
            extract_full_content (bool): Whether to extract and include full article content in JSON

        Returns:
            dict: Comprehensive news data with the keys ``search_query``,
            ``scraped_at``, ``total_articles_found``,
            ``successful_extractions``, ``full_content_included`` and
            ``articles``. The same keys are present when nothing was found.
        """
        print(f"Starting comprehensive news scraping for: {company_name}")
        print(f"Extract full content: {extract_full_content}")
        print("="*80)

        # Step 1: Get basic articles from Google News
        basic_articles = self.scrape_google_news_articles(company_name, max_articles)

        if not basic_articles:
            # Keep the same schema as the normal result: print_summary() reads
            # 'successful_extractions' and would raise KeyError without it.
            return {
                'search_query': company_name,
                'scraped_at': datetime.now().isoformat(),
                'total_articles_found': 0,
                'successful_extractions': 0,
                'full_content_included': extract_full_content,
                'articles': []
            }

        # Step 2: Extract detailed content for each article
        comprehensive_articles = []
        total = len(basic_articles)

        for i, basic_article in enumerate(basic_articles):
            print(f"\nProcessing article {i+1}/{total}")
            print("-"*50)

            # Merge basic data
            comprehensive_article = {
                'article_id': i + 1,
                'google_news_data': basic_article,
                'detailed_data': None,
                'extraction_success': False
            }

            if basic_article['google_news_url'] != 'URL not found':
                # Get the actual article URL
                try:
                    actual_url = self.get_redirect_url(basic_article['google_news_url'])
                    print(f"Actual URL: {actual_url}")

                    # Extract detailed content (always extracts metadata and summary)
                    detailed_data = self.extract_detailed_article_data(actual_url, extract_full_content)

                    if detailed_data:
                        comprehensive_article['detailed_data'] = detailed_data
                        comprehensive_article['extraction_success'] = True
                        print("✓ Successfully extracted detailed content")
                        if not extract_full_content:
                            print("✓ Summary and metadata extracted (full content excluded)")
                    else:
                        print("✗ Failed to extract detailed content")

                except Exception as e:
                    # One bad article must not abort the whole run.
                    print(f"✗ Error processing article: {str(e)}")

            comprehensive_articles.append(comprehensive_article)

            # Add delay between requests; nothing follows the last article,
            # so waiting there would only slow the run down.
            if i < total - 1:
                time.sleep(2)

        # Compile final result
        result = {
            'search_query': company_name,
            'scraped_at': datetime.now().isoformat(),
            'total_articles_found': len(comprehensive_articles),
            'successful_extractions': sum(1 for article in comprehensive_articles if article['extraction_success']),
            'full_content_included': extract_full_content,
            'articles': comprehensive_articles
        }

        return result

    def save_comprehensive_data(self, data, filename=None):
        """Save comprehensive data to JSON file"""
        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            content_type = "full" if data.get('full_content_included', True) else "summary"
            filename = f"comprehensive_news_{content_type}_{data['search_query'].replace(' ', '_')}_{timestamp}.json"
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        
        print(f"\nComprehensive news data saved to: {filename}")
        return filename

    def print_summary(self, data):
        """Print a summary of the scraped data"""
        print("\n" + "="*100)
        print("COMPREHENSIVE NEWS SCRAPING SUMMARY")
        print("="*100)
        print(f"Search Query: {data['search_query']}")
        print(f"Scraped At: {data['scraped_at']}")
        print(f"Total Articles Found: {data['total_articles_found']}")
        print(f"Successful Detailed Extractions: {data['successful_extractions']}")
        print(f"Full Content Included: {'Yes' if data.get('full_content_included', True) else 'No (Summary only)'}")
        print("-"*100)
        
        for article in data['articles']:
            print(f"\nArticle {article['article_id']}:")
            print(f"  Title: {article['google_news_data']['article_title']}")
            print(f"  Source: {article['google_news_data']['source_title']}")
            print(f"  Date: {article['google_news_data']['date']}")
            print(f"  Detailed Extraction: {'✓' if article['extraction_success'] else '✗'}")
            
            if article['detailed_data']:
                if 'full_content' in article['detailed_data']:
                    print(f"  Full Content Length: {len(article['detailed_data']['full_content'])} characters")
                else:
                    print(f"  Full Content: Not included (summary-only mode)")
                
                if article['detailed_data']['content_summary']:
                    summary_preview = article['detailed_data']['content_summary'][:100]
                    print(f"  Summary: {summary_preview}...")
                
                if article['detailed_data']['meta_description'] != "Meta description not found":
                    meta_preview = article['detailed_data']['meta_description'][:80]
                    print(f"  Meta Description: {meta_preview}...")
        
        print("="*100)

def main():
    """Run the scraper interactively from the console.

    Prompts for a company name, an article limit (falls back to 3 when the
    answer is not an integer) and whether the full article text should be
    kept. It then scrapes, prints the summary and optionally saves the
    result to a JSON file.
    """
    print("Enhanced Comprehensive News Scraper")
    print("="*50)

    # A search term is mandatory; bail out early without one.
    company_name = input("Enter the company name to search for: ").strip()
    if not company_name:
        print("Please enter a valid company name.")
        return

    try:
        max_articles = int(input("Enter max number of articles to process (default: 3): ").strip() or "3")
    except ValueError:
        max_articles = 3

    # Only an explicit "y" turns full-content export on.
    answer = input("Include full article content in JSON? (y/n, default: n): ").strip().lower()
    extract_full_content = (answer == 'y')
    if not extract_full_content:
        print("Note: Will extract summary, meta description, and all metadata but exclude full content from JSON")

    scraper = ComprehensiveNewsScraper(headless=True)

    print("\nStarting comprehensive scraping...")
    comprehensive_data = scraper.scrape_comprehensive_news(
        company_name,
        max_articles=max_articles,
        extract_full_content=extract_full_content,
    )

    scraper.print_summary(comprehensive_data)

    # Saving is optional; an empty filename lets the scraper choose one.
    if input("\nSave results to JSON file? (y/n): ").strip().lower() == 'y':
        chosen_name = input("Enter filename (press Enter for default): ").strip()
        saved_file = scraper.save_comprehensive_data(comprehensive_data, chosen_name or None)
        print(f"Data saved to: {saved_file}")

    print("\nScraping completed!")

# Start the interactive scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enhanced Comprehensive News Scraper
Note: Will extract summary, meta description, and all metadata but exclude full content from JSON

Starting comprehensive scraping...
Starting comprehensive news scraping for: swiggy
Extract full content: False
Searching for news articles about: swiggy
URL: https://news.google.com/search?q=swiggy&hl=en-IN&gl=IN&ceid=IN%3Aen
--------------------------------------------------
Found Article 1: 'Not out of need, but to rebuild ties': Bengaluru techie uses Swiggy deliveries to meet people, find clients
Found Article 2: MSCI rejig: Swiggy, Mazagon Dock, two others among likely additions to India Standard Index in August rebalancing
Found Article 3: MSCI rejig: Swiggy, Mazagon Dock among 4 entrants; $850 mn inflows likely

Processing article 1/3
--------------------------------------------------
Actual URL: https://www.google.com/sorry/index?continue=https://news.google.com/read/CBMijAJBVV95cUxNTnBWX19WVVUwSDBOQm10enpoMi1OUmpReXBGR2FtbGtmYkQxX2F1TkVwcl9iWXV