In [2]:
import pandas as pd
from newspaper import Article, Config
from GoogleNews import GoogleNews
from datetime import datetime, timedelta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time

# Fetch the NLTK 'vader_lexicon' resource (used with the VADER analyzer imported above)
nltk.download('vader_lexicon')

# Search window: the 180 days (roughly six months) ending today,
# rather than everything since 2000
end_date = datetime.today().date()
start_date = end_date - timedelta(days=180)

# Settings handed to newspaper's Article: a desktop-browser user agent,
# a request timeout of 30, and image fetching switched off
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
config = Config()
config.fetch_images = False
config.request_timeout = 30
config.browser_user_agent = user_agent

# Lower-case name fragments matched against each result's 'media' value
finance_sources = [
    "reuters",
    "bloomberg",
    "cnbc",
    "yahoo",
    "fox business",
    "marketwatch",
    "wall street journal",
    "forbes",
    "business insider",
]

def get_news_sentiment(topic="mortgage rates", max_articles=10):
    """Fetch news about *topic* and score its sentiment with VADER.

    The pipeline is: search Google News over the module-level
    ``start_date``..``end_date`` window (retrying with ``period='1m'`` when
    that returns nothing), keep results whose 'media' value matches one of
    ``finance_sources`` (or all results when none match), download up to
    *max_articles* of them with newspaper, score the first 500 characters
    of each article, and average the scores per week.

    Args:
        topic: Search phrase passed to GoogleNews.
        max_articles: Upper bound on the number of search results that are
            downloaded and parsed (each download is followed by a 1 s pause).

    Returns:
        A DataFrame indexed by ``Week`` with columns ``NewsSentiment``,
        ``NewsPos``, ``NewsNeg``, ``NewsNeu`` and ``NewsCount`` when the
        weekly aggregation succeeds; the per-article DataFrame when it does
        not; an empty DataFrame when no article could be retrieved.
    """
    results = _search_news(topic)
    if not results:
        return pd.DataFrame()

    # Convert to dataframe
    print(f"Found {len(results)} results, converting to dataframe")
    df = pd.DataFrame(results)
    if df.empty:
        print("Empty dataframe after converting results")
        return pd.DataFrame()

    # Show available columns and sample data
    print(f"Columns in results: {df.columns.tolist()}")
    print(f"Sample data: {df.head(1).to_dict()}")

    filtered_df = _filter_finance_sources(df)

    records = _collect_article_records(filtered_df, max_articles)
    if not records:
        print("No articles were successfully processed")
        return pd.DataFrame()

    # Create dataframe
    print(f"Creating dataframe with {len(records)} processed articles")
    news_df = _add_week_column(pd.DataFrame(records))
    news_df = _score_sentiment(news_df)

    print("Sentiment analysis complete. Raw data sample:")
    print(news_df[['Date', 'Title', 'Compound']].head())

    # Nothing to aggregate if every row was dropped for lacking a date
    if news_df.empty:
        return news_df

    try:
        print("Aggregating by week...")
        weekly = news_df.groupby("Week").agg(
            NewsSentiment=("Compound", "mean"),
            NewsPos=("Pos", "mean"),
            NewsNeg=("Neg", "mean"),
            NewsNeu=("Neu", "mean"),
            NewsCount=("Compound", "count"),
        )
    except Exception as e:
        # Deliberate best-effort: callers still get the per-article rows
        print(f"Error in weekly aggregation: {str(e)}")
        print("Returning raw data instead")
        return news_df

    print(f"Weekly aggregation complete with {len(weekly)} weeks of data")
    return weekly


def _search_news(topic):
    """Return Google News results for *topic*, or an empty list if none."""
    # Format dates for GoogleNews
    start_date_str = start_date.strftime('%m/%d/%Y')
    end_date_str = end_date.strftime('%m/%d/%Y')

    print(f"Searching for '{topic}' news from {start_date_str} to {end_date_str}")

    googlenews = GoogleNews(lang='en', start=start_date_str, end=end_date_str)
    googlenews.search(topic)

    print("Fetching results...")
    results = googlenews.result()
    if results:
        return results

    print("No results found from Google News")
    # Second attempt: drop the explicit window and ask for the last month
    googlenews = GoogleNews(lang='en', period='1m')
    googlenews.search(topic)
    results = googlenews.result()
    if not results:
        print("Still no results with shorter date range")
    return results or []


def _filter_finance_sources(df):
    """Keep rows whose 'media' matches a finance source; fall back to *df*."""
    if 'media' not in df.columns:
        print("No 'media' column found, skipping source filtering")
        return df

    print("Filtering by finance sources...")
    # str() handles NaN/None/non-string values, so no .str accessor is
    # needed (it raises AttributeError on non-string columns).
    mask = df['media'].map(
        lambda value: any(source in str(value).lower() for source in finance_sources)
    )
    filtered = df[mask]

    if filtered.empty:
        print("No articles from finance sources. Using all sources instead.")
        return df  # Use all results if no finance sources match
    return filtered


def _collect_article_records(articles_df, max_articles):
    """Download up to *max_articles* rows of *articles_df* into record dicts."""
    records = []
    if 'link' not in articles_df.columns:
        return records

    to_process = articles_df.head(max_articles)
    print(f"Processing {len(to_process)} articles...")

    for _, row in to_process.iterrows():
        link = row['link']
        if not isinstance(link, str) or not link:
            continue

        # Strip the Google tracking suffix. Bound before the try block so the
        # error message below can always refer to it.
        url = link.partition("&ved=")[0]
        print(f"Processing article: {url}")

        try:
            article = Article(url, config=config)
            article.download()
            time.sleep(1)  # Add delay to avoid rate limiting
            article.parse()
        except Exception as e:
            # Broad on purpose: one bad article (network error, parser
            # failure, ...) must not abort the whole batch.
            print(f"❌ Error processing article {url}: {str(e)}")
            continue

        text = article.text or ""
        title = article.title or ""
        if not text and not title:
            print(f"⚠️ Article had no content: {url}")
            continue

        print(f"✅ Successfully processed: {title[:50]}...")

        # Prefer the search result's 'datetime'; otherwise use the current time
        published = row.get('datetime')
        article_date = published if pd.notna(published) else datetime.now()

        records.append({
            'Date': article_date,
            'Media': row.get('media', "Unknown"),
            'Title': title,
            'Summary': text[:500],
        })

    return records


def _add_week_column(news_df):
    """Return *news_df* with a datetime 'Date' and a week-start 'Week' column."""
    try:
        if not pd.api.types.is_datetime64_any_dtype(news_df['Date']):
            news_df['Date'] = pd.to_datetime(news_df['Date'], errors='coerce')

        # .copy() so the column assignment below never writes to a view
        dated = news_df.dropna(subset=['Date']).copy()
        dated['Week'] = dated['Date'].dt.to_period('W').dt.start_time

        print(f"Date range in data: {dated['Date'].min()} to {dated['Date'].max()}")
        return dated
    except Exception as e:
        # Best-effort fallback kept from the original behaviour: stamp every
        # article with the current time so the weekly grouping still works.
        print(f"Warning: Date processing error: {str(e)}")
        print("Using current date for all articles")
        news_df['Date'] = datetime.now()
        news_df['Week'] = news_df['Date'].dt.to_period('W').dt.start_time
        return news_df


def _score_sentiment(news_df):
    """Add VADER 'Sentiment', 'Compound', 'Pos', 'Neg', 'Neu' columns."""
    print("Performing sentiment analysis...")
    sia = SentimentIntensityAnalyzer()

    news_df['Summary'] = news_df['Summary'].fillna('')
    titles = news_df['Title'].fillna('')

    # An article with a title but no body text would otherwise be scored on
    # an empty string (compound 0.0) and dilute the averages; score its title.
    texts = [
        summary if summary.strip() else title
        for summary, title in zip(news_df['Summary'], titles)
    ]
    scores = [sia.polarity_scores(text) for text in texts]

    news_df['Sentiment'] = pd.Series(scores, index=news_df.index, dtype=object)
    for column, key in (('Compound', 'compound'), ('Pos', 'pos'),
                        ('Neg', 'neg'), ('Neu', 'neu')):
        news_df[column] = pd.Series(
            [score[key] for score in scores], index=news_df.index, dtype=float
        )
    return news_df

# Kick off the pipeline for the mortgage-rate topic and report the outcome
sentiment_df = get_news_sentiment("mortgage rates")

if sentiment_df.empty:
    print("No sentiment data was retrieved.")
else:
    # Show a preview plus the overall dimensions of what came back
    print("Successfully retrieved sentiment data:")
    print(sentiment_df.head())
    print(f"Shape: {sentiment_df.shape}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cpras\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Searching for 'mortgage rates' news from 10/12/2024 to 04/10/2025
Fetching results...
Found 10 results, converting to dataframe
Columns in results: ['title', 'media', 'date', 'datetime', 'desc', 'link', 'img']
Sample data: {'title': {0: 'Weekly Mortgage Rates Rise on Tariff Announcement'}, 'media': {0: 'Northeast Mississippi Daily Journal'}, 'date': {0: '0 minutes ago'}, 'datetime': {0: Timestamp('2025-04-10 17:20:15.035960')}, 'desc': {0: 'A new round of taxes on imported goods has increased borrowing costs, linking global politics to your monthly payment.The average rate on the 30-year...'}, 'link': {0: 'https://www.djournal.com/news/national/weekly-mortgage-rates-rise-on-tariff-announcement/article_e0b9efcc-d009-53b3-bd55-5d3b8d1c3ec3.html&ved=2ahUKEwjkxdLU2s6MAxX2ETQIHfrQBQIQxfQBegQIBBAC&usg=AOvVaw39QWFJJC9WJX_zR6N-uZau'}, 'img': {0: 'data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=='}}
Filtering by finance sources...
Processing 1 articles...
Proces