In [1]:
# Web Scrape for Top-10 Stock Events
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import numpy as np 

# Read the event dataset, parse the dates, and discard rows with a missing stock name
stock_events_df = (
    pd.read_csv("filtered_and_named_stock_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .dropna(subset=["Stock"])
)

# Also filter out names whose text form is the literal "nan" (case-insensitive)
has_real_name = stock_events_df["Stock"].astype(str).str.lower().ne("nan")
stock_events_df = stock_events_df[has_real_name]

# Rank all rows by volatility (highest first), keep only the first row seen
# per stock, and take the ten rows at the top of that ranking
ranked_events = stock_events_df.sort_values(by="Volatility", ascending=False)
top_10 = ranked_events.drop_duplicates(subset=["Stock"]).head(10)

# function to scrape Bing engine
def scrape_bing(stock_name, event_date, max_results=10, timeout=10, delay=2):
    """Scrape Bing News search results for a stock around an event date.

    Parameters
    ----------
    stock_name : str
        Name placed in the search query and copied into every record.
    event_date : datetime-like
        Any object exposing ``strftime``; formatted as YYYY-MM-DD for both
        the search query and the returned records.
    max_results : int, default 10
        Maximum number of result cards to keep.
    timeout : float, default 10
        Seconds to wait for the HTTP request before giving up.
    delay : float, default 2
        Seconds to pause after each request attempt, so that back-to-back
        calls are spaced out.

    Returns
    -------
    list of dict
        One dict per result card with the keys "Stock", "Date", "Headline",
        "URL" and "Snippet". Missing pieces are filled with the placeholders
        "No Title", "No URL" and "No Snippet". The list is empty when the
        request raises, the status code is not 200, or no card is found.
    """
    date_str = event_date.strftime("%Y-%m-%d")
    search_query = f"{stock_name} stock news {date_str}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }

    try:
        # Hand the query to `params` so requests URL-encodes it; names that
        # contain characters such as "&" would otherwise split the query string.
        response = requests.get(
            "https://www.bing.com/news/search",
            params={"q": search_query},
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # A single failed lookup should not abort the caller's whole loop.
        print(f"Request ERROR for {stock_name} ({date_str}): {exc}")
        response = None

    # Pause after every attempt (failures included) to space out requests.
    time.sleep(delay)

    if response is None:
        return []
    if response.status_code != 200:
        print(f"Request ERROR for {stock_name} ({date_str}): HTTP {response.status_code}")
        return []

    # Parse the page and collect the result cards
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", class_="news-card")

    news_data = []
    for card in cards[:max_results]:
        # Look each element up once and reuse it for every field it feeds.
        link = card.find("a")
        snippet_tag = card.find("div", class_="snippet")

        news_data.append({
            "Stock": stock_name,
            "Date": date_str,
            "Headline": link.text if link is not None else "No Title",
            # .get() avoids a KeyError when the anchor carries no href.
            "URL": link.get("href", "No URL") if link is not None else "No URL",
            "Snippet": snippet_tag.text if snippet_tag is not None else "No Snippet",
        })

    return news_data

# scrape for each stock in the top-10 using our scrape_bing function
all_news = []
# Only the Stock and Date columns are needed, so iterate over those directly
for stock_name, event_date in zip(top_10["Stock"], top_10["Date"]):
    print(f"Scraping for: {stock_name} ({event_date.strftime('%Y-%m-%d')})")
    all_news.extend(scrape_bing(stock_name, event_date))

# Pin the column set so that a run which scrapes nothing still produces a CSV
# with a header row; a frame built from an empty list would have no columns.
news_columns = ["Stock", "Date", "Headline", "URL", "Snippet"]
news_df = pd.DataFrame(all_news, columns=news_columns)

if news_df.empty:
    print("WARNING: no articles were scraped; the CSV will contain only the header row")

# save as csv
news_df.to_csv("scraped_stock_news.csv", index=False)

print("Saved as: scraped_stock_news.csv")

Scraping for: Caesars Entertainment (2020-04-13)
Scraping for: APA Corporation (2020-04-17)
Scraping for: Carnival Corporation (2020-04-15)
Scraping for: Occidental Petroleum (2020-04-17)
Scraping for: Diamondback Energy (2020-04-17)
Scraping for: Devon Energy (2020-04-17)
Scraping for: United Airlines (2020-04-16)
Scraping for: Darden Restaurants (2020-04-20)
Scraping for: Halliburton (2020-04-17)
Scraping for: Ventas (2020-04-20)
Saved as: scraped_stock_news.csv


In [2]:
# Write a second copy to the user's Desktop with the encoding pinned to UTF-8
import os

# Build the destination in two steps: home/Desktop, then the file name
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_dir, "scraped_stocks_news.csv")

news_df.to_csv(file_path, encoding="utf-8", sep=",", index=False)

print(f"Saved as: {file_path}")

Saved as: C:\Users\Garrett\Desktop\scraped_stocks_news.csv
