In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import time

In [None]:
def scrape_wsj_news_filtered(start_date, end_date, delay=1.5, timeout=30):
    """Scrape WSJ archive pages and keep headlines that mention market keywords.

    One archive page (``https://www.wsj.com/news/archive/YYYY/MM/DD``) is
    requested for every calendar day from ``start_date`` to ``end_date``
    inclusive. For each ``<article>`` element on the page, the text of its
    first ``<h2>`` is treated as the headline and kept when it matches the
    keyword list below. Days whose request fails (network error or non-200
    status) are reported with ``print`` and skipped.

    Args:
        start_date: First day to fetch, formatted ``YYYY-MM-DD``.
        end_date: Last day to fetch (inclusive), formatted ``YYYY-MM-DD``.
        delay: Seconds to pause between consecutive requests.
        timeout: Seconds to wait for each HTTP response before giving up on
            that day.

    Returns:
        pandas.DataFrame with columns ``date`` (``YYYY-MM-DD`` string),
        ``headline`` and ``url``. The columns are present even when no
        headline was kept, so downstream merges on ``date`` still work.

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
    """
    import re

    # Define stock-market related keywords
    keywords = [
        "stock", "stocks", "market", "S&P", "Nasdaq", "Dow", "shares",
        "investors", "bond", "inflation", "recession", "rate hike",
        "earnings", "profit", "loss", "fed", "Federal Reserve", "interest rate",
        "volatility", "rally", "crash", "selloff", "bull market", "bear market",
        "guidance", "forecast", "jobs report", "layoffs", "unemployment", "bank", "treasury"
    ]

    # All-lowercase keywords keep their substring semantics against the
    # lower-cased headline (so "market" also hits "markets").
    plain_terms = [kw for kw in keywords if kw.islower()]
    # Capitalised keywords could never match a lower-cased headline. They are
    # matched case-insensitively as whole words instead, so that "Dow" does not
    # fire on "down" or "window".
    proper_nouns = [kw for kw in keywords if not kw.islower()]
    proper_noun_re = re.compile(
        r"\b(?:" + "|".join(re.escape(kw) for kw in proper_nouns) + r")\b",
        re.IGNORECASE,
    )

    def is_relevant(headline):
        """Return True when the headline mentions any configured keyword."""
        headline_lower = headline.lower()
        if any(term in headline_lower for term in plain_terms):
            return True
        return proper_noun_re.search(headline) is not None

    current_date = datetime.strptime(start_date, "%Y-%m-%d")
    stop_date = datetime.strptime(end_date, "%Y-%m-%d")
    headers = {"User-Agent": "Mozilla/5.0"}
    columns = ["date", "headline", "url"]
    all_articles = []
    first_request = True

    while current_date <= stop_date:
        day_label = current_date.strftime("%Y-%m-%d")
        url = f"https://www.wsj.com/news/archive/{current_date.strftime('%Y/%m/%d')}"
        # Advance up front so every `continue` below still makes progress.
        current_date += timedelta(days=1)

        # Pause before every request except the first, including after a
        # failed one, so error responses do not turn into rapid-fire retries.
        if not first_request:
            time.sleep(delay)
        first_request = False

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Treat a network failure like a bad status: skip the day and keep
            # the headlines collected so far.
            print(f"[{day_label}] Failed: {exc}")
            continue

        if response.status_code != 200:
            print(f"[{day_label}] Failed: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        count_kept = 0

        for article in soup.find_all("article"):
            h2 = article.find("h2")
            if not h2:
                continue
            headline = h2.get_text(strip=True)
            if not is_relevant(headline):
                continue
            anchor = h2.find("a")
            # An anchor without an href yields an empty URL instead of a KeyError.
            link = anchor.get("href", "") if anchor is not None else ""
            all_articles.append({
                "date": day_label,
                "headline": headline,
                "url": link
            })
            count_kept += 1

        print(f"[{day_label}] {count_kept} relevant headlines kept")

    return pd.DataFrame(all_articles, columns=columns)

In [None]:
def download_sp500_labels(start_date, end_date, lookahead_days=10):
    """Download S&P 500 closes and label each trading day by the next day's move.

    A day's label is 1 when the following trading day's close is strictly
    higher than that day's close, otherwise 0.

    yfinance treats ``end`` as exclusive, and the last downloaded row has no
    "next close". Passing ``end_date`` straight through would therefore leave
    the final trading days of the requested range unlabeled. To avoid that,
    prices are downloaded for ``lookahead_days`` extra calendar days and the
    result is then trimmed back to ``end_date`` (inclusive).

    Args:
        start_date: First day to download, e.g. ``"2023-06-01"``.
        end_date: Last day to label (inclusive), e.g. ``"2023-06-30"``.
        lookahead_days: Extra calendar days fetched past ``end_date`` so the
            last requested trading day has a following close to compare with.

    Returns:
        pandas.DataFrame with columns ``date`` (``YYYY-MM-DD`` string) and
        ``Label`` (int 0/1), one row per trading day that has a next close.

    Raises:
        KeyError: If the downloaded frame has neither a 'Close' nor an
            'Adj Close' column.
    """
    import yfinance as yf
    import pandas as pd

    end_ts = pd.Timestamp(end_date)
    if end_ts.tz is not None:
        end_ts = end_ts.tz_localize(None)
    end_ts = end_ts.normalize()
    fetch_end = (end_ts + pd.Timedelta(days=lookahead_days)).strftime("%Y-%m-%d")

    # Download S&P 500 data, including the look-ahead window
    df = yf.download("^GSPC", start=start_date, end=fetch_end, auto_adjust=True)

    # Use 'Close' or fallback to 'Adj Close'
    if "Close" in df.columns:
        close_series = df["Close"]
    elif "Adj Close" in df.columns:
        close_series = df["Adj Close"]
    else:
        raise KeyError("Neither 'Close' nor 'Adj Close' found in data")

    # Compare against timezone-naive dates so the end_date filter is well-defined.
    trading_days = pd.DatetimeIndex(pd.to_datetime(close_series.index))
    if trading_days.tz is not None:
        trading_days = trading_days.tz_localize(None)

    # Flatten so a single-column frame (as well as a Series) becomes 1-D.
    data = pd.DataFrame({
        "Date": trading_days,
        "Close": close_series.to_numpy().flatten(),
        "Next_Close": close_series.shift(-1).to_numpy().flatten(),
    })

    # Drop rows without a next close, then trim the look-ahead window away.
    data = data.dropna(subset=["Close", "Next_Close"])
    data = data.loc[data["Date"].dt.normalize() <= end_ts].copy()

    # Generate binary labels and the string date used as the merge key
    data["Label"] = (data["Next_Close"] > data["Close"]).astype(int)
    data["date"] = data["Date"].dt.strftime("%Y-%m-%d")

    return data[["date", "Label"]].reset_index(drop=True)

In [None]:
def merge_news_and_labels(news_df, stock_df):
    """Attach a market label to each headline by joining on the ``date`` column.

    The join uses pandas' default merge behaviour (inner join), so a row is
    kept only when its ``date`` value appears in both frames.

    Args:
        news_df: DataFrame of headlines containing a ``date`` column.
        stock_df: DataFrame of labels containing a ``date`` column.

    Returns:
        The merged pandas.DataFrame.
    """
    return news_df.merge(stock_df, on="date")

In [None]:
import os
# Every CSV produced below is written into ./data
os.makedirs("data", exist_ok=True)

# Small date range, kept short for testing
start_date, end_date = "2023-06-01", "2023-06-30"

# Step 1: scrape the keyword-filtered WSJ headlines and persist them
news_df = scrape_wsj_news_filtered(start_date=start_date, end_date=end_date)
news_df.to_csv("data/wsj_news.csv", index=False)
print(f"\n Saved {len(news_df)} filtered headlines to data/wsj_news.csv")

# Step 2: build the S&P 500 movement labels for the same range and persist them
stock_df = download_sp500_labels(start_date=start_date, end_date=end_date)
stock_df.to_csv("data/sp500_labels.csv", index=False)
print(f" Saved {len(stock_df)} rows of stock movement labels to data/sp500_labels.csv")

# Step 3: join headlines with labels on their shared date and persist the result
merged_df = merge_news_and_labels(news_df=news_df, stock_df=stock_df)
merged_df.to_csv("data/merged_news_market.csv", index=False)
print(f" Merged data saved to data/merged_news_market.csv with {len(merged_df)} rows")