In [37]:
from datetime import datetime
import os
# --- Run configuration ---
# Symbols to collect, the date window (as YYYY-MM-DD strings), and where
# the downloaded article files are written.
TICKERS = ["AAPL", "GOOG", "MSFT", "META", "TSLA", "NVDA", "AMZN", "NFLX"]
START_DATE = "2025-01-01"
END_DATE = "2025-06-30"
OUTPUT_DIR = "../data/articles"

# Alpha Vantage API key, taken from the AV_TOKEN environment variable
# (raises KeyError when the variable is not set).
API_KEY = os.environ["AV_TOKEN"]

# Create the output directory if it does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Parse both ends of the window into datetime objects in one step.
run_date, end_date = (
    datetime.strptime(stamp, "%Y-%m-%d") for stamp in (START_DATE, END_DATE)
)
print(run_date)

2025-01-01 00:00:00


In [38]:
import json
import os
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from pprint import pprint
import requests
import yfinance as yf

# Pull price history for every configured ticker in a single yfinance call,
# grouped per ticker and without automatic price adjustment.
hist_data = yf.download(
    TICKERS,
    start=START_DATE,
    end=END_DATE,
    group_by="ticker",
    auto_adjust=False,
)

# Look up each ticker's `.info` once and keep it keyed by symbol.
info_cache = dict((symbol, yf.Ticker(symbol).info) for symbol in TICKERS)

def _sentiment_label(score):
    """Map a numeric sentiment score to a lowercase label.

    Buckets (same cut-offs the notebook has always used):
        score <= -0.35          -> "bearish"
        -0.35 < score <= -0.15  -> "somewhat-bearish"
        -0.15 < score <  0.15   -> "neutral"
         0.15 <= score < 0.35   -> "somewhat-bullish"
        otherwise               -> "bullish"
    """
    if score <= -0.35:
        return "bearish"
    if score <= -0.15:
        return "somewhat-bearish"
    if score < 0.15:
        return "neutral"
    if score < 0.35:
        return "somewhat-bullish"
    return "bullish"


def fetch_news_sentiment(ticker, start_date, stop_date):
    """Fetch Alpha Vantage NEWS_SENTIMENT articles for one ticker.

    Args:
        ticker: Symbol sent as the ``tickers`` query parameter.
        start_date: Object with ``strftime``; formatted as ``%Y%m%dT%H%M``
            and sent as ``time_from``.
        stop_date: Object with ``strftime``; formatted the same way and sent
            as ``time_to``.

    Returns:
        A list with one dict per entry of the response's ``"feed"`` list
        (keys: Ticker, Title, Published, Sentiment, Sentiment Score, Summary,
        Source, Source_domain, URL, Full_Article), or ``None`` when the
        response contains no ``"feed"`` key.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://www.alphavantage.co/query"
    print(f"Fetching news sentiment for {ticker}...")
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "apikey": API_KEY,
        # NOTE(review): the Alpha Vantage docs appear to cap `limit` at 1000;
        # confirm whether 10000 is honored or silently reduced.
        "limit": 10000,  # Adjust as needed
        "time_from": start_date.strftime("%Y%m%dT%H%M"),
        "time_to": stop_date.strftime("%Y%m%dT%H%M"),
        "sort": "EARLIEST"
    }

    response = requests.get(url, params=params, timeout=10)
    # Surface HTTP failures directly instead of trying to parse an error
    # page as JSON.
    response.raise_for_status()
    data = response.json()

    if "feed" not in data:
        # The payload is printed so the reason for the missing feed is visible.
        print(f"No news data found for {ticker}:", data)
        return None

    articles = []
    for article in data["feed"]:
        raw_score = article.get("overall_sentiment_score", "")
        try:
            sentiment = _sentiment_label(float(raw_score)).capitalize()
        except (TypeError, ValueError):
            # Missing or non-numeric score: keep the article, leave the
            # label empty rather than aborting the whole batch.
            sentiment = ""

        articles.append({
            "Ticker": ticker,
            "Title": article.get("title", ""),
            # First eight characters of the timestamp, i.e. YYYYMMDD.
            "Published": article.get("time_published", "")[:8],
            "Sentiment": sentiment,
            "Sentiment Score": raw_score,
            "Summary": article.get("summary", ""),
            "Source": article.get("source", ""),
            "Source_domain": article.get("source_domain", ""),
            "URL": article.get("url", ""),
            # Full-text scraping is not performed; the field stays empty.
            "Full_Article": None
        })
    return articles

[*********************100%***********************]  8 of 8 completed


In [39]:
# Track, per ticker, the next start date that still has to be requested.
ticker_next_run = {ticker: run_date for ticker in TICKERS}
try:
    # Loop through each ticker and fetch news sentiment
    for ticker in TICKERS:
        # Use a separate cursor so the configured `run_date` is left untouched
        # and this cell can be re-run from the same starting point.
        window_start = ticker_next_run[ticker]
        while window_start <= end_date:
            print(f"Processing date: {window_start.strftime('%Y-%m-%d')}")
            # NOTE(review): every request spans window_start..end_date, so
            # consecutive requests overlap — confirm this is intended.
            articles = fetch_news_sentiment(ticker, window_start, end_date)
            pprint(articles)
            file = ticker + "_" + window_start.strftime('%Y-%m-%d') + "_" + end_date.strftime('%Y-%m-%d')
            filename = os.path.join(OUTPUT_DIR, f"{file}.json")
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(articles, f, indent=2)
            # Advance the stored date AND the loop cursor. Previously only the
            # dict entry moved, so the `while` condition never changed and the
            # loop ran forever on the first date.
            ticker_next_run[ticker] += timedelta(days=1)
            window_start = ticker_next_run[ticker]
except Exception as e:
    # Any failure stops the whole download; only the message is shown.
    print(e)

Processing date: 2025-01-01
Fetching news sentiment for AAPL...
[{'Full_Article': None,
  'Published': '20250101',
  'Sentiment': 'Neutral',
  'Sentiment Score': 0.077051,
  'Source': 'Motley Fool',
  'Source_domain': 'www.fool.com',
  'Summary': "Here's what to expect from the stock market, along with some of "
             "Wall Street's hottest trends and most influential businesses, in "
             'the new year.',
  'Ticker': 'AAPL',
  'Title': '10 Stock Market Predictions for 2025',
  'URL': 'https://www.fool.com/investing/2025/01/01/10-stock-market-predictions-for-2025/'},
 {'Full_Article': None,
  'Published': '20250101',
  'Sentiment': 'Somewhat-bullish',
  'Sentiment Score': 0.262962,
  'Source': 'Zacks Commentary',
  'Source_domain': 'www.zacks.com',
  'Summary': 'Style Box ETF report for ...',
  'Ticker': 'AAPL',
  'Title': 'Should Vanguard S&P 500 Growth ETF  ( VOOG )  Be on Your Investing '
           'Radar?',
  'URL': 'https://www.zacks.com/stock/news/2390296/should-v

KeyboardInterrupt: 

In [16]:
# Scratch check: pretty-print whatever is currently bound to `articles`
# (assigned by the download loop from fetch_news_sentiment's return value).
pprint(articles)

None


In [15]:
# Scratch cell: moves the stored next-run date for the current `ticker`
# forward by one day every time it is executed (re-running changes state).
ticker_next_run[ticker] += timedelta(days=1)

In [16]:
# Show the next-run date currently stored for `ticker`.
print(ticker_next_run[ticker])

2025-01-03 00:00:00


2025-01-01 00:00:00


In [25]:
import os
from alpha_vantage_pro import AlphaVantageClient, AlphaVantageClientAsync
# Re-express the configured window as compact "%Y%m%dT%H%M" strings.
# NOTE(review): this rebinds `run_date` / `end_date` to str, whereas the
# configuration cell binds them to datetime objects — cells that call
# `.strftime` on them will fail if run after this one.
run_date, end_date = (
    datetime.strptime(day, "%Y-%m-%d").strftime("%Y%m%dT%H%M")
    for day in (START_DATE, END_DATE)
)
TICKERS = ["AAPL", "GOOG", "MSFT", "META", "TSLA", "NVDA", "AMZN", "NFLX"]
print(run_date)

20250101T0000


In [35]:
# Synchronous client; the API key is read from the AV_TOKEN environment variable.
client = AlphaVantageClient(api_key=os.environ['AV_TOKEN'])
# Daily series for AAPL.
df = client.get_daily("AAPL")
# News sentiment for all TICKERS over run_date..end_date.
# NOTE(review): the final argument is presumably a result limit — confirm
# against alpha_vantage_pro; the count shown in the next cell is 50 per column.
news = client.get_news_sentiment(TICKERS, run_date, end_date, 10000)

In [36]:
# Print the result of `news.count()`, then display `df` as the cell's output.
print(news.count())
df

title                      50
url                        50
time_published             50
authors                    50
summary                    50
banner_image               50
source                     50
category_within_source     50
source_domain              50
topics                     50
overall_sentiment_score    50
overall_sentiment_label    50
ticker_sentiment           50
dtype: int64


Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2025-03-11,223.8050,225.8399,217.4500,220.8400,76137410
2025-03-12,220.1400,221.7500,214.9100,216.9800,62547467
2025-03-13,215.9500,216.8394,208.4200,209.6800,61368330
2025-03-14,211.2500,213.9500,209.5800,213.4900,60107582
2025-03-17,213.3100,215.2200,209.9700,214.0000,48073426
...,...,...,...,...,...
2025-07-28,214.0300,214.8450,213.0600,214.0500,37858017
2025-07-29,214.1750,214.8100,210.8200,211.2700,51411723
2025-07-30,211.8950,212.3900,207.7200,209.0500,45512514
2025-07-31,208.4900,209.8400,207.1600,207.5700,80698431


In [34]:
client = AlphaVantageClientAsync(api_key="YOUR_KEY")
df = await client.get_daily_async(TICKERS)
await client.close()

In [32]:
# Display whatever is currently bound to `df`.
df

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2025-03-11,867.4100,910.6800,866.0000,895.1000,7398934
2025-03-12,914.6000,927.1800,901.0000,919.6800,6865837
2025-03-13,913.0000,921.8892,887.5100,890.1700,5001698
2025-03-14,901.4600,919.6250,901.0000,918.0000,5200684
2025-03-17,939.9500,967.6300,934.4200,950.0200,7543146
...,...,...,...,...,...
2025-07-28,1181.2000,1197.7899,1168.5300,1174.6000,3446958
2025-07-29,1179.0000,1179.6000,1163.2900,1168.7400,2752489
2025-07-30,1168.5500,1184.8500,1166.1000,1184.2000,2922731
2025-07-31,1184.8000,1190.0000,1157.7400,1159.4000,3712949
