In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

In [2]:
API_KEY = 'HTmoDru9h7CiP13MiwrdpdC2gItAWs7e'

In [3]:
BASE_URL = 'https://api.nytimes.com/svc/archive/v1'

In [4]:
def fetch_articles(year, month, api_key):
    url = f"{BASE_URL}/{year}/{month}.json"
    params = {'api-key': api_key}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    elif response.status_code == 401:
        raise Exception("Invalid API key. Please check your API key and try again.")
    else:
        print(f"API request failed with status code {response.status_code} for {year}-{month}")
        return None


In [5]:
def extract_finance_headlines(articles):
    finance_keywords = ['Tesla', 'business', 'economy', 'market', 'stock', 'investment']
    #used keywords for apple: apple, stock, market and iphne
    #used keywords for Nvidia: Nvidia, GPU, stock, market, processor
    headlines = []
    for article in articles:
        news_desk = article.get('news_desk', '').lower()
        section_name = article.get('section_name', '').lower()
        keywords = [keyword['value'].lower() for keyword in article.get('keywords', [])]
        if any(kw in news_desk for kw in finance_keywords) or any(kw in section_name for kw in finance_keywords) or any(kw in keywords for kw in finance_keywords):
            headlines.append(article['headline']['main'])
        if len(headlines) == 27:  # Collect up to 27 headlines
            break
    return headlines


In [6]:
def collect_headlines(start_date, end_date, api_key):
    headlines_data = []

    current_date = start_date
    while current_date <= end_date:
        year = current_date.year
        month = current_date.month
        articles_data = fetch_articles(year, month, api_key)
        if articles_data:
            articles = articles_data['response']['docs']
            date_str = current_date.strftime('%Y-%m-%d')
            daily_articles = [article for article in articles if article['pub_date'].startswith(date_str)]
            headlines = extract_finance_headlines(daily_articles)
            headlines_data.append({'published_date': date_str, 'headlines': headlines})
            print(f"Collected {len(headlines)} headlines for {date_str}")
        current_date += timedelta(days=1)
        time.sleep(12)  # To respect rate limits

    return pd.DataFrame(headlines_data)

In [7]:
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 4, 10)
try:
    df_headlines = collect_headlines(start_date, end_date, API_KEY)
    df_headlines.to_csv('finance_headlines_nw.csv', index=False)
    print("Headlines successfully collected and saved to finance_headlines.csv")
except Exception as e:
    print(f"Error: {e}")

Collected 4 headlines for 2023-01-01
Collected 4 headlines for 2023-01-02
Collected 12 headlines for 2023-01-03
Collected 13 headlines for 2023-01-04
Collected 11 headlines for 2023-01-05
Collected 13 headlines for 2023-01-06
Collected 8 headlines for 2023-01-07
Collected 5 headlines for 2023-01-08
Collected 6 headlines for 2023-01-09
Collected 15 headlines for 2023-01-10
Collected 12 headlines for 2023-01-11
Collected 17 headlines for 2023-01-12
Collected 16 headlines for 2023-01-13
Collected 6 headlines for 2023-01-14
Collected 3 headlines for 2023-01-15
Collected 5 headlines for 2023-01-16
Collected 16 headlines for 2023-01-17
Collected 12 headlines for 2023-01-18
Collected 12 headlines for 2023-01-19
Collected 17 headlines for 2023-01-20
Collected 6 headlines for 2023-01-21
Collected 5 headlines for 2023-01-22
Collected 10 headlines for 2023-01-23
Collected 14 headlines for 2023-01-24
Collected 11 headlines for 2023-01-25
Collected 18 headlines for 2023-01-26
Collected 14 headlines