In [3]:
import os
import time
import json
import pandas as pd
from datetime import datetime
import requests  # Ensure that the requests module is imported

# Request pacing for the API calls made below: the per-minute quota fixes the
# pause between consecutive requests, the per-day quota caps a single day's run.
REQUESTS_PER_DAY = 500
REQUESTS_PER_MINUTE = 5
SECONDS_PER_REQUEST = 60 // REQUESTS_PER_MINUTE  # 12 seconds between requests

def get_progress_filename(query):
    """Return the name of the JSON progress file that belongs to ``query``.

    Double and single quotes are dropped, spaces become underscores, and the
    result is lower-cased before it is embedded in the file name.
    """
    cleanup_table = str.maketrans({'"': None, "'": None, ' ': '_'})
    slug = query.translate(cleanup_table).lower()
    return "nyt_progress_" + slug + ".json"

def get_csv_filename(query, begin_date='20220101', end_date='20240731'):
    """Return the name of the CSV file that stores the articles for ``query``.

    The query is made file-name friendly the same way as for the progress
    file: quotes are removed, spaces become underscores, and the text is
    lower-cased.

    Args:
        query: Search query text.
        begin_date: Start of the date range embedded in the file name
            (YYYYMMDD). Defaults to the range this name previously hard-coded.
        end_date: End of the date range embedded in the file name (YYYYMMDD).

    Returns:
        str: ``nyt_<query>_articles_<begin_date>_<end_date>.csv``.
    """
    query_safe = query.replace('"', '').replace("'", "").replace(' ', '_').lower()
    return f"nyt_{query_safe}_articles_{begin_date}_{end_date}.csv"

def save_progress(query, page, request_count):
    """Write the resume state for ``query`` to its own JSON progress file.

    The stored state holds the query, the page number, the running request
    count and today's date (YYYY-MM-DD). The saved state is echoed to stdout.
    """
    target_path = get_progress_filename(query)
    today = datetime.now().date().isoformat()  # same YYYY-MM-DD text as strftime
    state = dict(query=query, page=page, request_count=request_count, date=today)

    with open(target_path, "w") as handle:
        json.dump(state, handle)

    print(f"Progress saved to {target_path}: {state}")

def load_progress(query):
    """Read back the saved resume state for ``query``.

    Returns the decoded JSON content of the query's progress file, or ``None``
    (after printing a notice) when that file does not exist.
    """
    source_path = get_progress_filename(query)

    if os.path.exists(source_path):
        with open(source_path, "r") as handle:
            state = json.load(handle)
        print(f"Progress loaded from {source_path}: {state}")
        return state

    print(f"No progress file found for query '{query}'. Starting from the beginning.")
    return None

def get_nyt_article_count(api_key, query, begin_date=None, end_date=None, timeout=30):
    """Return the total number of hits the article search reports for ``query``.

    Only page 0 is requested; the count is read from ``response.meta.hits``
    in the JSON reply.

    Args:
        api_key: API key sent as the ``api-key`` parameter.
        query: Search query text.
        begin_date: Optional start date (YYYYMMDD).
        end_date: Optional end date (YYYYMMDD).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value of ``data['response']['meta']['hits']``.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

    params = {
        'q': query,
        'api-key': api_key,
        'begin_date': begin_date,
        'end_date': end_date,
        'page': 0,  # Only the first page is needed to read the count
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    data = response.json()
    return data['response']['meta']['hits']

def get_nyt_articles(api_key, query, begin_date=None, end_date=None, page=0, timeout=30):
    """Fetch one page of article search results for ``query``.

    Args:
        api_key: API key sent as the ``api-key`` parameter.
        query: Search query text.
        begin_date: Optional start date (YYYYMMDD).
        end_date: Optional end date (YYYYMMDD).
        page: Zero-based result page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``docs`` entry of the JSON reply's ``response`` object.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error
            (callers use this to detect HTTP 429).
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

    params = {
        'q': query,
        'api-key': api_key,
        'page': page,
        'begin_date': begin_date,
        'end_date': end_date,
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    data = response.json()
    return data['response']['docs']

def process_query(api_key, query, begin_date, end_date, request_count, max_requests_per_day, page=0):
    """Download result pages for ``query`` and keep its CSV file up to date.

    Rows already stored in the query's CSV are loaded first and new pages are
    appended to them. After every fetched page the progress file and the CSV
    are rewritten, so an interrupted run loses at most the page in flight.

    Args:
        api_key: API key passed through to ``get_nyt_articles``.
        query: Search query text.
        begin_date: Start date (YYYYMMDD) passed through to the API call.
        end_date: End date (YYYYMMDD) passed through to the API call.
        request_count: Number of requests already made today.
        max_requests_per_day: Stop once ``request_count`` reaches this value.
        page: Zero-based page to start from.

    Returns:
        tuple: ``(DataFrame, request_count)``. A DataFrame is returned on
        every path (query finished, daily limit reached, or rate-limit
        back-off exhausted) so callers can treat the result uniformly.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error other than 429.
    """
    print(f"Processing query: {query} starting from page {page}")

    # Upper bound on back-to-back 429 replies before giving up for this run;
    # without it an exhausted quota keeps the loop sleeping forever.
    max_consecutive_backoffs = 5
    consecutive_backoffs = 0

    csv_filename = get_csv_filename(query)

    all_articles = []
    if os.path.exists(csv_filename):
        # keep_default_na=False keeps empty or "NA"-like cells as text instead
        # of NaN so they round-trip unchanged when the CSV is rewritten.
        existing_df = pd.read_csv(csv_filename, keep_default_na=False)
        # The CSV stores flattened columns, while create_dataframe() reads
        # API-style keys. Convert the rows back so previously saved articles
        # are not replaced by the "No ... available" defaults on rewrite.
        all_articles = [_csv_row_to_article(row) for row in existing_df.to_dict('records')]

    while True:
        # Check the daily budget before spending another request
        if request_count >= max_requests_per_day:
            print(f"Reached the daily request limit of {max_requests_per_day}. Stopping.")
            save_progress(query, page, request_count)
            save_to_csv(all_articles, csv_filename)
            print(f"Total articles extracted so far: {len(all_articles)}")
            return create_dataframe(all_articles), request_count

        try:
            articles = get_nyt_articles(api_key, query, begin_date, end_date, page=page)
        except requests.exceptions.HTTPError as e:
            if e.response is None or e.response.status_code != 429:
                raise
            consecutive_backoffs += 1
            if consecutive_backoffs > max_consecutive_backoffs:
                print(f"Still rate limited after {max_consecutive_backoffs} back-offs. Stopping.")
                save_progress(query, page, request_count)
                save_to_csv(all_articles, csv_filename)
                print(f"Total articles extracted so far: {len(all_articles)}")
                return create_dataframe(all_articles), request_count
            print("Rate limit exceeded. Backing off...")
            time.sleep(60)
            continue

        consecutive_backoffs = 0
        # Count every answered request, including the final empty page, so
        # the budget shared across queries is not under-reported.
        request_count += 1

        if not articles:
            break

        all_articles.extend(articles)
        print(f"Fetched {len(articles)} articles for this page, total so far: {len(all_articles)}")

        page += 1

        # Save progress and data after every successful request
        save_progress(query, page, request_count)
        save_to_csv(all_articles, csv_filename)

        # Enforce rate limit: pause between requests
        time.sleep(SECONDS_PER_REQUEST)

    # Persist the final request count and data once the query is exhausted
    save_progress(query, page, request_count)
    save_to_csv(all_articles, csv_filename)
    print(f"Total articles fetched for query '{query}': {len(all_articles)}")

    return create_dataframe(all_articles), request_count


def _csv_row_to_article(row):
    """Rebuild the API-style article dict read by create_dataframe from one saved CSV row."""
    return {
        'headline': {'main': row.get('Title', 'No title available')},
        'web_url': row.get('URL', 'No URL available'),
        'lead_paragraph': row.get('Lead Paragraph', 'No content available'),
        'pub_date': row.get('Publication Date', 'No date available'),
        'source': row.get('Source', 'No source available'),
    }

def create_dataframe(all_articles):
    """Build a DataFrame with one row per article dict.

    Args:
        all_articles: Iterable of dicts using the keys ``headline`` (a dict
            with a ``main`` entry), ``web_url``, ``lead_paragraph``,
            ``pub_date`` and ``source``.

    Returns:
        pandas.DataFrame: Columns ``Title``, ``URL``, ``Lead Paragraph``,
        ``Publication Date`` and ``Source``. A missing key is replaced by a
        "No ... available" placeholder.
    """
    data = {
        'Title': [_headline_title(article) for article in all_articles],
        'URL': [article.get('web_url', 'No URL available') for article in all_articles],
        'Lead Paragraph': [article.get('lead_paragraph', 'No content available') for article in all_articles],
        'Publication Date': [article.get('pub_date', 'No date available') for article in all_articles],
        'Source': [article.get('source', 'No source available') for article in all_articles],
    }

    return pd.DataFrame(data)


def _headline_title(article):
    """Return the article's main headline, tolerating a missing or null ``headline``."""
    headline = article.get('headline')
    # .get('headline', {}) would hand back None when the key exists with a
    # null value, and calling .get on that raises AttributeError.
    if not isinstance(headline, dict):
        return 'No title available'
    return headline.get('main', 'No title available')

def save_to_csv(all_articles, csv_filename):
    """Convert ``all_articles`` with ``create_dataframe`` and write it to ``csv_filename``.

    The file is written as UTF-8 without the index column, replacing any
    existing file of that name, and a confirmation line is printed.
    """
    create_dataframe(all_articles).to_csv(csv_filename, encoding='utf-8', index=False)
    print(f"Data saved to {csv_filename}")

def main():
    """Run every configured query, resuming from saved progress where present.

    For each query the saved page number is always honoured so that a run on
    a later day continues where the previous one stopped instead of fetching
    page 0 again and appending duplicate rows to the existing CSV. The request
    count is shared across all queries of the run, so the daily limit applies
    to the run as a whole.
    """
    # Prefer a key from the environment so a real credential never has to be
    # written into the source; the placeholder keeps the previous default.
    api_key = os.environ.get('NYT_API_KEY', 'sample_api_key')

    # Define the date range
    begin_date = '20220101'  # Start date in YYYYMMDD format
    end_date = '20240731'    # End date in YYYYMMDD format

    # List of queries to process
    queries = ["federal reserve"]

    # Dictionary to store the DataFrame for each query
    query_dataframes = {}

    # Requests made so far today, carried from one query to the next
    request_count = 0

    for query in queries:
        start_page = 0
        progress = load_progress(query)
        if progress:
            start_page = progress['page']
            if progress['date'] == datetime.now().strftime('%Y-%m-%d'):
                # Saved today: never fall below the usage already recorded
                request_count = max(request_count, progress['request_count'])
            else:
                # The saved count belongs to an earlier day and is ignored;
                # only the page position is kept.
                print("New day detected. Resetting the request count.")

        # Process the query and store the DataFrame in the dictionary
        query_df, request_count = process_query(
            api_key, query, begin_date, end_date,
            request_count, REQUESTS_PER_DAY, page=start_page,
        )
        query_dataframes[query] = query_df

        # Display the DataFrame for the current query
        print(f"\nDataFrame for '{query}':")
        print(query_df.head())

    print("All queries have been processed.")

# Start the download only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Progress loaded from nyt_progress_federal_reserve.json: {'query': 'federal reserve', 'page': 27, 'request_count': 27, 'date': '2024-08-19'}
Processing query: federal reserve starting from page 27
Fetched 10 articles for this page, total so far: 4290
Progress saved to nyt_progress_federal_reserve.json: {'query': 'federal reserve', 'page': 28, 'request_count': 28, 'date': '2024-08-19'}
Data saved to nyt_federal_reserve_articles_20220101_20240731.csv
Fetched 10 articles for this page, total so far: 4300
Progress saved to nyt_progress_federal_reserve.json: {'query': 'federal reserve', 'page': 29, 'request_count': 29, 'date': '2024-08-19'}
Data saved to nyt_federal_reserve_articles_20220101_20240731.csv
Fetched 10 articles for this page, total so far: 4310
Progress saved to nyt_progress_federal_reserve.json: {'query': 'federal reserve', 'page': 30, 'request_count': 30, 'date': '2024-08-19'}
Data saved to nyt_federal_reserve_articles_20220101_20240731.csv
Rate limit exceeded. Backing off...


ConnectionError: HTTPSConnectionPool(host='api.nytimes.com', port=443): Max retries exceeded with url: /svc/search/v2/articlesearch.json?q=federal+reserve&api-key=S03RGAvnZiFYKcIbnpeXqSbN41PaxHuU&page=201&begin_date=20220101&end_date=20240731 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A3AB166E00>: Failed to resolve 'api.nytimes.com' ([Errno 11001] getaddrinfo failed)"))