### What to do
Scrape the websites in order to extract the following information:
- Name 
- Logo
- Location 
- Themes (i.e. children, homeless, medicine...)
- Description
- URL of their website
- Year of foundation (This information is sometimes implicit: can be computed based on Number of year of activity)

### What to use
- Requests
- BeautifulSoup
- Scrapy

### Websites to scrape
urls = [
    "https://www.charitynavigator.org",
    "https://www.globalgiving.org",
    "https://www.guidestar.org",
]

### Charity Navigator API
https://charity-navigator.stellate.io

In [9]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd

# Load environment variables from .env file
load_dotenv()

# Access the variables
charity_navigator_key = os.getenv('CHARITY_NAVIGATOR')

CHARITY_NAVIGATOR_ENDPOINT = 'https://data.charitynavigator.org/'

QUERY = """
query PublicSearchFaceted($term: String!) {
    publicSearchFaceted(term: $term) {
        size
        from
        term
        result_count
        results {
            ein
            name
            mission
            organization_url
            charity_navigator_url
            encompass_score
            encompass_star_rating
            encompass_publication_date
            cause
            street
            street2
            city
            state
            zip
            country
            highest_level_advisory
            encompass_rating_id
        }
    }
}
"""

# Fetch data from Charity Navigator
def fetch_charity_data(term):
    print("api key", charity_navigator_key)
    headers = {
        "Authorization": f"Bearer {charity_navigator_key}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.post(
        CHARITY_NAVIGATOR_ENDPOINT,
        headers=headers,
        json={"query": QUERY, "variables": {"term": term}}
    )
    # Raise an error if the request fails
    response.raise_for_status()
    
    # Return the JSON response
    return response.json()

# Extract the results from the JSON response
def extract_results(data):
    results = data.get("data", {}).get("publicSearchFaceted", {}).get("results", [])
    return results

# Fetch paginated data
def fetch_paginated_data(term, page_size=10):
    all_results = []
    offset = 0
    headers = {
        "Authorization": f"Bearer {charity_navigator_key}",
        "Stellate-Api-Token": charity_navigator_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    while True:
        # Add pagination variables
        response = requests.post(
            CHARITY_NAVIGATOR_ENDPOINT,
            headers=headers,  # Add headers here
            json={
                "query": QUERY,
                "variables": {"term": term, "size": page_size, "from": offset}
            }
        )
        response.raise_for_status()
        data = response.json()

        # Extract results
        results = extract_results(data)
        all_results.extend(results)

        # Check if there are more results
        if len(results) < page_size:
            break

        offset += page_size

    return all_results

# Single page fetch
term = "education"  # Example search term
try:
    data = fetch_charity_data(term)
    results = extract_results(data)
    df = pd.DataFrame(results)
    df.to_csv("charity_data.csv", index=False)
    print("Data saved to charity_data.csv")
except Exception as e:
    print(f"Error: {e}")

# Paginated fetch
try:
    all_results = fetch_paginated_data(term)
    df = pd.DataFrame(all_results)
    df.to_csv("charity_data_complete.csv", index=False)
    print(df.head())
except Exception as e:
    print(f"Error: {e}")


api key stl8_9fbdbb1f5dc244b9de7f3d7066ee0b62ab8f2e94f73712e57a236371cc3bbb82
Error: 403 Client Error:  for url: https://data.charitynavigator.org/


KeyboardInterrupt: 