## Scraping CHARITY NAVIGATOR

In [None]:
## Fetch data from Charity Navigator API
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json

# Load environment variables from .env file
load_dotenv()

# API token read from the CHARITY_NAVIGATOR environment variable; it is sent
# as the "Stellate-Api-Token" header by fetch_charity_data.
# os.getenv returns None when the variable is not set.
charity_navigator_key = os.getenv('CHARITY_NAVIGATOR')

# URL that fetch_charity_data POSTs its GraphQL query to
CHARITY_NAVIGATOR_ENDPOINT = 'https://data.charitynavigator.org/'

# Fetch data from Charity Navigator
def fetch_charity_data(count, timeout=30):
    """Fetch one page of search results from the Charity Navigator API.

    Sends a GraphQL ``publicSearchFaceted`` query with an empty search term
    and ``count`` as the ``from`` offset.

    Args:
        count: Integer offset interpolated into the query's ``from`` argument.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits forever on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    query = """
    query {
        publicSearchFaceted(term: "", from: %d) {
            size
            from
            term
            result_count
            results {
                ein
                name
                mission
                organization_url
                charity_navigator_url
                encompass_score
                encompass_star_rating
                encompass_publication_date
                cause
                street
                street2
                city
                state
                zip
                country
                highest_level_advisory
                encompass_rating_id
            }
        }
    }
    """ % count

    headers = {
        "Stellate-Api-Token": charity_navigator_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.post(
        CHARITY_NAVIGATOR_ENDPOINT,
        headers=headers,
        json={"query": query},
        timeout=timeout,
    )
    # Raise an error if the request fails
    response.raise_for_status()

    # Return the JSON response
    return response.json()

results = []

# The paging loop is currently disabled. Uncomment it to fetch the offsets
# 0, 10, ..., 10000 from the API:
# for i in range(0, 10001, 10):
#     data = fetch_charity_data(i)
#     print(data)
#     results.append(data)

# Save the results to a JSON file. With the loop disabled `results` is empty,
# so skip the write rather than replace previously fetched data with "[]".
if results:
    with open('charity_navigator_data.json', 'w') as f:
        json.dump(results, f, indent=2)
else:
    print("No results fetched; charity_navigator_data.json left unchanged.")

In [5]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin

# Load environment variables
load_dotenv()

# Helper function to validate URLs
def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Args:
        url: The URL string to check.

    Returns:
        True if the parsed URL has a non-empty scheme and netloc, otherwise
        False. Strings that urlparse rejects outright also yield False.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed authorities, e.g. an unbalanced IPv6
        # bracket such as "http://[".
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

# Fix and normalize URLs
def fixURL(url):
    """Normalize a website address into an absolute http(s) URL.

    Surrounding whitespace is removed, ``https://`` is prepended when the
    value has no ``http://`` or ``https://`` prefix, and the scheme and host
    are lowercased. The path, query and fragment keep their case because
    they may be case-sensitive.

    Args:
        url: Raw URL string, or None.

    Returns:
        The normalized URL, or None when *url* is None or blank.
    """
    if url is None:
        return None
    url = url.strip()
    if not url:
        # A blank value would otherwise turn into the useless "https://".
        return None
    # Require the full "scheme://" prefix so hosts that merely start with
    # "http" (e.g. "httpbin.org") still get a scheme.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    scheme, _, rest = url.partition('://')
    # The authority (host and optional port) ends at the first '/', '?' or '#'.
    ends = [idx for idx in (rest.find(ch) for ch in '/?#') if idx != -1]
    cut = min(ends) if ends else len(rest)
    return f"{scheme.lower()}://{rest[:cut].lower()}{rest[cut:]}"

# Function to fetch the logo URL from the organization page
def get_logo(url):
    """Fetch and return the logo URL from a given website URL.

    The page is downloaded and searched for an <img> whose class, id or alt
    text contains "logo" or "brand". If no such image exists, the first
    <link rel="icon"> is used as a fallback.

    Args:
        url: Absolute URL of the organization's website.

    Returns:
        The absolute URL of the logo (or icon), or None when *url* is not
        valid, the request fails, or nothing suitable is found on the page.
    """
    print(f"Fetching logo from {url}")
    if not is_valid_url(url):
        print(f"Invalid URL skipped: {url}")
        return None

    # Some sites may reject the default python-requests User-Agent with a
    # 403, so send a browser-like one.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Add timeout for safety
        response.raise_for_status()  # Raise HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching logo from {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Resolve relative paths against the final URL so redirects are honoured.
    base_url = response.url or url

    # Prefer a real logo image. Images without a src are skipped so that one
    # unusable match does not end the search.
    for img in soup.find_all('img'):
        if img.get('src') and _mentions_logo(img):
            return urljoin(base_url, img['src'])

    # Fall back to the site icon declared in the page head.
    for link in soup.find_all('link'):
        if link.get('href') and _is_icon_link(link):
            return urljoin(base_url, link['href'])

    print("Logo not found on the page.")
    return None


def _attr_tokens(value):
    """Return an HTML attribute value as a list of strings."""
    if not value:
        return []
    if isinstance(value, str):
        return value.split()
    return list(value)


def _mentions_logo(img, keywords=('logo', 'brand')):
    """Return True if the tag's class, id or alt text contains a keyword."""
    parts = _attr_tokens(img.get('class'))
    parts.append(img.get('id') or '')
    parts.append(img.get('alt') or '')
    text = ' '.join(parts).lower()
    # Substring match, so values such as "site-logo" or "Acme Logo" count.
    return any(keyword in text for keyword in keywords)


def _is_icon_link(link):
    """Return True if the <link> tag's rel attribute includes "icon"."""
    return 'icon' in [token.lower() for token in _attr_tokens(link.get('rel'))]

# Process the list of charities and fetch logos
def get_logos(charities):
    """Attach a 'logoUrl' entry to every charity dict, in place.

    Args:
        charities: List of charity dicts; each may carry an
            'organization_url' key.

    Returns:
        The same list object, with 'logoUrl' set on every element to the
        result of get_logo, or to None when the charity has no URL.
    """
    for entry in charities:
        site = entry.get('organization_url')
        # Only hit the network when there is a URL to look at.
        entry['logoUrl'] = get_logo(site) if site else None
    return charities


In [None]:
# Read the raw API responses saved earlier
with open("charity_navigator_data.json", "r") as f:
    charity_navigator_data = json.load(f)

# Flatten every page of results into one list, keeping only the fields used
# later and normalizing the organization URL.
copied_fields = ('name', 'mission', 'cause', 'city', 'country')
charity_navigator_json = []
for page in charity_navigator_data:
    for record in page['data']['publicSearchFaceted']['results']:
        entry = {field: record[field] for field in copied_fields}
        entry['organization_url'] = fixURL(record.get('organization_url'))
        charity_navigator_json.append(entry)


# Write the flattened charity list to its own JSON file
with open('charity_navigator.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


In [6]:
# Trial run: load charity_navigator.json and fetch logos for its first 10 entries
with open('charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)

# The slice is a new list holding the same dicts, so get_logos updates the
# first 10 entries of charity_navigator_json itself.
first_ten = charity_navigator_json[:10]
get_logos(first_ten)

# The whole list is written out, not just the first 10 entries
with open('charity_navigator_logos.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


Fetching logo from https://www.mercyships.org
Fetching logo from https://www.rescue.org
Fetching logo from https://www.doctorswithoutborders.org
Fetching logo from https://www.irteams.org
Fetching logo from https://www.wck.org
Fetching logo from https://www.unicefusa.org
Fetching logo from https://www.lirs.org
Fetching logo from https://www.audubon.org
Fetching logo from https://www.ymca.org
Fetching logo from https://www.map.org


In [7]:
# Read the flattened charity list back from charity_navigator.json
with open('charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)

# Fetch a logo for every charity; get_logos fills in 'logoUrl' in place
get_logos(charity_navigator_json)

# Store the charities together with their logo URLs
with open('charity_navigator_logos.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)

Fetching logo from https://www.mercyships.org
Fetching logo from https://www.rescue.org
Fetching logo from https://www.doctorswithoutborders.org
Fetching logo from https://www.irteams.org
Fetching logo from https://www.wck.org
Fetching logo from https://www.unicefusa.org
Fetching logo from https://www.lirs.org
Fetching logo from https://www.audubon.org
Fetching logo from https://www.ymca.org
Fetching logo from https://www.map.org
Fetching logo from https://www.amnestyusa.org
Error fetching logo from https://www.amnestyusa.org: 403 Client Error: Forbidden for url: https://www.amnestyusa.org/
Fetching logo from https://www.conservation.org
Fetching logo from https://www.wri.org
Fetching logo from https://www.missingkids.org
Fetching logo from https://www.goodwill.org
Fetching logo from https://www.hearttoheart.org
Error fetching logo from https://www.hearttoheart.org: 403 Client Error: Forbidden for url: https://www.hearttoheart.org/
Fetching logo from https://www.worldvision.org
Fetchin