## Scraping CHARITY NAVIGATOR

In [9]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
import re
import signal

# Load environment variables by calling python-dotenv's load_dotenv()
# with its default arguments
load_dotenv()

# Helper function to validate URLs
def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Args:
        url: The candidate URL (``str`` or ``bytes``). Any other type,
            including ``None``, is reported as invalid.

    Returns:
        bool: True if ``urlparse`` finds a non-empty scheme and netloc,
        False otherwise (including when the URL cannot be parsed at all).
    """
    if not isinstance(url, (str, bytes)):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unclosed IPv6
        # bracket); a validator should answer False rather than raise.
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

# Fix and normalize URLs
def fixURL(url):
    """Normalize a website address into an absolute http(s) URL.

    Surrounding whitespace is removed, ``https://`` is prepended when the
    value does not already start with ``http://`` or ``https://``, and the
    scheme and host are lowercased. The path, query and fragment keep their
    original case because they can be case-sensitive.

    Args:
        url: The raw URL string, or ``None``.

    Returns:
        The normalized URL, or ``None`` when *url* is ``None``, empty or
        only whitespace.
    """
    if url is None:
        return None
    url = url.strip()
    if not url:
        # An empty value would otherwise become the meaningless 'https://'
        return None

    # Require the full '<scheme>://' prefix so hosts that merely begin with
    # "http" (e.g. "httpbin.org") still get a scheme prepended.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    scheme, _, remainder = url.partition('://')

    # The authority ends at the first '/', '?' or '#', whichever comes first.
    cut = min(
        (pos for pos in map(remainder.find, '/?#') if pos != -1),
        default=len(remainder),
    )
    authority, tail = remainder[:cut], remainder[cut:]

    # Only the host part is case-insensitive; leave any userinfo untouched.
    userinfo, at, host = authority.rpartition('@')
    return scheme.lower() + '://' + userinfo + at + host.lower() + tail


def timeout_handler(signum, frame):
    """Signal handler that aborts the current work by raising TimeoutError.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always.
    """
    message = "Processing time exceeded 15 seconds."
    raise TimeoutError(message)

def get_logo(url):
    """Download *url* and return the most likely logo found in its HTML.

    Candidates are collected from, in order: elements whose class contains
    "logo" (an inline ``<svg>`` or the first child ``<img>``), Open
    Graph/Twitter image meta tags, ``<link rel="icon">`` tags whose href does
    not contain "favicon", and every ``<img>`` that looks like a logo. They
    are sorted by priority (lower first), then by declared pixel area
    (larger first), and the first one wins.

    The whole extraction is limited to 15 seconds with SIGALRM where that is
    possible; the HTTP request itself has a 10 second timeout.

    Args:
        url: Absolute URL of the page to inspect.

    Returns:
        The absolute URL of the chosen image, or the prettified HTML of the
        "logo" container when an inline SVG was chosen, or ``None`` when
        nothing was found, the request failed, or the time limit was hit.
    """
    # SIGALRM only exists on Unix and handlers can only be installed from the
    # main thread. When the alarm cannot be armed, rely on the HTTP timeout
    # alone instead of crashing.
    previous_handler = None
    alarm_armed = False
    if hasattr(signal, 'SIGALRM'):
        try:
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
        except ValueError:
            pass
        else:
            signal.alarm(15)  # 15 seconds timeout
            alarm_armed = True

    try:
        response = requests.get(url, timeout=10)  # 10 seconds timeout for HTTP request
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        logo_candidates = []  # tuples of (source, width, height, priority)

        logo_keywords = ['logo']
        exclude_keywords = ['menu', 'close', 'button', 'hamburger', 'arrow', 'background']

        def parse_dimension(tag, attribute):
            # Read a width/height attribute such as "120" or "120px";
            # anything that is not purely digits counts as unknown (0).
            raw = tag.get(attribute, '0').replace('px', '')
            return int(raw) if raw.isdigit() else 0

        def is_valid_logo(src, tag, width, height):
            # Accept an image only if it mentions a logo keyword, avoids the
            # excluded UI keywords, and is not declared larger than 500x200.
            if not src:
                return False
            src_lower = src.lower()

            if any(exclude in src_lower for exclude in exclude_keywords):
                return False

            alt = tag.get('alt', '').lower()
            tag_class = ' '.join(tag.get('class', [])).lower()
            tag_id = tag.get('id', '').lower()

            if any(keyword in (src_lower + alt + tag_class + tag_id) for keyword in logo_keywords):
                if width and height and (width > 500 or height > 200):
                    return False
                return True
            return False

        def get_priority(src, tag):
            # 0 when "logo" appears in the src/class/id/alt text, else 2.
            src_lower = src.lower() if src else ''
            tag_class = ' '.join(tag.get('class', [])).lower()
            tag_id = tag.get('id', '').lower()
            alt = tag.get('alt', '').lower()

            if 'logo' in (src_lower + tag_class + tag_id + alt):
                return 0
            return 2

        # Step 1: Search containers whose class contains "logo"
        for container in soup.find_all(True, class_=True):
            tag_classes = ' '.join(container.get('class', [])).lower()
            if 'logo' in tag_classes:
                # Inline <svg>: keep the container's markup as the candidate
                svg = container.find('svg')
                if svg:
                    logo_candidates.append((container.prettify(), 0, 0, 0))
                # First child <img>
                img = container.find('img')
                if img and img.get('src'):
                    src = img.get('src')
                    width = parse_dimension(img, 'width')
                    height = parse_dimension(img, 'height')

                    if is_valid_logo(src, img, width, height):
                        priority = get_priority(src, img)
                        logo_candidates.append((urljoin(url, src), width, height, priority))

        # Step 2: Meta tags (Open Graph/Twitter)
        meta_tags = [{'property': 'og:image'}, {'name': 'og:image'}, {'property': 'twitter:image'}]
        for tag in meta_tags:
            meta = soup.find('meta', tag)
            if meta and meta.get('content'):
                logo_candidates.append((urljoin(url, meta['content']), 0, 0, 1))

        # Step 3: <link> tags for icons
        for tag in soup.find_all('link', rel=True):
            rel = tag.get('rel', [])
            href = tag.get('href')
            if href and 'icon' in rel and 'favicon' not in href.lower():
                logo_candidates.append((urljoin(url, href), 0, 0, 2))

        # Step 4: General <img> tags
        for tag in soup.find_all('img'):
            src = tag.get('src')
            width = parse_dimension(tag, 'width')
            height = parse_dimension(tag, 'height')

            if is_valid_logo(src, tag, width, height):
                priority = get_priority(src, tag)
                logo_candidates.append((urljoin(url, src), width, height, priority))

        # Step 5: Sort candidates and select the best
        if not logo_candidates:
            print("Logo not found on the page.")
            return None

        # Lowest priority value first, then largest declared area
        logo_candidates.sort(key=lambda x: (x[3], -(x[1] * x[2])))
        best_candidate = logo_candidates[0][0]
        if "<svg" in best_candidate:
            print("SVG logo detected.")
        return best_candidate
    except TimeoutError:
        print("Timeout: Logo extraction took too long.")
        return None
    except requests.RequestException as e:
        print(f"Error fetching logo from {url}: {e}")
        return None
    finally:
        if alarm_armed:
            signal.alarm(0)  # Disable alarm
            # Put back whatever handler was installed before this call
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

# Process the list of charities and fetch logos
def get_logos(charities):
    """Attach a 'logoUrl' entry to every charity record.

    Each record is announced on stdout with its position in the list. When a
    record has a truthy 'organization_url', the logo is looked up with
    get_logo; otherwise 'logoUrl' is set to None.

    Args:
        charities: Iterable of dicts, each with a 'name' key and optionally
            an 'organization_url' key. The dicts are modified in place.

    Returns:
        The same *charities* object that was passed in.
    """
    for position, entry in enumerate(charities):
        print(f"Processing {entry['name']}, index {position}")
        site = entry.get('organization_url')
        entry['logoUrl'] = get_logo(site) if site else None
    return charities


In [None]:
## Fetch data from Charity Navigator API
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json

# Read the .env file so its entries become available as environment variables
load_dotenv()

# Endpoint that the Charity Navigator queries are posted to
CHARITY_NAVIGATOR_ENDPOINT = 'https://data.charitynavigator.org/'

# API token taken from the CHARITY_NAVIGATOR environment variable
# (None when the variable is not set)
charity_navigator_key = os.environ.get('CHARITY_NAVIGATOR')

# Fetch data from Charity Navigator
def fetch_charity_data(count, timeout=30):
    """Fetch one page of search results from the Charity Navigator API.

    Posts a ``publicSearchFaceted`` GraphQL query with an empty search term
    to CHARITY_NAVIGATOR_ENDPOINT, authenticated with the token stored in
    ``charity_navigator_key``.

    Args:
        count: Integer substituted into the query's ``from`` argument, i.e.
            the offset of the first result to return (despite the name, it
            is not a number of results).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems or timeouts.
    """
    QUERY = """
    query {
        publicSearchFaceted(term: "", from: %d) {
            size
            from
            term
            result_count
            results {
                ein
                name
                mission
                organization_url
                charity_navigator_url
                encompass_score
                encompass_star_rating
                encompass_publication_date
                cause
                street
                street2
                city
                state
                zip
                country
                highest_level_advisory
                encompass_rating_id
            }
        }
    }
    """ % count

    headers = {
        "Stellate-Api-Token": charity_navigator_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.post(
        CHARITY_NAVIGATOR_ENDPOINT,
        headers=headers,
        json={"query": QUERY},
        timeout=timeout,
    )
    # Raise an error if the request fails
    response.raise_for_status()

    # Return the JSON response
    return response.json()

# Collects one API response per fetched page
results = []

# The paging loop is currently disabled. Original code, kept for reference:
# for i in range(0, 10001, 10):
#    data = fetch_charity_data(i)
#    print(data)
#    results.append(data)

# Save the results to a JSON file
# NOTE(review): while the loop above stays disabled, `results` is empty, so
# this writes `[]` over any existing charity_navigator_data.json - confirm
# that is intended before re-running this cell.
with open('charity_navigator_data.json', 'w') as f:
    json.dump(results, f, indent=2)

In [None]:
# Read back the raw API responses written by the fetch step
with open("charity_navigator_data.json", "r") as f:
    charity_navigator_data = json.load(f)

# Flatten every page of results into one list, keeping only the fields that
# are used downstream and normalizing the website URL with fixURL
charity_navigator_json = []
for data in charity_navigator_data:
    page_results = data['data']['publicSearchFaceted']['results']
    for charity in page_results:
        record = {
            key: charity[key]
            for key in ('name', 'mission', 'cause', 'city', 'country')
        }
        record['organization_url'] = fixURL(charity.get('organization_url'))
        charity_navigator_json.append(record)


# Write the flattened records to their own JSON file
with open('charity_navigator.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


In [None]:
import pycountry
import pycountry_convert as pc
import json

def country_to_continent(country_name):
    """Map a country name to the name of its continent.

    Args:
        country_name: Country identifier passed to
            ``pycountry.countries.lookup``. Leading and trailing whitespace
            is ignored.

    Returns:
        str: The continent name, or ``"Unknown country"`` when the value is
        missing, blank, not a string, or cannot be resolved.
    """
    # Records may carry no country at all; answer directly instead of relying
    # on how the lookup library treats None or empty input.
    if not isinstance(country_name, str) or not country_name.strip():
        return "Unknown country"

    try:
        # Get country alpha-2 code
        country_code = pycountry.countries.lookup(country_name.strip()).alpha_2
        # Map to continent code
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        # Convert to continent name
        return pc.convert_continent_code_to_continent_name(continent_code)
    except LookupError:
        # LookupError also covers KeyError (its subclass), which is presumably
        # what the continent mapping raises for unmapped codes - TODO confirm
        return "Unknown country"

# Load the organization records from scraping/data/charity_navigator_logos.json
with open('scraping/data/charity_navigator_logos.json', 'r') as file:
    json_data = json.load(file)


# Give every organization a 'continent' field derived from its 'country'
for organization in json_data:
    continent = country_to_continent(organization["country"])
    organization["continent"] = continent

# Save the updated data to a JSON file
# NOTE(review): the input is read from scraping/data/ but the output is
# written to the current working directory - confirm this is intended.
with open('charity_navigator_logos.json', 'w') as f:
    json.dump(json_data, f, indent=2)
    

In [12]:
# Load the charity records from scraping/data/charity_navigator.json
with open('scraping/data/charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)

# Fetch a logo for every loaded charity. No slice is applied, so the whole
# list is processed; get_logos fills in 'logoUrl' in place and returns the
# same list.
charity_navigator_json = get_logos(charity_navigator_json)

# Write the records, now including 'logoUrl', back to the same file
with open('scraping/data/charity_navigator.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


Processing Mercy Ships International , index 0
Processing International Rescue Committee, index 1
Processing Doctors Without Borders, USA, index 2
Processing International Relief Teams, index 3
Processing World Central Kitchen Incorporated, index 4
Processing UNICEF USA, index 5
SVG logo detected.
Processing Global Refuge, index 6
SVG logo detected.
Processing National Audubon Society, index 7
Processing National Council of YMCAs of the USA, index 8
SVG logo detected.
Processing MAP International, index 9
Processing Amnesty International USA, index 10
Error fetching logo from https://www.amnestyusa.org: 403 Client Error: Forbidden for url: https://www.amnestyusa.org/
Processing Conservation International, index 11
Processing World Resources Institute, index 12
Processing National Center for Missing & Exploited Children, index 13
Processing Goodwill Industries International Inc., index 14
Processing Heart to Heart International, index 15
Error fetching logo from https://www.hearttoheart