## Scraping CHARITY NAVIGATOR

In [30]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
import re

# Load environment variables
load_dotenv()

# Helper function to validate URLs
def is_valid_url(url):
    """Return True when ``url`` has both a scheme and a network location.

    Args:
        url: Candidate URL string.

    Returns:
        bool: True if parsing yields a non-empty scheme and netloc; False
        otherwise, including when the string cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. an unbalanced
        # '[' in the host); a validator should report that as "not valid"
        # instead of propagating the exception.
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

# Fix and normalize URLs
def fixURL(url):
    """Normalize a charity website URL so it can be requested directly.

    * ``None``, empty and whitespace-only input yield ``None``.
    * A missing ``http://`` / ``https://`` prefix is replaced by ``https://``.
    * Only the scheme and host part is lowercased; the path, query and
      fragment keep their original case because they may be case-sensitive
      on the server.

    Args:
        url: Raw URL string (possibly without a scheme) or None.

    Returns:
        The normalized URL string, or None when there is nothing to normalize.
    """
    if url is None:
        return None
    url = url.strip()
    if not url:
        return None

    # Require the full "http(s)://" prefix: a bare startswith('http') would
    # also accept host names such as "httpbin.org" and leave them scheme-less.
    if not re.match(r'https?://', url, re.IGNORECASE):
        url = 'https://' + url

    # Split "scheme://authority" from everything after it; the pattern always
    # matches here because the prefix was just ensured above.
    match = re.match(r'(https?://[^/?#]*)(.*)', url, re.IGNORECASE | re.DOTALL)
    return match.group(1).lower() + match.group(2)

def get_logo(url, timeout=10):
    """Download ``url`` and return the most likely logo image URL on the page.

    Candidates are gathered from three places:

    1. Open Graph / Twitter card ``<meta>`` image tags (priority 1).
    2. ``<link>`` tags whose ``rel`` contains ``icon`` and whose href does not
       match the favicon/UI-icon exclusion pattern (priority 1).
    3. ``<img>`` tags accepted by the keyword/size heuristic (priority 2 when
       the class attribute mentions "logo" or "icon", otherwise 3).

    The winner is the candidate with the lowest priority number; ties are
    broken by the largest declared ``width * height`` and then by the order
    in which the candidates were found.

    Args:
        url: Page URL to inspect. Relative image URLs are resolved against it.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            single unresponsive site would block the whole crawl.

    Returns:
        The absolute URL of the chosen image, or None when no candidate was
        found or the page could not be fetched.
    """
    # Keywords to detect logos
    logo_keywords = ('logo', 'brand', 'header', 'main', 'icon')

    # Keywords to exclude irrelevant images
    exclude_keywords = ('menu', 'close', 'button', 'hamburger', 'arrow', 'banner', 'background')

    def is_valid_logo(src, tag, width, height):
        """Determine if the image source is a valid logo."""
        if not src:
            return False
        src_lower = src.lower()

        # Exclude based on keywords in the URL or filename
        if any(word in src_lower for word in exclude_keywords):
            return False

        # Include based on keywords in src, alt, class, or id
        alt = tag.get('alt', '').lower()
        tag_class = ' '.join(tag.get('class', [])).lower()
        tag_id = tag.get('id', '').lower()
        searchable = src_lower + alt + tag_class + tag_id
        if not any(word in searchable for word in logo_keywords):
            return False

        # Size checks only apply when both dimensions are declared (non-zero),
        # which also keeps the ratio divisions safe.
        if width and height:
            if width > 500 or height > 200:  # Too large, likely a banner
                return False
            if width / height > 5 or height / width > 5:  # Extreme aspect ratio
                return False
        return True

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.content
    except requests.RequestException as e:
        print(f"Error fetching logo from {url}: {e}")
        return None

    soup = BeautifulSoup(html, 'html.parser')
    # Each candidate is (absolute_url, width, height, priority).
    logo_candidates = []

    # Step 1: Meta tags (Open Graph and Twitter card)
    meta_tags = [
        {'property': 'og:image'}, {'name': 'og:image'},
        {'property': 'twitter:image'}, {'name': 'twitter:image'}
    ]
    for attrs in meta_tags:
        meta = soup.find('meta', attrs)
        if meta and meta.get('content'):
            logo_candidates.append((urljoin(url, meta['content']), 0, 0, 1))  # Priority: 1

    # Step 2: <link> tags (exclude favicons and irrelevant icons)
    for tag in soup.find_all('link', rel=True):
        rel = tag.get('rel', [])
        href = tag.get('href')
        if href and 'icon' in rel and not re.search(r'favicon|menu|close|button|icon|banner', href, re.IGNORECASE):
            logo_candidates.append((urljoin(url, href), 0, 0, 1))  # Priority: 1

    # Step 3: <img> tags (prioritize based on class, alt, id)
    for tag in soup.find_all('img'):
        src = tag.get('src')
        width = tag.get('width', '0').replace('px', '')
        height = tag.get('height', '0').replace('px', '')

        # Non-numeric or missing dimensions (e.g. "100%") count as unknown (0)
        width = int(width) if width.isdigit() else 0
        height = int(height) if height.isdigit() else 0

        if is_valid_logo(src, tag, width, height):
            class_text = str(tag.get('class', '')).lower()
            priority = 2 if 'logo' in class_text or 'icon' in class_text else 3
            logo_candidates.append((urljoin(url, src), width, height, priority))

    # Step 4: Sort candidates by priority, then size
    if logo_candidates:
        logo_candidates.sort(key=lambda c: (c[3], -(c[1] * c[2])))
        return logo_candidates[0][0]

    print("Logo not found on the page.")
    return None


# Process the list of charities and fetch logos
def get_logos(charities):
    """Attach a ``logoUrl`` entry to every charity dict, in place.

    For each charity a progress line with its name and position is printed.
    Charities with a truthy ``organization_url`` get the result of
    ``get_logo`` for that URL; all others get ``None``.

    Args:
        charities: Iterable of dicts, each with a ``name`` key and an
            optional ``organization_url`` key.

    Returns:
        The same ``charities`` object that was passed in.
    """
    for idx, charity in enumerate(charities):
        print(f"Processing {charity['name']}, index {idx}")
        site = charity.get('organization_url')
        charity['logoUrl'] = get_logo(site) if site else None
    return charities


In [None]:
## Fetch data from Charity Navigator API
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json

# Load environment variables from .env file
load_dotenv()

# URL that the Charity Navigator queries are POSTed to
CHARITY_NAVIGATOR_ENDPOINT = 'https://data.charitynavigator.org/'

# API token taken from the CHARITY_NAVIGATOR environment variable
# (None when the variable is not set)
charity_navigator_key = os.environ.get('CHARITY_NAVIGATOR')

# Fetch data from Charity Navigator
def fetch_charity_data(count, timeout=30):
    """Request one page of search results from the Charity Navigator API.

    Args:
        count: Integer substituted into the ``from:`` argument of the
            ``publicSearchFaceted`` query (formatted with ``%d``).
            NOTE(review): presumably the result offset for paging — confirm
            against the API documentation.
        timeout: Seconds passed to ``requests.post``. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    QUERY = """
    query {
        publicSearchFaceted(term: "", from: %d) {
            size
            from
            term
            result_count
            results {
                ein
                name
                mission
                organization_url
                charity_navigator_url
                encompass_score
                encompass_star_rating
                encompass_publication_date
                cause
                street
                street2
                city
                state
                zip
                country
                highest_level_advisory
                encompass_rating_id
            }
        }
    }
    """ % count

    headers = {
        "Stellate-Api-Token": charity_navigator_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.post(
        CHARITY_NAVIGATOR_ENDPOINT,
        headers=headers,
        json={"query": QUERY},
        timeout=timeout,
    )
    # Raise an error if the request fails
    response.raise_for_status()

    # Return the JSON response
    return response.json()

# Raw API responses, one entry per fetched page.
results = []

# The crawl itself is currently disabled. It was written as:
#
#     for i in range(0, 10001, 10):
#        data = fetch_charity_data(i)
#        print(data)
#        results.append(data)
#
# NOTE(review): with the loop disabled `results` is still empty at this
# point, so the write below replaces charity_navigator_data.json with "[]".
# Confirm this is intended before re-running the cell.

# Save the results to a JSON file
with open('charity_navigator_data.json', 'w') as f:
    json.dump(results, f, indent=2)

In [None]:
# Read the raw API responses saved by the fetch step
with open("charity_navigator_data.json", "r") as f:
    charity_navigator_data = json.load(f)

# Flatten every response page into one list of trimmed-down charity records.
# Only the fields below are kept; the website URL is normalized with fixURL.
charity_navigator_json = []
for data in charity_navigator_data:
    page_results = data['data']['publicSearchFaceted']['results']
    for charity in page_results:
        record = {
            field: charity[field]
            for field in ('name', 'mission', 'cause', 'city', 'country')
        }
        record['organization_url'] = fixURL(charity.get('organization_url'))
        charity_navigator_json.append(record)


# Write the flattened records to their own JSON file
with open('charity_navigator.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


In [None]:
import pycountry
import pycountry_convert as pc
import json

def country_to_continent(country_name):
    """Map a country name to the name of its continent.

    The name is resolved to an alpha-2 code with ``pycountry``, which is then
    converted to a continent code and finally a continent name with
    ``pycountry_convert``.

    Args:
        country_name: Country identifier accepted by
            ``pycountry.countries.lookup``.

    Returns:
        The continent name, or the string "Unknown country" when any of the
        lookups raises ``LookupError``.
    """
    try:
        alpha2 = pycountry.countries.lookup(country_name).alpha_2
        return pc.convert_continent_code_to_continent_name(
            pc.country_alpha2_to_continent_code(alpha2)
        )
    except LookupError:
        return "Unknown country"

# Read the charity records (with logos) from scraping/data/
# NOTE(review): the input is read from scraping/data/ but the output below is
# written to the current working directory — confirm the paths are intended.
with open('scraping/data/charity_navigator_logos.json', 'r') as file:
    json_data = json.load(file)

# Tag every organization with the continent derived from its 'country' field
for organization in json_data:
    organization.update(continent=country_to_continent(organization["country"]))

# Persist the annotated records as JSON
with open('charity_navigator_logos.json', 'w') as f:
    json.dump(json_data, f, indent=2)
    

In [31]:
# Trial run: fetch logos for only the first 10 charities in charity_navigator.json
with open('scraping/data/charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)

# The slice holds the same dict objects as the full list, so the logoUrl
# entries added by get_logos show up in charity_navigator_json as well.
first_ten = charity_navigator_json[:10]
get_logos(first_ten)

# Write the whole list (first 10 now carrying logoUrl) back to the same file
with open('scraping/data/charity_navigator.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)


Processing Mercy Ships International , index 0
Processing International Rescue Committee, index 1
Processing Doctors Without Borders, USA, index 2
Processing International Relief Teams, index 3
Processing World Central Kitchen Incorporated, index 4
Processing UNICEF USA, index 5
Logo not found on the page.
Processing Global Refuge, index 6
Processing National Audubon Society, index 7
Processing National Council of YMCAs of the USA, index 8
Processing MAP International, index 9


In [None]:
# Full run: read every charity record from charity_navigator.json
with open('charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)

# get_logos adds a logoUrl entry to each record in place
get_logos(charity_navigator_json)

# Store the records, now including logos, in a separate file
with open('charity_navigator_logos.json', 'w') as f:
    json.dump(charity_navigator_json, f, indent=2)