### What to do
Scrape the websites in order to extract the following information:
- Name 
- Logo
- Location 
- Themes (e.g. children, homelessness, medicine...)
- Description / mission
- URL of their website
- Year of foundation (this information is sometimes implicit: it can be computed from the number of years of activity)

### What to use
- Requests
- BeautifulSoup
- Scrapy

### Websites to scrape
urls = [
    "https://www.charitynavigator.org",
    "https://www.globalgiving.org",
    "https://www.guidestar.org",
]

### Charity Navigator API
https://charity-navigator.stellate.io

In [None]:
## Fetch data from Charity Navigator API

import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json

# Load environment variables from .env file
load_dotenv()

# Access the variables
charity_navigator_key = os.getenv('CHARITY_NAVIGATOR')

CHARITY_NAVIGATOR_ENDPOINT = 'https://data.charitynavigator.org/'

# Fetch data from Charity Navigator
def fetch_charity_data(count, timeout=30):
    """Fetch one page of search results from the Charity Navigator GraphQL API.

    Args:
        count: Result offset sent as the ``from`` argument of
            ``publicSearchFaceted``. Despite its name this is an offset into
            the result list, not a number of records; it is interpolated with
            ``%d`` so it must be an integer.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a single stalled connection would block the
            whole crawl forever.

    Returns:
        The decoded JSON body of the response. No ``size`` argument is sent,
        so the page length is whatever the API defaults to (the caller steps
        the offset by 10 -- presumably the default page size; confirm against
        the API documentation).

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status code.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
    """
    QUERY = """
    query {
        publicSearchFaceted(term: "", from: %d) {
            size
            from
            term
            result_count
            results {
                ein
                name
                mission
                organization_url
                charity_navigator_url
                encompass_score
                encompass_star_rating
                encompass_publication_date
                cause
                street
                street2
                city
                state
                zip
                country
                highest_level_advisory
                encompass_rating_id
            }
        }
    }
    """ % count

    headers = {
        # API key loaded from the environment (.env) at module level.
        "Stellate-Api-Token": charity_navigator_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.post(
        CHARITY_NAVIGATOR_ENDPOINT,
        headers=headers,
        json={"query": QUERY},
        timeout=timeout,
    )
    # Raise an error if the request fails
    response.raise_for_status()

    # Return the JSON response
    return response.json()

results = []

# Walk the result offsets 0, 10, 20, ..., 10000 and keep every raw page.
for offset in range(0, 10001, 10):
    page = fetch_charity_data(offset)
    print(page)
    results.append(page)

# Persist all collected pages as one pretty-printed JSON array.
with open('charity_navigator_data.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

In [None]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Helper function to validate URLs
def is_valid_url(url):
    """Return True when ``url`` has both a scheme and a network location.

    Args:
        url: The URL string to check. ``None`` and empty values are treated
            as invalid.

    Returns:
        bool: True if ``urlparse`` finds a non-empty scheme and netloc,
        False otherwise -- including when the string is so malformed that
        ``urlparse`` itself rejects it.
    """
    if not url:
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on some malformed inputs (e.g. an unbalanced '['
        # in the host); such a URL is simply not valid.
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

# Fix and normalize URLs
def fixURL(url):
    """Normalize a charity website address into an absolute http(s) URL.

    Surrounding whitespace is removed, ``https://`` is prepended when the
    value has no ``http://`` / ``https://`` prefix, and the scheme and host
    are lower-cased. The path, query and fragment keep their original case
    because they can be case-sensitive on the server.

    Args:
        url: Raw URL string, or ``None``.

    Returns:
        The normalized URL, or ``None`` when ``url`` is ``None`` or blank.
    """
    if url is None:
        return None
    url = url.strip()
    if not url:
        # A blank value would otherwise turn into the meaningless "https://".
        return None
    # Require the full "scheme://" prefix: a bare startswith('http') would
    # wrongly accept hosts such as "httpbin.org" as already having a scheme.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    try:
        parts = urlparse(url)
    except ValueError:
        # Malformed beyond repair; hand it back for the validity check to reject.
        return url
    return parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
    ).geturl()

# ************************************** CHARITY NAVIGATOR CHARITIES ************************************** #

# Load the raw API pages written by the fetch cell: a JSON array with one
# GraphQL response object per request.
with open("charity_navigator_data.json", "r") as f:
    charity_navigator_data = json.load(f)

# Flatten the pages into one list of simplified charity records.
charity_navigator_json = []
for data in charity_navigator_data:
    # The fetch cell stores every response unchecked, so a page may be a
    # GraphQL error payload whose "data" entry is null or missing. Treat such
    # a page as empty instead of crashing on the nested lookup.
    search = ((data or {}).get('data') or {}).get('publicSearchFaceted') or {}
    for charity in search.get('results') or []:
        charity_navigator_json.append({
            'name': charity.get('name'),
            'mission': charity.get('mission'),
            'cause': charity.get('cause'),
            'city': charity.get('city'),
            'country': charity.get('country'),
            'organization_url': fixURL(charity.get('organization_url'))
        })


# Save the processed charity data to a new JSON file
with open('charity_navigator.json', 'w') as f:
    f.write(json.dumps(charity_navigator_json, indent=2))


In [None]:
# Function to fetch the logo URL from the organization page
def get_logo(url):
    """Fetch an organization's home page and return the URL of its logo image.

    The logo is taken to be the first ``<img>`` element carrying the CSS
    class ``logo``.

    Args:
        url: Absolute URL of the organization's website.

    Returns:
        The absolute URL of the logo image, or ``None`` when the URL is
        invalid, the request fails, no matching ``<img>`` exists, or the
        matching ``<img>`` has no ``src`` attribute.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import urljoin

    print(f"Fetching logo from {url}")
    if not is_valid_url(url):
        print(f"Invalid URL skipped: {url}")
        return None
    try:
        response = requests.get(url, timeout=10)  # Add timeout for safety
        response.raise_for_status()  # Raise HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching logo from {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    logo = soup.find('img', class_='logo')
    # .get() avoids a KeyError for an <img> without a src attribute
    # (e.g. lazy-loaded images that only carry data-src).
    src = logo.get('src') if logo else None
    if not src:
        return None
    # Resolve relative paths against the final (post-redirect) page URL so the
    # stored value is usable outside the context of the page.
    return urljoin(response.url, src)

# Process the list of charities and fetch logos
def get_logos(charities):
    """Attach a ``logoUrl`` entry to every charity record, in place.

    Records with a truthy ``organization_url`` get the result of
    ``get_logo`` for that URL; all others get ``None``.

    Args:
        charities: Iterable of charity dicts; each one is mutated.

    Returns:
        The same ``charities`` object that was passed in.
    """
    for entry in charities:
        site = entry.get('organization_url')
        entry['logoUrl'] = get_logo(site) if site else None
    return charities



""" # load charity_navigator.json
with open('charity_navigator.json', 'r') as f:
    charity_navigator_json = json.load(f)
    
# For each element in charity_navigator_json, fetch the logo
get_logos(charity_navigator_json)

with open('charity_navigator_logos.json', 'w') as f:
    f.write(json.dumps(charity_navigator_json, indent=2)) """

In [3]:
# ************************************** GLOBAL GIVING CHARITIES ************************************** #

import xml.etree.ElementTree as ET
import json


def _child_text(parent, tag):
    """Return the text of ``parent``'s first ``tag`` child, or None if it has none."""
    child = parent.find(tag)
    return child.text if child is not None else None


def _child_names(parent, container_tag, item_tag):
    """Return ``{"name": ...}`` dicts for every ``item_tag`` inside ``container_tag``."""
    container = parent.find(container_tag)
    if container is None:
        return []
    return [
        {"name": _child_text(item, "name")}
        for item in container.findall(item_tag)
    ]


# Read the XML as bytes so the parser decodes it according to the document's
# own encoding declaration instead of the platform's default text encoding.
with open('organizations.xml', 'rb') as file:
    xml_data = file.read()

# Parse the XML data
root = ET.fromstring(xml_data)

# Extract data from all organization elements. Every field tolerates a
# missing element (yielding None / "" / []), so one incomplete organization
# does not abort the whole export.
global_giving_json = []
for organization in root.findall("organization"):
    mission = _child_text(organization, "mission")
    org_data = {
        "name": _child_text(organization, "name"),
        "city": _child_text(organization, "city"),
        "country": _child_text(organization, "country"),
        "activeProjects": _child_text(organization, "activeProjects"),
        "totalProjects": _child_text(organization, "totalProjects"),
        "mission": mission.strip() if mission is not None else "",
        "organization_url": _child_text(organization, "url"),
        "logoUrl": _child_text(organization, "logoUrl"),
        "cause": _child_names(organization, "themes", "theme"),
        "countries_of_operation": _child_names(organization, "countries", "country"),
    }
    global_giving_json.append(org_data)

# Save the extracted data to a JSON file
with open('global_giving.json', 'w') as f:
    f.write(json.dumps(global_giving_json, indent=2))
    
# Display the extracted data
#pprint.pprint(organizations_data[:10])
