## Scraping bank4 Donor IDs

In [26]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time
import re
import random
import os
import json
import logging

# Directory that receives both the donor-ID list and the scraping log
# (scrape_donor_ids creates it if it does not exist).
# NOTE(review): absolute, machine-specific path — must be adjusted before
# running on another machine.
OUTPUT_DIR = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank4"
# Name of the file inside OUTPUT_DIR that holds the donor IDs, one per line.
OUTPUT_FILENAME = "bank4_donor_ids.txt"

def load_config():
    """Load the scraper configuration from ``config.json``.

    The file is looked up relative to the current working directory.

    Returns:
        The parsed JSON document, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would mis-decode non-ASCII values on some systems.
    with open('config.json', 'r', encoding='utf-8') as f:
        return json.load(f)

def setup_driver(user_agent):
    """Build a Chrome WebDriver that presents the supplied user agent.

    GPU use, extensions and notifications are disabled through Chrome
    command-line flags, and the page-load timeout is set to 30.

    Args:
        user_agent: Value passed to Chrome's ``user-agent`` flag.

    Returns:
        The configured Chrome WebDriver instance.

    Raises:
        Exception: Whatever was raised while creating or configuring the
            driver; it is logged and then re-raised unchanged.
    """
    try:
        chrome_flags = (
            '--disable-gpu',
            '--disable-extensions',
            '--disable-notifications',
            f'user-agent={user_agent}',
        )
        chrome_options = webdriver.ChromeOptions()
        for flag in chrome_flags:
            chrome_options.add_argument(flag)

        browser = webdriver.Chrome(options=chrome_options)
        browser.set_page_load_timeout(30)
    except Exception as e:
        logging.error(f"Error setting up WebDriver: {str(e)}")
        raise
    return browser

def random_delay(min_seconds=3, max_seconds=10):
    """Pick a random duration within ``[min_seconds, max_seconds]``.

    This only computes the number; it does not sleep. Callers pass the
    result to ``time.sleep``.

    Args:
        min_seconds: Lower bound of the range.
        max_seconds: Upper bound of the range.

    Returns:
        A float drawn with ``random.uniform`` from the given range.
    """
    delay = random.uniform(min_seconds, max_seconds)
    return delay

def click_search_button(driver):
    """Submit the search form and wait until the results container exists.

    Steps: wait for the search form, wait for the "Search" submit input to
    become clickable, scroll it into view, click it, then wait for the
    ``.tabs-body-01`` element and pause for a random 3-6 seconds.

    Args:
        driver: WebDriver that has already loaded the search page.

    Returns:
        True when every step succeeded; False if any step raised (the error
        is logged, never propagated).
    """
    try:
        logging.info("Waiting for page elements to load...")
        page_wait = WebDriverWait(driver, 15)

        form_locator = (
            By.CSS_SELECTOR,
            "form#ctl00_ctl00_content_content_ctl00_primarySearch_searchForm",
        )
        button_locator = (By.CSS_SELECTOR, "input[type='submit'][value='Search']")
        results_locator = (By.CSS_SELECTOR, ".tabs-body-01")

        # The form must be in the DOM before looking for its submit button.
        page_wait.until(EC.presence_of_element_located(form_locator))

        logging.info("Waiting for search button...")
        submit_input = page_wait.until(EC.element_to_be_clickable(button_locator))

        # Bring the button into the viewport and give the page a moment to
        # settle before clicking.
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_input)
        time.sleep(1)

        logging.info("Clicking search button...")
        submit_input.click()

        # The results container appearing signals that the search went through.
        page_wait.until(EC.presence_of_element_located(results_locator))

        pause = random_delay(3, 6)
        time.sleep(pause)
    except Exception as e:
        logging.error(f"Error with search button: {str(e)}")
        return False
    return True

def get_donor_ids_from_page(driver):
    """Collect the donor IDs linked from the currently displayed results page.

    Waits (up to 15 per wait) for the ``.tabs-body-01`` container and for the
    donor-profile links, then pulls the digits following ``number=`` out of
    each link's ``href``.

    Args:
        driver: WebDriver currently showing a page of search results.

    Returns:
        A list of unique donor-ID strings in no particular order. An empty
        list is returned when the links cannot be located; that error is
        logged rather than raised.
    """
    try:
        logging.info("Waiting for donor links...")
        wait = WebDriverWait(driver, 15)

        # Wait for content to load
        wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".tabs-body-01")
            )
        )

        # Get donor links
        donor_links = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "a[href*='donorprofile.aspx?number=']")
            )
        )

        donor_ids = set()
        for link in donor_links:
            try:
                href = link.get_attribute('href')
            except Exception as e:
                # Best effort: one unreadable link (e.g. an element that went
                # stale) must not discard the rest of the page, but it should
                # leave a trace in the log instead of vanishing silently.
                logging.warning(f"Skipping unreadable donor link: {str(e)}")
                continue

            # get_attribute may return None; skip explicitly rather than
            # relying on re.search raising a TypeError.
            if not href:
                continue

            match = re.search(r'number=(\d+)', href)
            if match:
                donor_ids.add(match.group(1))

        logging.info(f"Found {len(donor_ids)} unique donors on this page")
        return list(donor_ids)
    except Exception as e:
        logging.error(f"Error finding donor links: {str(e)}")
        return []

def click_next_page(driver):
    """Advance the result list by clicking a pagination link.

    Looks inside the ``.navigation-bottom`` element for the first anchor whose
    ``onclick`` attribute contains ``getPage`` and clicks it, then pauses for a
    random 2-4 seconds.
    NOTE(review): presumably that first anchor is the "next page" control —
    confirm against the page markup.

    Args:
        driver: WebDriver currently showing a page of search results.

    Returns:
        True if the link was found and clicked; False if anything raised
        (the error is logged, never propagated).
    """
    try:
        pager = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".navigation-bottom"))
        )

        paging_link = pager.find_element(By.CSS_SELECTOR, "a[onclick*='getPage']")

        # Scroll the link into the viewport and let the page settle before
        # clicking it.
        driver.execute_script("arguments[0].scrollIntoView(true);", paging_link)
        time.sleep(1)
        paging_link.click()

        # Fixed-length pause to give the page time to update.
        pause = random_delay(2, 4)
        time.sleep(pause)
    except Exception as e:
        logging.error(f"Error clicking next page: {str(e)}")
        return False
    return True

def scrape_donor_ids(bank_code='bank4'):
    """Scrape the donor IDs of one bank and save them to a text file.

    Configuration is read via ``load_config()``. The keys used are
    ``config['banks'][bank_code]`` (with ``base_url``, ``search_url_pattern``
    and an optional ``max_pages``, default 40) and ``config['user_agents']``,
    from which one entry is picked at random.

    Side effects:
        * Creates ``OUTPUT_DIR`` if needed.
        * Configures root logging to ``<OUTPUT_DIR>/<bank_code>_scraping.log``
          and to the console.
        * Writes the unique IDs, one per line, to
          ``<OUTPUT_DIR>/<OUTPUT_FILENAME>``.

    Args:
        bank_code: Key of the bank inside ``config['banks']``.

    Returns:
        The unique donor IDs as strings, sorted lexicographically. An empty
        list is returned, and no output file is written, when the driver
        cannot be created or the initial search cannot be started.

    Raises:
        KeyError: If ``bank_code`` or a required config key is missing.
    """
    config = load_config()
    bank_config = config['banks'][bank_code]

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_file = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

    logging.basicConfig(
        level=logging.INFO,
        format='%(message)s',
        handlers=[
            logging.FileHandler(os.path.join(OUTPUT_DIR, f"{bank_code}_scraping.log")),
            logging.StreamHandler()
        ]
    )

    user_agent = random.choice(config['user_agents'])

    try:
        driver = setup_driver(user_agent)
    except Exception:
        logging.error("Failed to setup driver")
        return []

    all_donor_ids = []
    current_page = 1
    max_pages = bank_config.get('max_pages', 40)

    try:
        search_url = bank_config['base_url'].rstrip('/') + bank_config['search_url_pattern']
        logging.info(f"Starting page {current_page} of {max_pages}")
        driver.get(search_url)
        time.sleep(random_delay(5, 8))

        if not click_search_button(driver):
            logging.error("Failed to initiate search")
            return []

        while current_page <= max_pages:
            logging.info(f"Processing page {current_page}")

            # An empty result for one page is tolerated; keep paging.
            all_donor_ids.extend(get_donor_ids_from_page(driver))

            # Only try to advance when another page is still wanted.
            if current_page < max_pages:
                if not click_next_page(driver):
                    logging.info("Reached last page")
                    break

            current_page += 1

    except Exception as e:
        # Keep whatever was collected so far; it is still saved below.
        logging.error(f"An error occurred during scraping: {str(e)}")
    finally:
        # NOTE(review): purpose of this fixed pause before closing is not
        # evident from the code — kept as in the original.
        time.sleep(5)
        try:
            driver.quit()
        except Exception as e:
            # A browser that already died must not prevent the IDs collected
            # so far from being written to disk.
            logging.error(f"Error closing WebDriver: {str(e)}")

    # Save results to txt file
    unique_donor_ids = sorted(set(all_donor_ids))
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{donor_id}\n" for donor_id in unique_donor_ids)

    logging.info(f"Total unique donors collected: {len(unique_donor_ids)}")
    logging.info(f"Results saved to: {os.path.basename(output_file)}")

    return unique_donor_ids

if __name__ == "__main__":
    # Entry point when the code is executed directly rather than imported:
    # run the scrape for the bank4 configuration entry.
    donor_ids = scrape_donor_ids(bank_code='bank4')

Starting page 1 of 40
Waiting for page elements to load...
Waiting for search button...
Clicking search button...
Processing page 1
Waiting for donor links...
Found 10 unique donors on this page
Processing page 2
Waiting for donor links...
Found 10 unique donors on this page
Processing page 3
Waiting for donor links...
Found 10 unique donors on this page
Processing page 4
Waiting for donor links...
Found 10 unique donors on this page
Processing page 5
Waiting for donor links...
Found 10 unique donors on this page
Processing page 6
Waiting for donor links...
Found 10 unique donors on this page
Processing page 7
Waiting for donor links...
Found 10 unique donors on this page
Processing page 8
Waiting for donor links...
Found 10 unique donors on this page
Processing page 9
Waiting for donor links...
Found 10 unique donors on this page
Processing page 10
Waiting for donor links...
Found 10 unique donors on this page
Processing page 11
Waiting for donor links...
Found 10 unique donors on thi