## 1. Import Modules

In [13]:
# Import cell remains the same
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import json
import csv
import time
import random
from login_bank3 import setup_driver, login, cleanup_driver
from config import load_config
import os
import logging
from log_utils import setup_anonymized_logging

## 2. Configure Login

In [14]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

## 3. Define Helper Functions

In [15]:
def get_profile_links(driver, page_number):
    """
    Extract profile links from a page using Selenium.
    """
    try:
        # Construct and navigate to page URL
        bank_info = config['banks']['bank3']
        page_url = f"{bank_info['base_url']}/donor-search/"
        if page_number > 1:
            page_url += f"?_page={page_number}"
        
        print(f"Navigating to page {page_number}...")
        driver.get(page_url)
        time.sleep(5)  # Give more time for the page to load
        
        # Save screenshot for debugging
        driver.save_screenshot(f'donor_page_{page_number}.png')
        
        # Try multiple methods to find donor links
        profile_links = []
        
        # Method 1: Look for links containing donor IDs
        links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/donor/']")
        if not links:
            # Method 2: Try looking for thumbnail containers
            links = driver.find_elements(By.CSS_SELECTOR, ".ct-div-block.space-bottom-2x.shadow-sm.shadow-hover")
        
        for link in links:
            try:
                href = link.get_attribute('href')
                if href and 'donor' in href:
                    # Clean up the URL
                    clean_href = href.replace(bank_info['base_url'], '').strip('/')
                    if clean_href not in profile_links:
                        profile_links.append(clean_href)
            except:
                continue
        
        print(f"Found {len(profile_links)} profile links on page {page_number}")
        for link in profile_links:
            print(f"Found profile link: {link}")
        
        time.sleep(random.uniform(1, 2))
        return profile_links
        
    except Exception as e:
        print(f"Error getting profile links from page {page_number}: {str(e)}")
        driver.save_screenshot(f'error_page_{page_number}.png')
        return []

def scrape_profile(driver, relative_url):
    """
    Scrape a single profile using Selenium with more precise selectors
    """
    try:
        bank_info = config['banks']['bank3']
        full_url = f"{bank_info['base_url']}/{relative_url}"
        print(f"Scraping profile: {relative_url}")
        
        driver.get(full_url)
        time.sleep(6)  # Let the page load
        
        def get_field_value(field_name):
            """Helper function to extract field values using precise selectors"""
            try:
                print(f"\nLooking for field: {field_name}")
                
                # First attempt - look for the specific ct-text-block body-text structure
                css_selector = "div.ct-text-block.body-text span.ct-span"
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, css_selector)
                    # Look for element that has a preceding sibling containing our field name
                    for element in elements:
                        previous = element.find_element(By.XPATH, "./parent::div/preceding-sibling::div[contains(@class, 'bold-text')]")
                        if field_name in previous.text:
                            value = element.text.strip()
                            print(f"Found value with CSS: {value}")
                            return value
                except Exception as e:
                    print(f"Error with CSS attempt: {str(e)}")
                
                # Second attempt - direct xpath to span
                xpath = (
                    f"//div[contains(@class, 'ct-text-block') and "
                    f"contains(@class, 'bold-text') and "
                    f"contains(text(), '{field_name}')]/"
                    f"following-sibling::div[contains(@class, 'body-text')]//span[contains(@class, 'ct-span')]"
                )
                print(f"Trying xpath: {xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with XPath attempt: {str(e)}")
                
                # Third attempt - look for any span within new_columns structure
                columns_xpath = (
                    f"//div[contains(@id, 'new_columns')]"
                    f"//div[contains(text(), '{field_name}')]/"
                    f"following-sibling::div//span[contains(@class, 'ct-span')]"
                )
                print(f"Trying columns xpath: {columns_xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, columns_xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with columns XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with columns XPath attempt: {str(e)}")
                
                # Fourth attempt - try finding text directly following the field name
                text_xpath = (
                    f"//div[contains(text(), '{field_name}:')]/"
                    f"following-sibling::div[contains(@class, 'ct-text-block')]"
                )
                print(f"Trying text xpath: {text_xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, text_xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with text XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with text XPath attempt: {str(e)}")
                
                # Special cases
                if field_name == 'Donor ID':
                    try:
                        donor_id_elem = driver.find_element(By.XPATH, "//span[@id='span-6-12730']")
                        return donor_id_elem.text.strip()
                    except:
                        pass
                
                print(f"No value found for {field_name}")
                return "Not found"
                
            except Exception as e:
                print(f"Error getting {field_name}: {str(e)}")
                return "Not found"
        
        # Dictionary of fields to extract with exact labels
        fields = {
            'donor_alias': 'Donor Alias',
            'donor_id': 'Donor ID',
            'height': 'Height',
            'weight': 'Weight',
            'race': 'Race',
        }
        
        # Extract all fields
        profile_data = {}
        
        for field_key, field_label in fields.items():
            value = get_field_value(field_label)
            if field_key == 'weight' and value != "Not found":
                value = value.replace(' lbs.', '').strip()
            profile_data[field_key] = value
            print(f"Extracted {field_key}: {value}")
        
        # Get staff impression
        try:
            staff_impression = driver.find_element(
                By.XPATH,
                "//div[contains(@class, 'ct-text-block') and contains(@class, 'body-text')]//span//p"
            ).text.strip()
        except:
            staff_impression = "Not found"
                
        profile_data['staff_impression'] = staff_impression
        
        time.sleep(random.uniform(1, 2))
        return profile_data
        
    except Exception as e:
        print(f"Error scraping profile: {str(e)}")
        driver.save_screenshot(f'error_profile_{random.randint(1000,9999)}.png')
        return None

## 4. Testing Page 1

In [12]:
# Configure basic logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def test_page_one():
    """
    Test function to scrape only page 1 profiles using Selenium.
    Includes detailed logging and error handling.
    """
    # Initialize logging
    logger = logging.getLogger(__name__)
    
    driver = None
    try:
        # Load configuration
        global config
        config = load_config()
        
        print("Starting test execution...")
        
        # Update output directory
        config['output_directory'] = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank3'
        
        # Create output directory if it doesn't exist
        os.makedirs(config['output_directory'], exist_ok=True)
        
        print("Config loaded, setting up WebDriver...")
        
        # Setup WebDriver and login
        driver = setup_driver()
        
        print("Attempting login...")
        if not login(driver, config):
            print("Login failed!")
            return
        
        print("Login successful, starting profile scrape...")
        
        # Test scraping page 1 only
        test_profiles = []
        
        # Get profile links from page 1
        profile_links = get_profile_links(driver, 1)
        print(f"Found {len(profile_links)} profiles on page 1")
        
        # Limit to first 3 profiles for testing
        test_links = profile_links[:3]
        print(f"Testing with first {len(test_links)} profiles")
        
        # Scrape each profile from page 1
        for relative_url in test_links:
            print(f"Scraping profile: {relative_url}")
            profile_data = scrape_profile(driver, relative_url)
            if profile_data:
                test_profiles.append(profile_data)
                print(f"Successfully scraped profile {profile_data.get('donor_id', 'unknown')}")
            else:
                print(f"Failed to scrape profile: {relative_url}")
            
            time.sleep(random.uniform(2, 3))
            
        # Save test results
        if test_profiles:
            try:
                # Save as JSON
                timestamp = time.strftime("%Y%m%d_%H%M%S")
                json_filename = os.path.join(config['output_directory'], f'test_page1_{timestamp}.json')
                with open(json_filename, 'w', encoding='utf-8') as f:
                    json.dump(test_profiles, f, indent=2, ensure_ascii=False)
                print(f"Test data saved to {json_filename}")
                
                # Print first profile for verification
                print("\nFirst profile data for verification:")
                print(json.dumps(test_profiles[0], indent=2))
                
            except Exception as e:
                print(f"Error saving test data: {str(e)}")
                raise
        else:
            print("No profiles were collected during the test")
            
    except Exception as e:
        print(f"Error during test: {str(e)}")
        raise
        
    finally:
        if driver:
            print("Cleaning up WebDriver...")
            cleanup_driver(driver)
        print("Test completed.")

# Run the test if this file is run directly
if __name__ == "__main__":
    test_page_one()

Starting test execution...
Config loaded, setting up WebDriver...


2024-11-01 23:10:44 - INFO - Navigating to main site...


Attempting login...


2024-11-01 23:10:50 - INFO - Saved screenshot: debug_main_page.png
2024-11-01 23:10:50 - INFO - Looking for login button...
2024-11-01 23:10:50 - INFO - Found login button, clicking...
2024-11-01 23:10:54 - INFO - Saved screenshot: debug_login_page.png
2024-11-01 23:10:54 - INFO - Looking for login form fields...
2024-11-01 23:10:55 - INFO - Entering credentials...
2024-11-01 23:11:16 - INFO - Clicking submit button...
2024-11-01 23:11:22 - INFO - Saved screenshot: debug_after_submit.png
2024-11-01 23:11:22 - INFO - Login successful!


Login successful, starting profile scrape...
Navigating to page 1...
Found 8 profile links on page 1
Found profile link: donor/162668
Found profile link: donor/10128
Found profile link: donor/10126
Found profile link: donor/10125
Found profile link: donor/10124
Found profile link: donor/10123
Found profile link: donor/10122
Found profile link: donor/10119
Found 8 profiles on page 1
Testing with first 3 profiles
Scraping profile: donor/162668
Scraping profile: donor/162668

Looking for field: Donor Alias
Error with CSS attempt: Message: no such element: Unable to locate element: {"method":"xpath","selector":"./parent::div/preceding-sibling::div[contains(@class, 'bold-text')]"}
  (Session info: chrome=130.0.6723.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104607648 cxxbridge1$str$ptr + 3645404
1   chromedriver              

## 5. Scrape and Save

In [17]:
def main():
    """
    Main function to handle the scraping process with conservative timing for overnight running
    """
    driver = None
    try:
        # Load configuration
        global config
        config = load_config()
        
        # Update output directory
        config['output_directory'] = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank3'
        
        # Create output directory if it doesn't exist
        os.makedirs(config['output_directory'], exist_ok=True)
        
        print("Starting scraper...")
        
        # Setup WebDriver and login
        driver = setup_driver()
        
        print("Attempting login...")
        if not login(driver, config):
            print("Login failed")
            return
            
        print("Successfully logged in")
        
        # Initialize list for all profiles
        all_profiles = []
        
        # Scrape each page
        max_pages = config['banks']['bank3'].get('max_pages', 1)
        for page in range(1, max_pages + 1):
            print(f"\n=== Scraping page {page} of {max_pages} ===")
            
            # Get profile links from current page
            profile_links = get_profile_links(driver, page)
            print(f"Found {len(profile_links)} profiles on page {page}")
            
            # Scrape each profile
            for idx, relative_url in enumerate(profile_links, 1):
                print(f"\nScraping profile {idx}/{len(profile_links)} on page {page}: {relative_url}")
                profile_data = scrape_profile(driver, relative_url)
                
                if profile_data:
                    all_profiles.append(profile_data)
                    print(f"Successfully scraped profile {profile_data.get('donor_id', 'unknown')}")
                
                # More variable wait between profiles (4-8 seconds)
                wait_time = random.uniform(4, 8)
                print(f"Waiting {wait_time:.1f} seconds before next profile...")
                time.sleep(wait_time)
            
            # Save progress after each page
            if all_profiles:
                save_data(all_profiles, config['output_directory'], f'profiles_bank3_page_{page}')
            
            # More variable wait between pages (8-15 seconds)
            if page < max_pages:  # Don't wait after the last page
                wait_time = random.uniform(8, 15)
                print(f"\nWaiting {wait_time:.1f} seconds before next page...")
                time.sleep(wait_time)
            
            # Additional periodic longer pause every 3 pages
            if page % 3 == 0 and page < max_pages:
                wait_time = random.uniform(20, 30)
                print(f"\nTaking a longer break after page {page}... ({wait_time:.1f} seconds)")
                time.sleep(wait_time)
            
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        raise
        
    finally:
        if driver:
            cleanup_driver(driver)
        print("Scraping completed.")

def save_data(profiles, output_dir, filename_prefix):
    """
    Save scraped data to both JSON and CSV formats
    """
    try:
        # Add timestamp to filename to prevent overwrites
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        
        # Save as JSON
        json_filename = os.path.join(output_dir, f'{filename_prefix}_{timestamp}.json')
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(profiles, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {json_filename}")
        
        # Save as CSV
        csv_filename = os.path.join(output_dir, f'{filename_prefix}_{timestamp}.csv')
        if profiles:
            with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=profiles[0].keys())
                writer.writeheader()
                writer.writerows(profiles)
            print(f"Data saved to {csv_filename}")
            
    except Exception as e:
        print(f"Error saving data: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Starting scraper...


2024-11-01 23:27:30 - INFO - Navigating to main site...


Attempting login...


2024-11-01 23:27:36 - INFO - Saved screenshot: debug_main_page.png
2024-11-01 23:27:36 - INFO - Looking for login button...
2024-11-01 23:27:36 - INFO - Found login button, clicking...
2024-11-01 23:27:41 - INFO - Saved screenshot: debug_login_page.png
2024-11-01 23:27:41 - INFO - Looking for login form fields...
2024-11-01 23:27:41 - INFO - Entering credentials...
2024-11-01 23:28:02 - INFO - Clicking submit button...
2024-11-01 23:28:08 - INFO - Saved screenshot: debug_after_submit.png
2024-11-01 23:28:08 - INFO - Login successful!


Successfully logged in

=== Scraping page 1 of 9 ===
Navigating to page 1...
Found 8 profile links on page 1
Found profile link: donor/162668
Found profile link: donor/10128
Found profile link: donor/10126
Found profile link: donor/10125
Found profile link: donor/10124
Found profile link: donor/10123
Found profile link: donor/10122
Found profile link: donor/10119
Found 8 profiles on page 1

Scraping profile 1/8 on page 1: donor/162668
Scraping profile: donor/162668

Looking for field: Donor Alias
Error with CSS attempt: Message: no such element: Unable to locate element: {"method":"xpath","selector":"./parent::div/preceding-sibling::div[contains(@class, 'bold-text')]"}
  (Session info: chrome=130.0.6723.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001014df648 cxxbridge1$str$ptr + 3645404
1   chromedriver                     