In [13]:
import json
import os
import random
import time
from typing import Dict, List, Optional

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def random_delay(min_seconds=3, max_seconds=10):
    """Pick a random wait time, in seconds, within the given bounds.

    Args:
        min_seconds: Lower bound of the interval (default 3).
        max_seconds: Upper bound of the interval (default 10).

    Returns:
        A float drawn uniformly between ``min_seconds`` and ``max_seconds``.
        The function only computes the value; it does not sleep.
    """
    lower, upper = min_seconds, max_seconds
    delay = random.uniform(lower, upper)
    return delay

def setup_driver():
    """Create a Chrome WebDriver configured for scraping.

    Chrome is started with GPU acceleration, extensions and notifications
    disabled.

    Returns:
        The started driver, with its page-load timeout set to 30 seconds.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-gpu', '--disable-extensions', '--disable-notifications'):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser

class DonorProfileScraper:
    """Scrape individual donor profile pages with Selenium.

    Every profile is fetched with its own short-lived Chrome driver (see
    ``scrape_single_donor``). The ``get_*`` methods are best-effort: a page
    section that cannot be read yields an empty result (and usually a message
    on stdout) instead of aborting the whole profile.
    """

    def __init__(self, output_dir: str):
        """Remember the output directory and the profile page base URL.

        Args:
            output_dir: Directory associated with this scraper. The class only
                stores the value; none of its methods write files.
        """
        self.output_dir = output_dir
        self.base_url = "https://fairfaxcryobank.com/search/donorprofile.aspx"

    def get_profile_url(self, donor_id: str) -> str:
        """Return the profile page URL for ``donor_id``."""
        return f"{self.base_url}?number={donor_id}"

    @staticmethod
    def _normalize_label(raw: str) -> str:
        """Turn a raw label such as ``"Eye Color: "`` into a dict key.

        Surrounding whitespace and colons are removed and the result is
        lower-cased. Returns ``""`` when nothing is left.
        """
        return raw.strip().strip(':').strip().lower()

    @staticmethod
    def _pair_from_spans(spans) -> Dict:
        """Extract one ``{label: value}`` entry from a list item's spans.

        The label is read from ``spans[1]`` and the value from ``spans[2]``;
        ``spans[0]`` is skipped (presumably a decorative span -- verify
        against the page markup).

        Args:
            spans: Sequence of objects exposing a ``text`` attribute.

        Returns:
            A one-entry dict, or an empty dict when there are fewer than
            three spans or when the label or value is empty.
        """
        # Index 2 is read below, so at least three spans are required.
        if len(spans) < 3:
            return {}
        label = DonorProfileScraper._normalize_label(spans[1].text)
        value = spans[2].text.strip()
        if label and value:
            return {label: value}
        return {}

    def scrape_single_donor(self, donor_id: str) -> Dict:
        """Scrape one donor profile using a fresh driver instance.

        Args:
            donor_id: Donor number used to build the profile URL.

        Returns:
            On success, a dict with ``donor_id`` and ``description`` plus every
            key collected by ``get_basic_info``, ``get_list_info`` and
            ``get_essay_responses`` (later sources overwrite earlier ones on
            key clashes). On failure, ``{"donor_id": ..., "error": ...}``.
            The driver is always shut down before returning.
        """
        driver = None
        try:
            # Created inside the try so that a browser start-up failure is
            # reported as an error record instead of escaping to the caller.
            driver = setup_driver()
            profile_url = self.get_profile_url(donor_id)
            print("\nAccessing profile")

            driver.get(profile_url)
            # Randomized 3-5 s pause after navigation before querying the page.
            time.sleep(random_delay(3, 5))

            print("Getting basic info...")
            basic_info = self.get_basic_info(driver)

            print("Getting donor description...")
            description = self.get_description(driver)

            print("Getting list info...")
            list_info = self.get_list_info(driver)

            print("Getting essay URL...")
            essays = self.get_essay_responses(driver)

            profile_data = {
                "donor_id": donor_id,
                "description": description,
                **basic_info,
                **list_info,
                **essays,  # Only contains essay_download_url, if found
            }

            print(f"Successfully scraped data for donor {donor_id}")
            return profile_data

        except Exception as e:
            print(f"Error scraping donor {donor_id}: {str(e)}")
            return {"donor_id": donor_id, "error": str(e)}
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    # Cleanup is best-effort: a failed quit must not replace
                    # the result that is already being returned.
                    print(f"Warning: could not close browser: {str(e)}")

    def get_basic_info(self, driver) -> Dict:
        """Collect label/value pairs from the element with class ``height``.

        Waits up to 10 seconds for the element, then walks its ``tr`` rows.
        Rows with fewer than two ``td`` cells are skipped. In every cell of
        the remaining rows, the ``title-block`` text is the label and the
        ``info`` text is the value.

        Returns:
            Dict mapping normalized (lower-cased) labels to values; empty when
            the table cannot be located.
        """
        basic_info = {}
        try:
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "height"))
            )
            rows = table.find_elements(By.TAG_NAME, "tr")
        except Exception as e:
            print(f"Error getting basic info: {str(e)}")
            return basic_info

        for row in rows:
            try:
                cells = row.find_elements(By.TAG_NAME, "td")
            except Exception:
                continue
            if len(cells) < 2:
                continue
            for cell in cells:
                try:
                    title = self._normalize_label(
                        cell.find_element(By.CLASS_NAME, "title-block").text
                    )
                    info = cell.find_element(By.CLASS_NAME, "info").text.strip()
                except Exception:
                    # Cells lacking either sub-element carry no label/value pair.
                    continue
                if title and info:
                    basic_info[title] = info
        return basic_info

    def get_description(self, driver) -> Optional[str]:
        """Return the text of the first ``div.text p`` element.

        Waits up to 10 seconds for the element to be present.

        Returns:
            The stripped paragraph text, or ``None`` when it cannot be read.
        """
        try:
            paragraph = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.text p"))
            )
            return paragraph.text.strip()
        except Exception as e:
            print(f"Error getting description: {str(e)}")
            return None

    def get_list_info(self, driver) -> Dict:
        """Collect label/value pairs from the ``ul.list-info`` items.

        Waits up to 10 seconds for the ``li`` elements, then reads each item's
        ``span`` children via ``_pair_from_spans``. Items that cannot be read
        or that hold fewer than three spans are skipped.

        Returns:
            Dict mapping normalized (lower-cased) labels to values; empty when
            no list items are found.
        """
        list_info = {}
        try:
            list_items = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul.list-info li"))
            )
        except Exception as e:
            print(f"Error getting list info: {str(e)}")
            return list_info

        for item in list_items:
            try:
                spans = item.find_elements(By.TAG_NAME, "span")
                list_info.update(self._pair_from_spans(spans))
            except Exception:
                continue
        return list_info

    def get_essay_responses(self, driver) -> Dict:
        """Look up the donor essay download link (URL only, no download).

        Returns:
            ``{"essay_download_url": href}`` when a ``download.aspx`` link
            follows a span containing "Donor Essay"; otherwise an empty dict.
        """
        essay_data = {}
        try:
            essay_link = driver.find_element(
                By.XPATH,
                "//span[contains(text(), 'Donor Essay')]/following-sibling::a[contains(@href, 'download.aspx')]"
            )
            href = essay_link.get_attribute('href')
        except Exception:
            # find_element raises when nothing matches; treat any lookup
            # failure as "no essay available".
            print("No essay download link found")
            return essay_data

        essay_data['essay_download_url'] = href
        print("Found essay download URL")
        return essay_data

def save_results(profiles: List[Dict], output_dir: str, test_suffix: str = ""):
    """Write the scraped profiles to a JSON file and a CSV file.

    Both files are overwritten on every call, so the function can be used
    for periodic checkpoints as well as for the final save.

    Args:
        profiles: Profile dicts to persist.
        output_dir: Directory receiving ``donor_profiles<suffix>.json`` and
            ``donor_profiles<suffix>.csv``. It is created if missing.
        test_suffix: Optional text inserted before the file extension.
    """
    # Create the target directory on demand. An empty string means the
    # current directory, which os.makedirs would reject.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Full data as JSON (non-ASCII characters are written unescaped)
    json_path = os.path.join(output_dir, f"donor_profiles{test_suffix}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(profiles, f, indent=2, ensure_ascii=False)

    # Tabular copy as CSV
    csv_path = os.path.join(output_dir, f"donor_profiles{test_suffix}.csv")
    pd.DataFrame(profiles).to_csv(csv_path, index=False, encoding='utf-8')

    print("\nSaved profiles to:")
    print(f"- {os.path.basename(csv_path)}")
    print(f"- {os.path.basename(json_path)}")

def scrape_all_donors(
    input_file: str = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank4/bank4_donor_ids.txt",
):
    """Scrape every donor listed in ``input_file`` and save the results.

    Donor IDs are read one per line; blank lines are ignored. Profiles are
    scraped sequentially with a random 3-7 second pause between them. Results
    are written next to the input file every 10 donors and once more at the
    end, including when the run is interrupted or fails part-way.

    Args:
        input_file: Path of the text file holding one donor ID per line.
            Output files are written to the same directory.
    """
    output_dir = os.path.dirname(input_file)

    # Read all donor IDs, dropping blank lines so they do not become
    # requests for an empty donor number.
    with open(input_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        donor_ids = [donor_id for donor_id in stripped_lines if donor_id]

    total = len(donor_ids)
    print(f"Starting to scrape {total} donors")

    scraper = DonorProfileScraper(output_dir)
    profiles = []

    try:
        for i, donor_id in enumerate(donor_ids, 1):
            print(f"\nScraping donor {i} of {total} (ID: {donor_id})")
            profiles.append(scraper.scrape_single_donor(donor_id))

            # Random delay between profiles; pointless after the last one.
            if i < total:
                delay = random_delay(3, 7)
                print(f"Waiting {delay:.1f} seconds before next profile...")
                time.sleep(delay)

            # Save progress every 10 donors
            if i % 10 == 0:
                save_results(profiles, output_dir, test_suffix="")
                print(f"Progress saved after {i} donors")
    finally:
        # Final save; also runs on Ctrl-C or an unexpected error so that
        # profiles collected since the last checkpoint are not lost.
        save_results(profiles, output_dir, test_suffix="")

    print(f"\nCompleted scraping {len(profiles)} donor profiles")

# Start the full scrape only when this file is executed directly, not when
# it is imported.
if __name__ == "__main__":
    scrape_all_donors()

Starting to scrape 395 donors

Scraping donor 1 of 395 (ID: 0351)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 0351
Waiting 3.4 seconds before next profile...

Scraping donor 2 of 395 (ID: 0406)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 0406
Waiting 4.9 seconds before next profile...

Scraping donor 3 of 395 (ID: 0834)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
No essay download link found
Successfully scraped data for donor 0834
Waiting 3.1 seconds before next profile...

Scraping donor 4 of 395 (ID: 0846)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped d

Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)



Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 6876
Waiting 6.2 seconds before next profile...

Scraping donor 276 of 395 (ID: 6879)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 6879
Waiting 3.3 seconds before next profile...

Scraping donor 277 of 395 (ID: 6891)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 6891
Waiting 6.6 seconds before next profile...

Scraping donor 278 of 395 (ID: 6897)

Accessing profile
Getting basic info...
Getting donor description...
Getting list info...
Getting essay URL...
Found essay download URL
Successfully scraped data for donor 6897
Waiting 4.7 seconds before next profile...

S