## Extract donor IDs from the web-scraping results

In [2]:
import json
import os
import pandas as pd
from typing import List

class DonorIDExtractor:
    """Extract de-duplicated donor IDs from a scraped profiles JSON file.

    The cleaned IDs are written, one per line, to ``bank2_donor_ids.txt``
    inside ``output_dir``.
    """

    def __init__(self, output_dir: str):
        # Directory that receives the generated bank2_donor_ids.txt file.
        self.output_dir = output_dir

    def extract_all_donor_ids(self, profiles_json_path: str) -> List[str]:
        """Extract all donor IDs from profiles JSON.

        Args:
            profiles_json_path: Path to a UTF-8 JSON file holding a list of
                profile objects, each expected to carry a ``donor_id`` entry
                such as ``"Donor 1821"``.

        Returns:
            The unique donor IDs in first-seen order, with the ``"Donor "``
            label and surrounding whitespace removed. Returns an empty list
            (after printing the error) when the JSON cannot be read or parsed
            or the output file cannot be written; this method never raises
            for those conditions.
        """
        try:
            with open(profiles_json_path, 'r', encoding='utf-8') as f:
                profiles = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error extracting donor IDs: {str(e)}")
            return []

        if not isinstance(profiles, list):
            print(
                "Error extracting donor IDs: expected a list of profiles, "
                f"got {type(profiles).__name__}"
            )
            return []

        donor_ids = []
        skipped = 0
        for profile in profiles:
            raw_id = profile.get('donor_id') if isinstance(profile, dict) else None
            # Tolerate numeric IDs; bool is excluded because it subclasses int.
            if isinstance(raw_id, int) and not isinstance(raw_id, bool):
                raw_id = str(raw_id)
            if not isinstance(raw_id, str):
                # One malformed profile must not discard every other ID.
                skipped += 1
                continue

            donor_id = raw_id.replace('Donor ', '').strip()
            if donor_id:
                donor_ids.append(donor_id)
            else:
                skipped += 1

        # dict preserves insertion order, so this de-duplicates while
        # keeping the first occurrence of each ID in place.
        unique_donor_ids = list(dict.fromkeys(donor_ids))

        if skipped:
            print(f"Skipped {skipped} profiles without a usable donor ID")
        print(f"Found {len(unique_donor_ids)} unique donors")
        print("First few donor IDs:", unique_donor_ids[:5])

        # Save to txt file for easier access
        txt_path = os.path.join(self.output_dir, 'bank2_donor_ids.txt')
        try:
            if self.output_dir:
                os.makedirs(self.output_dir, exist_ok=True)
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.writelines(f"{donor_id}\n" for donor_id in unique_donor_ids)
        except OSError as e:
            print(f"Error extracting donor IDs: {str(e)}")
            return []

        print(f"\nSaved donor IDs to: {os.path.basename(txt_path)}")
        return unique_donor_ids

def extract_ids(output_dir: str = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank2"):
    """Extract all donor IDs from profiles.

    Reads ``profiles_bank2.json`` from ``output_dir`` and lets
    ``DonorIDExtractor`` write the resulting ID list back into the same
    directory.

    Args:
        output_dir: Directory that contains ``profiles_bank2.json`` and
            receives the generated ID file. Defaults to the project's raw
            bank2 data directory, so calling with no arguments behaves as
            before.
    """
    profiles_json = os.path.join(output_dir, "profiles_bank2.json")

    extractor = DonorIDExtractor(output_dir)
    donor_ids = extractor.extract_all_donor_ids(profiles_json)

    # The extractor returns an empty list on failure; only report a total
    # when something was actually extracted.
    if donor_ids:
        print(f"\nTotal unique donors: {len(donor_ids)}")

# Entry point: run the donor ID extraction when this code is executed directly.
if __name__ == "__main__":
    extract_ids()

Found 110 unique donors
First few donor IDs: ['1821', '1888', '1897', '1896', '1617']

Saved donor IDs to: bank2_donor_ids.txt

Total unique donors: 110


## Test scraping for donor essay links

In [2]:
import json
import os
import pandas as pd
from typing import Dict, List
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random

def load_config(config_path: str = 'config.json'):
    """Load configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file. Defaults to
            ``config.json`` resolved against the current working directory.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class DonorEssayScraper:
    """Test scraper that looks for the donor essay PDF link on donor profile pages.

    Donor IDs are read from ``bank2_donor_ids.txt`` in ``output_dir`` and the
    collected links are written back to the same directory as JSON.
    """

    def __init__(self, output_dir: str):
        # Directory holding bank2_donor_ids.txt; result files are written here too.
        self.output_dir = output_dir
        config = load_config()
        # Profile URL template; the '{}' placeholder is filled with a donor ID.
        self.base_url = config['banks']['bank2']['base_url'] + "/catalogue/products/donor-{}"

    def load_donor_ids(self, limit: int = None) -> List[str]:
        """Load donor IDs from txt file.

        Args:
            limit: Maximum number of IDs to return. ``None`` returns every ID;
                ``0`` (or a negative value) returns an empty list.

        Returns:
            The non-empty, whitespace-stripped lines of the ID file, in file order.

        Raises:
            OSError: If the ID file cannot be opened.
        """
        input_file = os.path.join(self.output_dir, "bank2_donor_ids.txt")
        with open(input_file, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines would otherwise become empty IDs and bogus profile URLs.
            donor_ids = [donor_id for donor_id in stripped_lines if donor_id]

        # Compare against None so that limit=0 means "no donors", not "all donors".
        if limit is not None:
            donor_ids = donor_ids[:max(limit, 0)]
        return donor_ids

    def get_donor_essay_url(self, driver, donor_id: str) -> str:
        """Find donor essay URL on donor profile page.

        Args:
            driver: WebDriver whose current page is the donor's profile.
            donor_id: ID used to match the essay link.

        Returns:
            The ``href`` of the first matching link that contains ``.pdf``,
            or ``None`` when no such link exists or the lookup fails.
        """
        try:
            # Essay links carry both the donor ID and 'pen-sketch' in their href.
            links = driver.find_elements(By.CSS_SELECTOR, f"a[href*='{donor_id}'][href*='pen-sketch']")

            for link in links:
                href = link.get_attribute('href')
                if href and '.pdf' in href:
                    print(f"Found donor essay for ID: {donor_id}")
                    return href

            print(f"No donor essay found for ID: {donor_id}")
            return None

        except Exception as e:
            # Deliberate best-effort: a lookup failure on one page is reported
            # and treated as "not found" so the run can continue.
            print(f"Error finding donor essay for ID {donor_id}: {str(e)}")
            return None

    def scrape_donor_essay_urls(self, num_donors: int = 3):
        """Test scrape donor essay URLs.

        Visits the profile page of the first ``num_donors`` donors, records
        whether an essay link was found and saves the results as JSON. The
        output file name carries the number of donor IDs actually loaded.

        Args:
            num_donors: Maximum number of donor profiles to visit.
        """
        driver = setup_driver()
        results = []
        total = 0

        try:
            donor_ids = self.load_donor_ids(limit=num_donors)
            total = len(donor_ids)
            print(f"\nProcessing {total} donor profiles")

            for i, donor_id in enumerate(donor_ids, 1):
                # Report against the number actually loaded, which may be
                # smaller than num_donors.
                print(f"\nProcessing ID {i} of {total}")

                try:
                    print("Accessing donor profile...")
                    driver.get(self.base_url.format(donor_id))
                    # Pause after navigation before querying the page.
                    time.sleep(random_delay(2, 4))

                    essay_url = self.get_donor_essay_url(driver, donor_id)

                    # Store the result but don't print the URL
                    results.append({
                        'donor_id': donor_id,
                        'essay_url': essay_url
                    })

                    # No need to wait after the last profile.
                    if i < total:
                        delay = random_delay(2, 4)
                        print(f"Waiting {delay:.1f} seconds...")
                        time.sleep(delay)

                except Exception as e:
                    # Keep going: one failing profile must not end the run.
                    print(f"Error processing ID {donor_id}: {str(e)}")

        except Exception as e:
            print(f"Error during scraping: {str(e)}")
        finally:
            driver.quit()

        self.save_results(results, test_suffix=f"_test_{total}donors")

    def save_results(self, results: List[Dict], test_suffix: str = ""):
        """Save scraped URLs to JSON.

        Args:
            results: Dicts with ``donor_id`` and ``essay_url`` keys.
            test_suffix: Text appended to the ``donor_essay_urls`` file stem.
        """
        output_file = os.path.join(self.output_dir, f"donor_essay_urls{test_suffix}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        print(f"\nSaved results to: {os.path.basename(output_file)}")
        print("\nResults summary:")
        for result in results:
            status = 'Found' if result['essay_url'] else 'Not found'
            print(f"ID {result['donor_id']}: {status}")

def random_delay(min_seconds=2, max_seconds=4):
    """Return a random duration in seconds drawn between the two bounds."""
    pause = random.uniform(min_seconds, max_seconds)
    return pause

def setup_driver():
    """Create a Chrome WebDriver with GPU, extensions and notifications disabled.

    The returned driver has a 30-second page-load timeout.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-gpu', '--disable-extensions', '--disable-notifications'):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser

# Test the scraper: when executed directly, run a trial scrape limited to 3 donors.
if __name__ == "__main__":
    # Directory that holds bank2_donor_ids.txt and receives the results JSON.
    output_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank2"
    scraper = DonorEssayScraper(output_dir)
    scraper.scrape_donor_essay_urls(3)


Processing 3 donor profiles

Processing ID 1 of 3
Accessing donor profile...
Found donor essay for ID: 1821
Waiting 3.0 seconds...

Processing ID 2 of 3
Accessing donor profile...
No donor essay found for ID: 1888
Waiting 3.8 seconds...

Processing ID 3 of 3
Accessing donor profile...
Found donor essay for ID: 1897

Saved results to: donor_essay_urls_test_3donors.json

Results summary:
ID 1821: Found
ID 1888: Not found
ID 1897: Found


## Full scrape for the donor essay URLs

In [4]:
import json
import os
import pandas as pd
from typing import Dict, List
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests
from urllib.parse import urlparse
from pathlib import Path

def load_config(config_path: str = 'config.json'):
    """Load configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file. Defaults to
            ``config.json`` resolved against the current working directory.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class DonorEssayProcessor:
    """Find the donor essay PDF link for every donor and download the PDFs.

    Donor IDs come from ``bank2_donor_ids.txt`` in ``base_dir``; PDFs and the
    JSON result files are written to ``pdf_dir``.
    """

    # Seconds to wait for the PDF server before abandoning a download; without
    # a timeout a stalled connection would block the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, base_dir: str, pdf_dir: str):
        self.base_dir = base_dir  # Directory containing donor IDs file
        self.pdf_dir = pdf_dir    # Directory for PDF storage
        self.ensure_pdf_directory()
        config = load_config()
        # Profile URL template; the '{}' placeholder is filled with a donor ID.
        self.base_url = config['banks']['bank2']['base_url'] + "/catalogue/products/donor-{}"

    def ensure_pdf_directory(self):
        """Create PDF directory if it doesn't exist"""
        os.makedirs(self.pdf_dir, exist_ok=True)
        print(f"PDF directory ready at: {os.path.basename(self.pdf_dir)}/")

    def load_donor_ids(self) -> List[str]:
        """Load all donor IDs from txt file.

        Returns:
            The non-empty, whitespace-stripped lines of the ID file, in file order.

        Raises:
            OSError: If the ID file cannot be opened.
        """
        input_file = os.path.join(self.base_dir, "bank2_donor_ids.txt")
        with open(input_file, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines would otherwise become empty IDs and bogus profile URLs.
            return [donor_id for donor_id in stripped_lines if donor_id]

    def get_donor_essay_url(self, driver, donor_id: str) -> str:
        """Find donor essay URL on donor profile page.

        Args:
            driver: WebDriver whose current page is the donor's profile.
            donor_id: ID used to match the essay link.

        Returns:
            The ``href`` of the first matching link that contains ``.pdf``,
            or ``None`` when no such link exists or the lookup fails.
        """
        try:
            # Essay links carry both the donor ID and 'pen-sketch' in their href.
            links = driver.find_elements(By.CSS_SELECTOR, f"a[href*='{donor_id}'][href*='pen-sketch']")

            for link in links:
                href = link.get_attribute('href')
                if href and '.pdf' in href:
                    print(f"Found donor essay for ID: {donor_id}")
                    return href

            print(f"No donor essay found for ID: {donor_id}")
            return None

        except Exception as e:
            # Deliberate best-effort: a lookup failure on one page is reported
            # and treated as "not found" so the run can continue.
            print(f"Error finding donor essay for ID {donor_id}: {str(e)}")
            return None

    @staticmethod
    def _is_pdf_content_type(content_type) -> bool:
        """Return True when a Content-Type header value denotes a PDF.

        Media types are case-insensitive and may carry parameters
        (``application/pdf; charset=binary``), so only the part before the
        first ``;`` is compared. A missing header (``None`` or empty) is not a PDF.
        """
        if not content_type:
            return False
        media_type = content_type.split(';', 1)[0].strip().lower()
        return media_type == 'application/pdf'

    def download_pdf(self, url: str, donor_id: str) -> bool:
        """Download PDF file and save it locally.

        Args:
            url: Address of the essay PDF.
            donor_id: ID used to build the local file name
                ``donor_<id>_essay.pdf`` inside ``pdf_dir``.

        Returns:
            ``True`` when the server answered 200 with a PDF content type and
            the file was written; ``False`` otherwise. Failures are printed,
            not raised.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        try:
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Error downloading essay for ID {donor_id}: {str(e)}")
            return False

        is_pdf = self._is_pdf_content_type(response.headers.get('content-type'))
        if response.status_code != 200 or not is_pdf:
            print(f"Failed to download essay for ID: {donor_id} (Status: {response.status_code})")
            return False

        pdf_path = os.path.join(self.pdf_dir, f"donor_{donor_id}_essay.pdf")
        try:
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
        except OSError as e:
            print(f"Error downloading essay for ID {donor_id}: {str(e)}")
            return False

        print(f"Successfully downloaded essay for ID: {donor_id}")
        return True

    def process_all_donors(self):
        """Scrape URLs and download PDFs for all donors.

        Results are saved to the intermediate file after every 10th donor and
        to the final file once the run ends, even if it ended early.
        """
        driver = setup_driver()
        results = []

        try:
            donor_ids = self.load_donor_ids()
            total_donors = len(donor_ids)
            print(f"\nStarting full process for {total_donors} donor profiles")

            for i, donor_id in enumerate(donor_ids, 1):
                print(f"\nProcessing ID {i} of {total_donors} ({(i/total_donors)*100:.1f}% complete)")

                try:
                    print("Accessing donor profile...")
                    driver.get(self.base_url.format(donor_id))
                    # Pause after navigation before querying the page.
                    time.sleep(random_delay(2, 4))

                    essay_url = self.get_donor_essay_url(driver, donor_id)
                    download_success = False

                    if essay_url:
                        download_success = self.download_pdf(essay_url, donor_id)

                    results.append({
                        'donor_id': donor_id,
                        'essay_url': essay_url,
                        'download_success': download_success
                    })

                    # Save intermediate results every 10 donors
                    if i % 10 == 0:
                        self.save_results(results, intermediate=True)

                    # No need to wait after the last profile.
                    if i < total_donors:
                        delay = random_delay(2, 4)
                        print(f"Waiting {delay:.1f} seconds...")
                        time.sleep(delay)

                except Exception as e:
                    # Keep going: one failing profile must not end the run.
                    print(f"Error processing ID {donor_id}: {str(e)}")

        except Exception as e:
            print(f"Error during processing: {str(e)}")
        finally:
            driver.quit()

        # Save final results
        self.save_results(results)

    def save_results(self, results: List[Dict], intermediate: bool = False):
        """Save scraped URLs and download status to JSON.

        Args:
            results: Dicts with ``donor_id``, ``essay_url`` and optionally
                ``download_success`` keys. May be empty.
            intermediate: Write ``donor_essay_results_intermediate.json``
                instead of ``donor_essay_results_final.json``.
        """
        suffix = "_intermediate" if intermediate else "_final"
        output_file = os.path.join(self.pdf_dir, f"donor_essay_results{suffix}.json")

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        total = len(results)
        print(f"\nSaved {total} results to: {os.path.basename(output_file)}")

        # Print summary statistics
        found_count = sum(1 for r in results if r['essay_url'])
        download_count = sum(1 for r in results if r.get('download_success', False))
        # An empty run (e.g. the ID file could not be loaded) must not divide by zero.
        success_rate = (download_count / total) * 100 if total else 0.0
        print("\nProgress Summary:")
        print(f"Total profiles processed: {total}")
        print(f"Essays found: {found_count}")
        print(f"Essays downloaded: {download_count}")
        print(f"Success rate: {success_rate:.1f}%")

def random_delay(min_seconds=2, max_seconds=4):
    """Pick a random pause length, in seconds, between the two bounds."""
    pause = random.uniform(min_seconds, max_seconds)
    return pause

def setup_driver():
    """Create a Chrome WebDriver with GPU, extensions and notifications disabled.

    The returned driver has a 30-second page-load timeout.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-gpu', '--disable-extensions', '--disable-notifications'):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser

# Run the full processor: when executed directly, scrape every donor profile
# and download the essay PDFs that are found.
if __name__ == "__main__":
    base_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank2"  # Directory with donor IDs
    pdf_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank2/donor_essay_pdfs"  # Directory for PDFs
    
    # Constructing the processor creates pdf_dir if needed and reads config.json.
    processor = DonorEssayProcessor(base_dir, pdf_dir)
    processor.process_all_donors()

PDF directory ready at: donor_essay_pdfs/

Starting full process for 110 donor profiles

Processing ID 1 of 110 (0.9% complete)
Accessing donor profile...
Found donor essay for ID: 1821
Successfully downloaded essay for ID: 1821
Waiting 2.1 seconds...

Processing ID 2 of 110 (1.8% complete)
Accessing donor profile...
No donor essay found for ID: 1888
Waiting 2.8 seconds...

Processing ID 3 of 110 (2.7% complete)
Accessing donor profile...
Found donor essay for ID: 1897
Successfully downloaded essay for ID: 1897
Waiting 3.6 seconds...

Processing ID 4 of 110 (3.6% complete)
Accessing donor profile...
Found donor essay for ID: 1896
Successfully downloaded essay for ID: 1896
Waiting 2.1 seconds...

Processing ID 5 of 110 (4.5% complete)
Accessing donor profile...
No donor essay found for ID: 1617
Waiting 2.9 seconds...

Processing ID 6 of 110 (5.5% complete)
Accessing donor profile...
Found donor essay for ID: 1873
Successfully downloaded essay for ID: 1873
Waiting 3.6 seconds...

Process