In [3]:
# Import cell
from login_bank3 import setup_driver, login, cleanup_driver
from config import load_config
import logging
import json
import os
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
import time


In [4]:

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


In [5]:
def scrape_profile(driver, relative_url):
    """
    Scrape a single profile using Selenium with more precise selectors
    """
    try:
        bank_info = config['banks']['bank3']
        full_url = f"{bank_info['base_url']}/{relative_url}"
        print(f"Scraping profile: {relative_url}")
        
        driver.get(full_url)
        time.sleep(6)  # Let the page load
        
        def get_field_value(field_name):
            """Helper function to extract field values using precise selectors"""
            try:
                print(f"\nLooking for field: {field_name}")
                
                # First attempt - look for the specific ct-text-block body-text structure
                css_selector = "div.ct-text-block.body-text span.ct-span"
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, css_selector)
                    # Look for element that has a preceding sibling containing our field name
                    for element in elements:
                        previous = element.find_element(By.XPATH, "./parent::div/preceding-sibling::div[contains(@class, 'bold-text')]")
                        if field_name in previous.text:
                            value = element.text.strip()
                            print(f"Found value with CSS: {value}")
                            return value
                except Exception as e:
                    print(f"Error with CSS attempt: {str(e)}")
                
                # Second attempt - direct xpath to span
                xpath = (
                    f"//div[contains(@class, 'ct-text-block') and "
                    f"contains(@class, 'bold-text') and "
                    f"contains(text(), '{field_name}')]/"
                    f"following-sibling::div[contains(@class, 'body-text')]//span[contains(@class, 'ct-span')]"
                )
                print(f"Trying xpath: {xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with XPath attempt: {str(e)}")
                
                # Third attempt - look for any span within new_columns structure
                columns_xpath = (
                    f"//div[contains(@id, 'new_columns')]"
                    f"//div[contains(text(), '{field_name}')]/"
                    f"following-sibling::div//span[contains(@class, 'ct-span')]"
                )
                print(f"Trying columns xpath: {columns_xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, columns_xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with columns XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with columns XPath attempt: {str(e)}")
                
                # Fourth attempt - try finding text directly following the field name
                text_xpath = (
                    f"//div[contains(text(), '{field_name}:')]/"
                    f"following-sibling::div[contains(@class, 'ct-text-block')]"
                )
                print(f"Trying text xpath: {text_xpath}")
                try:
                    elements = driver.find_elements(By.XPATH, text_xpath)
                    for elem in elements:
                        value = elem.text.strip()
                        if value:
                            print(f"Found value with text XPath: {value}")
                            return value
                except Exception as e:
                    print(f"Error with text XPath attempt: {str(e)}")
                
                # Special cases
                if field_name == 'Donor ID':
                    try:
                        donor_id_elem = driver.find_element(By.XPATH, "//span[@id='span-6-12730']")
                        return donor_id_elem.text.strip()
                    except:
                        pass
                
                print(f"No value found for {field_name}")
                return "Not found"
                
            except Exception as e:
                print(f"Error getting {field_name}: {str(e)}")
                return "Not found"
        
        # Dictionary of fields to extract with exact labels
        fields = {
            'donor_alias': 'Donor Alias',
            'donor_id': 'Donor ID',
            'height': 'Height',
            'weight': 'Weight',
            'race': 'Race',
        }
        
        # Extract all fields
        profile_data = {}
        
        for field_key, field_label in fields.items():
            value = get_field_value(field_label)
            if field_key == 'weight' and value != "Not found":
                value = value.replace(' lbs.', '').strip()
            profile_data[field_key] = value
            print(f"Extracted {field_key}: {value}")
        
        # Get staff impression
        try:
            staff_impression = driver.find_element(
                By.XPATH,
                "//div[contains(@class, 'ct-text-block') and contains(@class, 'body-text')]//span//p"
            ).text.strip()
        except:
            staff_impression = "Not found"
                
        profile_data['staff_impression'] = staff_impression
        
        return profile_data
        
    except Exception as e:
        print(f"Error scraping profile: {str(e)}")
        driver.save_screenshot(f'error_profile_{random.randint(1000,9999)}.png')
        return None

In [6]:
# Test cell
print("Starting test execution...")

driver = None
try:
    # Load configuration
    config = load_config()
    
    # Update output directory
    config['output_directory'] = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank3'
    
    # Create output directory if it doesn't exist
    os.makedirs(config['output_directory'], exist_ok=True)
    
    print("Config loaded, setting up WebDriver...")
    
    # Setup WebDriver
    driver = setup_driver()
    
    # Login first
    print("Attempting login...")
    if not login(driver, config):
        print("Login failed!")
    else:
        print("Login successful, starting profile scrape...")
        
        # Test with a single profile
        profile_url = "donor/10128"
        print(f"Scraping profile: {profile_url}")
        
        profile_data = scrape_profile(driver, profile_url)
        
        if profile_data:
            print("\nSuccessfully scraped profile. Data:")
            print(json.dumps(profile_data, indent=2))
            
            # Save to correct directory
            output_file = os.path.join(config['output_directory'], 'test_single_profile.json')
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(profile_data, f, indent=2)
            print(f"\nData saved to: {output_file}")
        else:
            print("Failed to scrape profile")

except Exception as e:
    print(f"Test failed with error: {str(e)}")
    
finally:
    if driver:
        print("Cleaning up WebDriver...")
        cleanup_driver(driver)
    print("Test completed.")

Starting test execution...
Config loaded, setting up WebDriver...


2024-11-01 22:41:22 - INFO - Navigating to main site...


Attempting login...


2024-11-01 22:41:27 - INFO - Saved screenshot: debug_main_page.png
2024-11-01 22:41:27 - INFO - Looking for login button...
2024-11-01 22:41:27 - INFO - Found login button, clicking...
2024-11-01 22:41:32 - INFO - Saved screenshot: debug_login_page.png
2024-11-01 22:41:32 - INFO - Looking for login form fields...
2024-11-01 22:41:32 - INFO - Entering credentials...
2024-11-01 22:41:53 - INFO - Clicking submit button...
2024-11-01 22:41:59 - INFO - Saved screenshot: debug_after_submit.png
2024-11-01 22:41:59 - INFO - Login successful!


Login successful, starting profile scrape...
Scraping profile: donor/10128
Scraping profile: donor/10128

Looking for field: Donor Alias
Error with CSS attempt: Message: no such element: Unable to locate element: {"method":"xpath","selector":"./parent::div/preceding-sibling::div[contains(@class, 'bold-text')]"}
  (Session info: chrome=130.0.6723.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100e1f648 cxxbridge1$str$ptr + 3645404
1   chromedriver                        0x0000000100e17ea8 cxxbridge1$str$ptr + 3614780
2   chromedriver                        0x0000000100884104 cxxbridge1$string$len + 88416
3   chromedriver                        0x00000001008c6364 cxxbridge1$string$len + 359360
4   chromedriver                        0x00000001008bc8b0 cxxbridge1$string$len + 319756
5   chromedriver                        0x000