# Scraping Bank1

## 1. Module Imports

In [None]:
!pip install selenium

In [68]:
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import random
import logging
import csv  
from config import load_config
from log_utils import setup_anonymized_logging
from scraper_utils import add_random_delay
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta

In [69]:
class HumanlikeBehavior:
    """Helper class for human-like browsing behavior"""
    @staticmethod
    def add_human_delay(action_type='page_view'):
        """Simulate human-like delays based on action type"""
        delays = {
            'page_view': (20, 45),  # Time to read a profile
            'scroll': (2, 5),       # Scrolling delay
            'click': (1, 3),        # Clicking delay
            'session_break': (1800, 3600)  # 30-60 minute break between sessions
        }
        delay = random.uniform(*delays[action_type])
        time.sleep(delay)
    
    @staticmethod
    def simulate_scrolling(driver):
        """Simulate human-like scrolling behavior"""
        total_height = driver.execute_script("return document.body.scrollHeight")
        viewport_height = driver.execute_script("return window.innerHeight")
        current_position = 0
        
        while current_position < total_height:
            scroll_amount = random.randint(100, 300)
            current_position = min(current_position + scroll_amount, total_height)
            
            driver.execute_script(f"window.scrollTo({{top: {current_position}, behavior: 'smooth'}})")
            HumanlikeBehavior.add_human_delay('scroll')
            
            if random.random() < 0.1:
                scroll_up = random.randint(50, 150)
                current_position = max(0, current_position - scroll_up)
                driver.execute_script(f"window.scrollTo({{top: {current_position}, behavior: 'smooth'}})")
                HumanlikeBehavior.add_human_delay('scroll')



## 2. Configure logging

In [70]:
# Configure logging
logger = setup_anonymized_logging()

## 3. Loading Configurations and Initialize Session

In [71]:
# Load configuration 
config = load_config()
bank_id = 'bank1'
bank_config = config['banks'][bank_id]

# Initialize session
session = requests.Session()
session.headers.update({
    'User-Agent': random.choice(config['user_agents'])
})

## 4. Utility functions

In [72]:
# Utility functions
def get_total_pages(known_total_donors=478, donors_per_page=15):
    """Calculate total pages needed"""
    return (known_total_donors + donors_per_page - 1) // donors_per_page

## 5. Donor ID Collection

In [74]:
## 5. Donor ID Collection

def get_all_donor_ids(session, driver):
    """Get all unique donor IDs from search pages"""
    donor_ids = set()  # Using set to avoid duplicates
    total_pages = 32
    
    try:
        print(f"Starting to collect donor IDs...")
        
        for page in range(1, total_pages + 1):
            url = f"{bank_config['base_url']}/search/?donor_sort=default_Sort&page={page}"
            print(f"Scanning page {page} of {total_pages}")
            
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "donor-id"))
            )
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            donor_id_spans = soup.find_all('span', class_='donor-id')
            
            for span in donor_id_spans:
                donor_id = span.text.strip()
                donor_ids.add(donor_id)
            
            print(f"Found {len(donor_id_spans)} donors on page {page}. Total unique IDs so far: {len(donor_ids)}")
            add_random_delay(5, 10)  # Delay between pages
            
    except Exception as e:
        print(f"Error collecting donor IDs: {str(e)}")
        
    finally:
        print(f"Finished collecting donor IDs. Total unique donors found: {len(donor_ids)}")
        # Print first few IDs as a sample
        sample_ids = list(donor_ids)[:5]
        print(f"Sample of first 5 donor IDs: {sample_ids}")
        return list(donor_ids)

# Initialize Selenium and run collection
print("Initializing Selenium driver...")
options = webdriver.ChromeOptions()
options.add_argument(f'user-agent={random.choice(config["user_agents"])}')
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

try:
    collected_donor_ids = get_all_donor_ids(session, driver)
finally:
    driver.quit()

# Print final results
if collected_donor_ids:
    print(f"\nSuccessfully collected {len(collected_donor_ids)} unique donor IDs")
    # Save IDs to file in bank1 directory
    output_directory = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1'
    os.makedirs(output_directory, exist_ok=True)
    id_file_path = os.path.join(output_directory, 'donor_ids.txt')
    with open(id_file_path, 'w') as f:
        for donor_id in collected_donor_ids:
            f.write(f"{donor_id}\n")
    print(f"Saved donor IDs to {id_file_path}")

Initializing Selenium driver...
Starting to collect donor IDs...
Scanning page 1 of 32
Found 16 donors on page 1. Total unique IDs so far: 16
Scanning page 2 of 32
Found 30 donors on page 2. Total unique IDs so far: 31
Scanning page 3 of 32
Found 46 donors on page 3. Total unique IDs so far: 46
Scanning page 4 of 32
Found 60 donors on page 4. Total unique IDs so far: 61
Scanning page 5 of 32
Found 75 donors on page 5. Total unique IDs so far: 76
Scanning page 6 of 32
Found 90 donors on page 6. Total unique IDs so far: 91
Scanning page 7 of 32
Found 106 donors on page 7. Total unique IDs so far: 106
Scanning page 8 of 32
Found 121 donors on page 8. Total unique IDs so far: 121
Scanning page 9 of 32
Found 135 donors on page 9. Total unique IDs so far: 136
Scanning page 10 of 32
Found 150 donors on page 10. Total unique IDs so far: 151
Scanning page 11 of 32
Found 165 donors on page 11. Total unique IDs so far: 166
Scanning page 12 of 32
Found 180 donors on page 12. Total unique IDs so fa

## 6. Profile Scraping Function

In [79]:

def scrape_profile_bank1(relative_url, driver=None):
    """
    Scrape a single profile page for Bank1 using either Selenium or requests.
    """
    try:
        full_url = f"{bank_config['base_url']}{relative_url}"
        
        if driver:
            # Selenium approach
            driver.get(full_url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "profile-details"))
            )
            
            # Simulate human reading behavior
            HumanlikeBehavior.simulate_scrolling(driver)
            HumanlikeBehavior.add_human_delay('page_view')
            
            response_text = driver.page_source
            soup = BeautifulSoup(response_text, 'html.parser')
        else:
            # Fallback to requests approach
            response = session.get(full_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
        
        # Initialize profile data dictionary
        profile_data = {
            'url': full_url,
            'donor_id': None,
            'headline': None,
            'donor_description': None,
            'donor_lookalikes': None,
            'height': None,
            'weight': None,
            'eye_color': None,
            'hair_color': None,
            'hair_texture': None,
            'complexion': None,
            'ethnic_origin': None,
            'ancestry': None,
            'religion': None,
            'jewish_ancestry': None,
            'education_level': None,
            'areas_of_study': None
        }
            
        # Extract donor ID
        donor_id_elem = soup.find('span', class_='donor-id')
        if donor_id_elem:
            id_span = donor_id_elem.find('span', class_='id')
        if id_span:
            profile_data['donor_id'] = id_span.text.strip()
        
        # Extract headline
        description_div = soup.find('div', id='description')
        if description_div:
            headline_elem = description_div.find('h2')
        if headline_elem:
            profile_data['headline'] = headline_elem.text.strip()
        
        # Extract donor description
        description_elem = soup.find('div', id='description')
        if description_elem and description_elem.find('p'):
            profile_data['donor_description'] = description_elem.find('p').text.strip()
        
        # Extract look-alikes
        lookalikes_div = soup.find('div', class_='look-a-likes')
        if lookalikes_div:
            profile_data['donor_lookalikes'] = lookalikes_div.text.strip()
            # If the text is empty, check for the "Not available" message
            if not profile_data['donor_lookalikes']:
                not_available = lookalikes_div.find('div', class_='not-available')
                if not_available:
                    profile_data['donor_lookalikes'] = not_available.text.strip()
        
        # Physical characteristics 
        physical_chars = soup.find('section', id='appearance')
        if physical_chars:
            characteristics_div = physical_chars.find('div', id='collapse-panel-1')
            if characteristics_div:
                label_to_key = {
                    'Height:': 'height',
                    'Weight:': 'weight',
                    'Eye Color:': 'eye_color',
                    'Hair Color:': 'hair_color',
                    'Hair Texture:': 'hair_texture',
                    'Complexion:': 'complexion'
                }
                
                for li in characteristics_div.find_all('li'):
                    label_span = li.find('span', class_='tab')
                    if label_span:
                        label = label_span.text.strip()
                        if label in label_to_key:
                            value = li.get_text(strip=True)
                            value = value.replace(label, '').strip()
                            profile_data[label_to_key[label]] = value

        # Heritage information
        heritage_section = soup.find('section', id='heritage')
        if heritage_section:
            heritage_div = heritage_section.find('div', id='collapse-panel-2')
            if heritage_div:
                label_to_key = {
                    'Ethnic Origin:': 'ethnic_origin',
                    'Ancestry (Self-Reported):': 'ancestry',
                    'Religion:': 'religion',
                    'Jewish Ancestry (Self-Reported):': 'jewish_ancestry'
                }
                
                for li in heritage_div.find_all('li'):
                    label_span = li.find('span', class_='tab')
                    if label_span:
                        label = label_span.text.strip()
                        print(f"Found label: '{label}'")  # Debug print
                        if label in label_to_key:
                            # Special IDs for some fields
                            if 'Ethnic Origin:' in label:
                                value_span = li.find('span', id='donor-profile-ethnicity')
                            elif 'Religion:' in label:
                                value_span = li.find('span', id='donor-profile-religiion')
                            elif 'Jewish Ancestry' in label:
                                value_span = li.find('span', id='donor-profile-jewish')
                            else:
                                value_span = li.find('span')
                            
                            if value_span:
                                profile_data[label_to_key[label]] = value_span.text.strip()

        # Education information
        education_section = soup.find('section', id='education')
        if education_section:
            education_div = education_section.find('div', id='collapse-panel-3')
            if education_div:
                label_to_key = {
                    'Education Level:': 'education_level',
                    'Areas of Study:': 'areas_of_study'
                }
                
                for li in education_div.find_all('li'):
                    label_span = li.find('span', class_='tab')
                    if label_span:
                        label = label_span.text.strip()
                        if label in label_to_key:
                            # Get the value directly from the <span> next to the label
                            value_span = li.find_all('span')[1]  # This gets the second <span>, which is the value
                            
                            if value_span:
                                profile_data[label_to_key[label]] = value_span.text.strip()

                    
        return profile_data
            
    except Exception as e:
        logger.error(f"Error scraping profile {full_url}: {str(e)}")
        return None
            

In [80]:
def scrape_donor_profiles(session, page_number, seen_donors, driver=None):
    """Scrape donors from a single page with human-like behavior"""
    should_quit_driver = False
    try:
        # Create driver only if not provided
        if not driver:
            options = webdriver.ChromeOptions()
            options.add_argument(f'user-agent={random.choice(config["user_agents"])}')
            options.add_argument('--headless')  # Optional: run in headless mode
            driver = webdriver.Chrome(options=options)
            should_quit_driver = True
        
        url = f"{bank_config['base_url']}/search/?donor_sort=default_Sort&page={page_number}"
        driver.get(url)
        
        # Wait for content to load and simulate human behavior
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "profile-details"))
        )
        HumanlikeBehavior.simulate_scrolling(driver)
        HumanlikeBehavior.add_human_delay('page_view')
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        donors_on_page = []
        
        profile_sections = soup.find_all('div', class_='profile-details')
        logger.info(f"Found {len(profile_sections)} profiles on page {page_number}")
        
        for profile in profile_sections:
            try:
                donor_id_elem = profile.find('span', class_='donor-id')
                if donor_id_elem:
                    donor_id = donor_id_elem.text.strip()
                    relative_url = f"/donor/{donor_id}"
                    
                    if donor_id not in seen_donors:
                        profile_data = scrape_profile_bank1(relative_url, driver)
                        if profile_data:
                            seen_donors.add(donor_id)
                            donors_on_page.append(profile_data)
                    else:
                        logger.info(f"Found duplicate donor {donor_id} on page {page_number}")
            
            except Exception as e:
                logger.error(f"Error processing profile with ID {donor_id if donor_id else 'unknown'}: {str(e)}")
                continue
        
        if should_quit_driver:
            driver.quit()
        return donors_on_page
        
    except Exception as e:
        logger.error(f"Error scraping page {page_number}: {str(e)}")
        if should_quit_driver and driver:
            driver.quit()
        return []

## 7. Session Management

In [81]:
class ScrapingSession:
    def __init__(self):
        self.start_time = datetime.now()
        self.profiles_scraped = 0
        
    def should_take_break(self):
        """Check if we should take a break based on time and profiles scraped"""
        session_duration = datetime.now() - self.start_time
        
        if (session_duration > timedelta(hours=2) or 
            self.profiles_scraped >= 30):
            return True
        return False
    
    def take_break(self):
        """Take a break and reset session counters"""
        logger.info("Taking a session break...")
        HumanlikeBehavior.add_human_delay('session_break')
        self.start_time = datetime.now()
        self.profiles_scraped = 0

## 8. Test Scraping 

In [82]:
def test_specific_profiles():
    try:
        # List of known donor IDs to test
        test_donor_ids = ['20035', '19807', '20400']
        profiles = []
        
        # Initialize Selenium for testing
        options = webdriver.ChromeOptions()
        options.add_argument(f'user-agent={random.choice(config["user_agents"])}')
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        
        print("Starting test scrape of specific profiles...")
        
        for donor_id in test_donor_ids:
            relative_url = f"/donor/{donor_id}"
            logger.info(f"Scraping profile for donor {donor_id}")
            profile_data = scrape_profile_bank1(relative_url, driver)
            if profile_data:
                profiles.append(profile_data)
                print(f"Successfully scraped donor {donor_id}")
            else:
                print(f"Failed to scrape donor {donor_id}")
                
        print(f"\nProfiles scraped: {len(profiles)}")
        
        if profiles:
            # Save test results
            test_output_dir = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw'
            os.makedirs(test_output_dir, exist_ok=True)
            
            with open(os.path.join(test_output_dir, 'test_specific_profiles.json'), 'w', encoding='utf-8') as f:
                json.dump(profiles, f, indent=2, ensure_ascii=False)
            print("Saved test results to raw data folder")
            
        return profiles
            
    except Exception as e:
        logger.error(f"Error in test scraping: {str(e)}")
        print(f"\nError details: {str(e)}")
        return None
    finally:
        if driver:
            driver.quit()

# Run test
test_profiles = test_specific_profiles()

2024-10-26 22:57:40 - INFO - Scraping profile for donor 20035


Starting test scrape of specific profiles...


2024-10-26 23:00:03 - INFO - Scraping profile for donor 19807


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'
Successfully scraped donor 20035


2024-10-26 23:02:21 - INFO - Scraping profile for donor 20400


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'
Successfully scraped donor 19807
Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'
Successfully scraped donor 20400

Profiles scraped: 3
Saved test results to raw data folder


## 9. Scrape and Save

In [85]:
def main():
    # Load configuration
    global config, bank_config
    config = load_config()
    bank_id = 'bank1'
    bank_config = config['banks'][bank_id]

    # Initialize session
    all_profiles = []
    session_manager = ScrapingSession()
    
    try:
        # Load collected donor IDs
        donor_ids_path = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1/donor_ids.txt'
        with open(donor_ids_path, 'r') as f:
            donor_ids = [line.strip() for line in f]
        
        print(f"Loaded {len(donor_ids)} donor IDs to scrape")
        
        # Initialize driver
        options = webdriver.ChromeOptions()
        options.add_argument(f'user-agent={random.choice(config["user_agents"])}')
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        
        for i, donor_id in enumerate(donor_ids, 1):
            try:
                logger.info(f"Scraping profile {i} of {len(donor_ids)}")
                
                if session_manager.should_take_break():
                    session_manager.take_break()
                
                relative_url = f"/donor/{donor_id}"
                profile_data = scrape_profile_bank1(relative_url, driver)
                
                if profile_data:
                    all_profiles.append(profile_data)
                    session_manager.profiles_scraped += 1
                    
                    # Save progress every 10 profiles
                    if len(all_profiles) % 10 == 0:
                        save_progress(all_profiles, bank_id)
                
                logger.info(f"Completed {i} of {len(donor_ids)} profiles")
                
            except Exception as e:
                logger.error(f"Error scraping profile {donor_id}: {str(e)}")
                # Reinitialize driver on error
                try:
                    driver.quit()
                except:
                    pass
                driver = webdriver.Chrome(options=options)
                continue
            
    except Exception as e:
        logger.error(f"Error during scraping: {str(e)}")
        
    finally:
        try:
            driver.quit()
        except:
            pass
        
        # Save final results
        save_final_results(all_profiles, bank_id)

def save_progress(profiles, bank_id):
    """Save intermediate results"""
    output_directory = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1'
    temp_file = os.path.join(output_directory, f'profiles_{bank_id}_in_progress.json')
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(profiles, f, indent=2, ensure_ascii=False)
    print(f"Saved progress: {len(profiles)} profiles")

def save_final_results(profiles, bank_id):
    """Save final results in both JSON and CSV formats"""
    output_directory = '/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1'
    os.makedirs(output_directory, exist_ok=True)
    
    if profiles:
        try:
            # Save as JSON
            json_filename = os.path.join(output_directory, 'profiles.json')
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(profiles, f, indent=2, ensure_ascii=False)
            print(f"Data saved to {json_filename}")
                
            # Save as CSV
            csv_filename = os.path.join(output_directory, 'profiles.csv')
            with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=profiles[0].keys())
                writer.writeheader()
                writer.writerows(profiles)
            print(f"Data saved to {csv_filename}")
        except Exception as e:
            logger.error(f"Error saving final results: {str(e)}")
    else:
        logger.warning("No profiles were collected to save")

if __name__ == "__main__":
    main()

Loaded 436 donor IDs to scrape


2024-10-26 23:12:22 - INFO - Scraping profile 1 of 436
2024-10-26 23:14:20 - INFO - Completed 1 of 436 profiles
2024-10-26 23:14:20 - INFO - Scraping profile 2 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:16:20 - INFO - Completed 2 of 436 profiles
2024-10-26 23:16:20 - INFO - Scraping profile 3 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:18:57 - INFO - Completed 3 of 436 profiles
2024-10-26 23:18:57 - INFO - Scraping profile 4 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:21:24 - INFO - Completed 4 of 436 profiles
2024-10-26 23:21:24 - INFO - Scraping profile 5 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:23:15 - INFO - Completed 5 of 436 profiles
2024-10-26 23:23:15 - INFO - Scraping profile 6 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:25:10 - INFO - Completed 6 of 436 profiles
2024-10-26 23:25:10 - INFO - Scraping profile 7 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:27:19 - INFO - Completed 7 of 436 profiles
2024-10-26 23:27:19 - INFO - Scraping profile 8 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:29:56 - INFO - Completed 8 of 436 profiles
2024-10-26 23:29:56 - INFO - Scraping profile 9 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:31:50 - INFO - Completed 9 of 436 profiles
2024-10-26 23:31:50 - INFO - Scraping profile 10 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:34:05 - INFO - Completed 10 of 436 profiles
2024-10-26 23:34:05 - INFO - Scraping profile 11 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'
Saved progress: 10 profiles


2024-10-26 23:36:31 - INFO - Completed 11 of 436 profiles
2024-10-26 23:36:31 - INFO - Scraping profile 12 of 436


Found label: 'Ethnic Origin:'
Found label: 'Ancestry (Self-Reported):'
Found label: 'Religion:'
Found label: 'Jewish Ancestry (Self-Reported):'


2024-10-26 23:37:05 - ERROR - Error scraping profile [ANONYMIZED_URL] Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=130.0.6723.70)
Stacktrace:
0   chromedriver                        0x000000010117b634 cxxbridge1$str$ptr + 3645404
1   chromedriver                        0x0000000101173e94 cxxbridge1$str$ptr + 3614780
2   chromedriver                        0x0000000100be0104 cxxbridge1$string$len + 88416
3   chromedriver                        0x0000000100bca3d0 core::str::slice_error_fail::h1cab30ac4b13c655 + 63280
4   chromedriver                        0x0000000100bca310 core::str::slice_error_fail::h1cab30ac4b13c655 + 63088
5   chromedriver                        0x0000000100c5b654 cxxbridge1$string$len + 593584
6   chromedriver                        0x0000000100c16f54 cxxbridge1$string$len + 313264
7   chromedriver                        0x0000000100c17ba4 cxxbridge1$strin

Data saved to /Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1/profiles.json
Data saved to /Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/raw/bank1/profiles.csv


## 10. Progress Tracking Visualization

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_scraping_progress(all_profiles):
    # Create a DataFrame from profiles
    df = pd.DataFrame(all_profiles)
    
    # Basic statistics
    print("Scraping Statistics:")
    print(f"Total profiles collected: {len(df)}")
    print(f"Number of unique donors: {df['donor_id'].nunique()}")
    
    # Completeness check
    completeness = df.notna().mean() * 100
    
    # Plot completeness
    plt.figure(figsize=(12, 6))
    completeness.plot(kind='bar')
    plt.title('Data Completeness by Field')
    plt.ylabel('Percentage Complete')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    return df

## 11. Data Validation

In [None]:
def validate_data(profiles):
    """Validate scraped data for completeness and consistency"""
    validation_results = {
        'total_profiles': len(profiles),
        'issues_found': [],
        'field_stats': {},
        'validation_passed': True
    }
    
    # Check each profile
    for i, profile in enumerate(profiles):
        # Required fields check
        required_fields = ['donor_id', 'url']
        for field in required_fields:
            if not profile.get(field):
                validation_results['issues_found'].append(
                    f"Profile {i}: Missing required field '{field}'"
                )
                validation_results['validation_passed'] = False
        
        # Format checks
        if profile.get('height'):
            if not any(unit in profile['height'].lower() for unit in ['cm', 'feet', "'"]):
                validation_results['issues_found'].append(
                    f"Profile {i}: Invalid height format: {profile['height']}"
                )
        
        # Consistency checks
        if profile.get('weight'):
            if not any(unit in profile['weight'].lower() for unit in ['kg', 'lbs']):
                validation_results['issues_found'].append(
                    f"Profile {i}: Invalid weight format: {profile['weight']}"
                )
    
    # Calculate field statistics
    for field in profiles[0].keys():
        filled_values = sum(1 for p in profiles if p.get(field))
        validation_results['field_stats'][field] = {
            'filled': filled_values,
            'empty': len(profiles) - filled_values,
            'completion_rate': (filled_values / len(profiles)) * 100
        }
    
    # Print validation summary
    print("\nData Validation Summary:")
    print(f"Total Profiles: {validation_results['total_profiles']}")
    print(f"Validation Passed: {validation_results['validation_passed']}")
    print("\nField Completion Rates:")
    for field, stats in validation_results['field_stats'].items():
        print(f"{field}: {stats['completion_rate']:.1f}%")
    
    if validation_results['issues_found']:
        print("\nIssues Found:")
        for issue in validation_results['issues_found'][:5]:  # Show first 5 issues
            print(f"- {issue}")
        if len(validation_results['issues_found']) > 5:
            print(f"... and {len(validation_results['issues_found']) - 5} more issues")
    
    return validation_results

# Example usage:
# After test scraping:
if test_profiles:
    print("\nValidating test profiles:")
    validation_results = validate_data(test_profiles)
    df = visualize_scraping_progress(test_profiles)