In [1]:
# Import required libraries
import pandas as pd
import os
import subprocess
import sys
from IPython.display import display, HTML

# First, check if the necessary tools are installed
def check_and_install_tools():
    """Locate ChromeDriver, offering a Homebrew install when it is missing.

    The lookup order is: executables on ``PATH`` first, then the Homebrew
    binary location ``/opt/homebrew/bin/chromedriver``. If neither yields a
    driver, the user is asked on stdin whether to run
    ``brew install chromedriver``.

    Returns:
        The path to the ChromeDriver executable as a string, or ``None`` when
        no driver is available (the user declined, the install failed, or any
        other error occurred). Errors are reported with ``print`` rather than
        raised, so callers only need to check for ``None``.
    """
    # Local import: only this function needs shutil.
    import shutil

    homebrew_path = "/opt/homebrew/bin/chromedriver"

    print("Checking if ChromeDriver is installed...")

    try:
        # shutil.which performs the same PATH lookup as the `which` command
        # without spawning a process.
        chromedriver_path = shutil.which("chromedriver")
        if chromedriver_path:
            print(f"ChromeDriver found at: {chromedriver_path}")
            return chromedriver_path

        print("ChromeDriver not found in PATH.")

        # Check if it's installed via Homebrew but not in PATH
        if os.path.exists(homebrew_path):
            print("ChromeDriver found in Homebrew directory.")
            return homebrew_path

        print("Would you like to install ChromeDriver using Homebrew? (y/n)")
        response = input()
        if response.strip().lower() not in ("y", "yes"):
            print("Please install ChromeDriver manually:")
            print("1. Run: brew install chromedriver")
            print("2. Or download from: https://sites.google.com/chromium.org/driver/")
            return None

        print("Installing ChromeDriver using Homebrew...")
        subprocess.run(["brew", "install", "chromedriver"], check=True)
        print("ChromeDriver installed successfully!")

        # The install does not change this process's PATH, so a PATH lookup
        # can still come back empty; fall back to the Homebrew location
        # instead of returning an empty path.
        installed_path = shutil.which("chromedriver")
        if installed_path:
            return installed_path
        if os.path.exists(homebrew_path):
            return homebrew_path

        print("ChromeDriver was installed but could not be located.")
        return None
    except Exception as e:
        # Deliberately broad: this is an interactive best-effort setup step
        # (input() may be unavailable, brew may be missing or fail).
        print(f"An error occurred: {e}")
        print("Please install ChromeDriver manually.")
        return None


def _color_marker(class_name, style):
    """Return " [red]", " [green]" or "" for an element's class/style values.

    Matching is done on whole alphabetic words, so a class such as
    ``credit`` or ``centered`` is not mistaken for ``red``. Red takes
    precedence over green when both are present. ``None`` values are
    treated as empty strings.
    """
    words = set()
    for value in (class_name, style):
        # Turn every non-letter into a separator, then split into words.
        normalized = "".join(
            ch if ch.isalpha() else " " for ch in (value or "").lower()
        )
        words.update(normalized.split())

    if "red" in words:
        return " [red]"
    if "green" in words:
        return " [green]"
    return ""


def _cell_text_with_color(cell, by):
    """Return a cell's stripped text, suffixed with a colour marker if any.

    The cell's own ``class``/``style`` attributes are checked first; only when
    they carry no colour are its descendants inspected, and the first
    descendant that yields a marker wins. ``by`` is Selenium's ``By`` class,
    passed in because Selenium is imported lazily by the caller.
    """
    cell_text = cell.text.strip()

    try:
        marker = _color_marker(
            cell.get_attribute("class"), cell.get_attribute("style")
        )
        if not marker:
            # Broad substring pre-filter; _color_marker does the exact match.
            candidates = cell.find_elements(
                by.CSS_SELECTOR,
                "[class*='red'], [class*='green'], "
                "[style*='red'], [style*='green']",
            )
            for candidate in candidates:
                marker = _color_marker(
                    candidate.get_attribute("class"),
                    candidate.get_attribute("style"),
                )
                if marker:
                    break
    except Exception:
        # Colour is a nice-to-have: on any lookup error keep the plain text.
        marker = ""

    return cell_text + marker


def _fit_headers(headers, column_count):
    """Return a copy of ``headers`` truncated or padded to ``column_count``.

    Missing names are filled with ``Column_<position>`` placeholders.
    """
    fitted = list(headers[:column_count])
    while len(fitted) < column_count:
        fitted.append(f"Column_{len(fitted)+1}")
    return fitted


# Function to scrape the website - will be called after our setup
def scrape_credit_ratings(chromedriver_path=None):
    """Scrape the largest table on the world credit ratings page.

    Starts a headless Chrome session, loads the page, saves a screenshot to
    ``webpage.png``, waits up to 30 seconds for a ``<table>`` to appear and
    extracts the table that has the most rows. Cells whose class/style (or
    that of a descendant) names the colour red or green get a `` [red]`` or
    `` [green]`` suffix.

    Args:
        chromedriver_path: Optional path to the ChromeDriver executable. When
            omitted, Selenium's default driver resolution is used.

    Returns:
        A ``pandas.DataFrame`` with one row per non-empty table row, or
        ``None`` when no table or data rows are found or any error occurs
        (errors are printed, not raised).
    """
    try:
        # First, import Selenium libraries (lazily, so the module can be
        # loaded even when Selenium is not installed).
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC

        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")  # Important for Mac

        # Set up the Chrome service
        if chromedriver_path:
            service = Service(executable_path=chromedriver_path)
        else:
            service = Service()

        print("Starting Chrome WebDriver...")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            # Navigate to the website
            url = "https://www.worldgovernmentbonds.com/world-credit-ratings/"
            print(f"Navigating to {url}...")
            driver.get(url)

            print(f"Page title: {driver.title}")

            # Taken before the wait so a screenshot exists even on timeout.
            screenshot_path = "webpage.png"
            driver.save_screenshot(screenshot_path)
            print(f"Screenshot saved to {screenshot_path}")

            print("Waiting for tables to load...")
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            tables = driver.find_elements(By.TAG_NAME, "table")
            print(f"Found {len(tables)} tables on the page")

            if not tables:
                print("No tables found on the page.")
                return None

            # Use the table with the most rows
            target_table = max(
                tables, key=lambda t: len(t.find_elements(By.TAG_NAME, "tr"))
            )

            # Extract headers, substituting a placeholder for blank ones
            headers = []
            for header in target_table.find_elements(By.TAG_NAME, "th"):
                headers.append(header.text.strip() or f"Column_{len(headers)+1}")

            print(f"Found {len(headers)} headers")

            # Extract rows
            rows = []
            for row in target_table.find_elements(By.TAG_NAME, "tr"):
                # Skip header rows (any row containing a <th>)
                if row.find_elements(By.TAG_NAME, "th"):
                    continue

                row_data = [
                    _cell_text_with_color(cell, By)
                    for cell in row.find_elements(By.TAG_NAME, "td")
                ]

                # Only add non-empty rows
                if any(text.strip() for text in row_data):
                    rows.append(row_data)

            print(f"Extracted {len(rows)} data rows")

            if not rows:
                print("No data rows found")
                return None

            # Fit headers to the widest row: DataFrame pads shorter rows but
            # raises if any row has more cells than there are column names.
            column_count = max(len(row_data) for row_data in rows)
            if len(headers) != column_count:
                print(
                    f"Warning: Headers count ({len(headers)}) doesn't match "
                    f"column count of the widest row ({column_count})"
                )
                headers = _fit_headers(headers, column_count)

            return pd.DataFrame(rows, columns=headers)

        finally:
            # Always close the driver
            driver.quit()
            print("WebDriver closed")

    except Exception as e:
        # Top-level boundary for the scrape: report and signal failure.
        print(f"An error occurred: {e}")
        return None

# Main execution for Jupyter
# Locate (or offer to install) ChromeDriver before attempting to scrape.
# Names are intentionally left at module level so later notebook cells can
# keep using `chromedriver_path` and `df`.
chromedriver_path = check_and_install_tools()

if chromedriver_path:
    # If ChromeDriver was found or installed, try to scrape
    print(f"Using ChromeDriver at: {chromedriver_path}")

    # Make sure it's executable (common issue on Mac). This is best-effort:
    # a failing or missing `chmod` must not stop the scrape, but unrelated
    # errors and KeyboardInterrupt should no longer be swallowed.
    try:
        subprocess.run(["chmod", "+x", chromedriver_path], check=True)
        print("Fixed permissions for ChromeDriver")
    except (subprocess.CalledProcessError, OSError):
        print("Could not fix permissions - might still work")

    # Perform the scraping
    df = scrape_credit_ratings(chromedriver_path)

    if df is not None:
        print("\nScraping successful! Here's a preview of the data:")
        display(df.head())

        # Save to CSV
        csv_path = "world_credit_ratings.csv"
        df.to_csv(csv_path, index=False)
        print(f"\nData saved to {csv_path}")
    else:
        print("\nScraping failed. No data was retrieved.")
else:
    print("\nCannot proceed without ChromeDriver.")

Checking if ChromeDriver is installed...
ChromeDriver not found in PATH.
Would you like to install ChromeDriver using Homebrew? (y/n)


 y


Installing ChromeDriver using Homebrew...


==> Auto-updating Homebrew...
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
==> Downloading https://ghcr.io/v2/homebrew/portable-ruby/portable-ruby/blobs/sha256:40e7f5d7514a7e9757facdd39006f7a351d3d7986d3a228be13c8b1c3216727b
######################################################################## 100.0%
==> Pouring portable-ruby-3.4.3.arm64_big_sur.bottle.tar.gz
==> Auto-updated Homebrew!
Updated 4 taps (homebrew/services, ethereum/ethereum, homebrew/core and homebrew/cask).
==> New Formulae
ab-av1
acme.sh
acronym
ad
adapterremoval
adaptivecpp
add-determinism
algolia
alive2
alloy
anubis
anyquery
apache-flink@1
api-linter
aqtinstall
aqua
arelo
astroterm
async_simple
atac
azure-storage-blobs-cpp
azure-storage-common-cpp
azurehound
babelfish
bacon-ls
bagels
bazel@7
beanquery
behaviortree.cpp
bender
binocle
bkmr
block-goose-cli
bold
bombardier
bpmnlint
brename
bruno-cli
buf

==> Downloading https://storage.googleapis.com/chrome-for-testing-public/136.0.7103.92/mac-arm64/chromedriver-mac-arm64.zip
==> Installing Cask chromedriver
==> Linking Binary 'chromedriver' to '/opt/homebrew/bin/chromedriver'
🍺  chromedriver was successfully installed!
==> `brew cleanup` has not been run in the last 30 days, running now...
Disable this behaviour by setting HOMEBREW_NO_INSTALL_CLEANUP.
Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
Removing: /Users/mac/Library/Caches/Homebrew/aom_bottle_manifest--3.11.0... (20.7KB)
Removing: /Users/mac/Library/Caches/Homebrew/aom--3.11.0... (3.9MB)
Removing: /Users/mac/Library/Caches/Homebrew/c-ares_bottle_manifest--1.34.3... (7.3KB)
Removing: /Users/mac/Library/Caches/Homebrew/c-ares--1.34.3... (295.0KB)
Removing: /Users/mac/Library/Caches/Homebrew/ca-certificates_bottle_manifest--2024-09-24... (1.9KB)
Removing: /Users/mac/Library/Caches/Homebrew/ca-certificates--2024-09-24... (132.6KB)
Removing: /Users/mac/Library/Cach