In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
print("Initializing Chrome WebDriver...")
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

try:
    # Navigate to the website
    url = "https://www.worldgovernmentbonds.com/world-credit-ratings/"
    print(f"Navigating to {url}...")
    driver.get(url)
    
    # Print the page title to verify we're on the right page
    print(f"Page title: {driver.title}")
    
    # Take a screenshot to see what's on the page
    print("Taking a screenshot...")
    driver.save_screenshot("webpage.png")
    print("Screenshot saved as 'webpage.png' - check this file to see what's on the page")
    
    # Wait for any table to be present
    print("Waiting for tables to load...")
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    
    # Find all tables
    tables = driver.find_elements(By.TAG_NAME, "table")
    print(f"Found {len(tables)} tables on the page")
    
    # If no tables are found, look for div elements that might contain our data
    if len(tables) == 0:
        print("No tables found, looking for divs with table-like content...")
        divs = driver.find_elements(By.CSS_SELECTOR, "div[class*='table'], div[class*='grid']")
        print(f"Found {len(divs)} potential table-like divs")
        
        if len(divs) > 0:
            # Use the first div that might be a table
            largest_div = max(divs, key=lambda div: len(div.text))
            print(f"Using the largest div with text length: {len(largest_div.text)}")
            
            # Try to extract data from this div
            rows_elements = largest_div.find_elements(By.CSS_SELECTOR, "div[class*='row']")
            if len(rows_elements) > 0:
                print(f"Found {len(rows_elements)} row-like elements")
                # Process these as rows...
            else:
                print("No row-like elements found in div")
    
    # If we found tables, proceed with extraction
    if len(tables) > 0:
        # Find the table that is most likely to contain our data
        # Strategy: Use the table with the most rows or that contains "rating" in its text
        target_table = None
        max_rows = 0
        
        for i, table in enumerate(tables):
            rows = table.find_elements(By.TAG_NAME, "tr")
            print(f"Table {i+1} has {len(rows)} rows")
            
            # Check if table has headers related to credit ratings
            headers = [h.text.lower() for h in table.find_elements(By.TAG_NAME, "th")]
            has_rating_headers = any("rating" in h for h in headers) or any("moody" in h for h in headers) or any("fitch" in h for h in headers)
            
            if has_rating_headers:
                print(f"Table {i+1} has headers related to ratings!")
                target_table = table
                break
            
            if len(rows) > max_rows:
                max_rows = len(rows)
                target_table = table
        
        if target_table is None and len(tables) > 0:
            # If we couldn't identify a specific table, use the first one
            target_table = tables[0]
            print("Using the first table found")
        
        # Extract headers from the target table
        print("Extracting table headers...")
        headers = []
        header_elements = target_table.find_elements(By.TAG_NAME, "th")
        
        for header in header_elements:
            header_text = header.text.strip()
            if header_text:  # Only add non-empty headers
                headers.append(header_text)
            else:
                # If header is empty, use a placeholder
                headers.append(f"Column_{len(headers)+1}")
        
        print(f"Headers: {headers}")
        
        # Extract rows
        print("Extracting table rows...")
        rows = []
        row_elements = target_table.find_elements(By.TAG_NAME, "tr")
        
        for row in row_elements:
            # Skip header row
            if row.find_elements(By.TAG_NAME, "th"):
                continue
                
            # Get all cells in the row
            cell_elements = row.find_elements(By.TAG_NAME, "td")
            
            if cell_elements:
                row_data = []
                for cell in cell_elements:
                    row_data.append(cell.text.strip())
                
                if any(row_data):  # Only add non-empty rows
                    rows.append(row_data)
        
        print(f"Extracted {len(rows)} data rows")
        
        # Handle case where headers count doesn't match column count
        if rows and len(headers) != len(rows[0]):
            print(f"Warning: Headers count ({len(headers)}) doesn't match column count in first row ({len(rows[0])})")
            
            if len(headers) > len(rows[0]):
                headers = headers[:len(rows[0])]  # Truncate headers
            else:
                # Add generic headers if needed
                while len(headers) < len(rows[0]):
                    headers.append(f"Column_{len(headers)+1}")
        
        # Create DataFrame
        if rows:
            df = pd.DataFrame(rows, columns=headers)
            print(f"Created DataFrame with shape {df.shape}")
            
            # Save to CSV
            output_file = "world_credit_ratings.csv"
            df.to_csv(output_file, index=False)
            print(f"Data saved to {output_file}")
            
            # Print preview
            print("Preview of the data:")
            print(df.head())
        else:
            print("No data rows found, cannot create DataFrame")
    else:
        print("No tables found on the page after waiting")
        
        # Save page source for debugging
        with open("page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page source to 'page_source.html' for debugging")

except Exception as e:
    print(f"An error occurred: {e}")
    
    # Save the page source in case of error
    try:
        with open("error_page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page source to 'error_page_source.html' for debugging")
    except:
        print("Could not save page source")

finally:
    # Close the driver
    driver.quit()
    print("WebDriver closed")

Initializing Chrome WebDriver...


WebDriverException: Message: Service /Users/mac/.wdm/drivers/chromedriver/mac64/135.0.7049.114/chromedriver-mac-arm64/chromedriver unexpectedly exited. Status code was: -9


In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
print("Initializing Chrome WebDriver...")
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

try:
    # Navigate to the website
    url = "https://www.worldgovernmentbonds.com/world-credit-ratings/"
    print(f"Navigating to {url}...")
    driver.get(url)
    
    # Print the page title to verify we're on the right page
    print(f"Page title: {driver.title}")
    
    # Take a screenshot to see what's on the page
    print("Taking a screenshot...")
    driver.save_screenshot("webpage.png")
    print("Screenshot saved as 'webpage.png'")
    
    # Wait for any table to be present
    print("Waiting for tables to load...")
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    
    # Find all tables
    tables = driver.find_elements(By.TAG_NAME, "table")
    print(f"Found {len(tables)} tables on the page")
    
    # Find the table that is most likely to contain our data
    target_table = None
    max_rows = 0
    
    for i, table in enumerate(tables):
        rows = table.find_elements(By.TAG_NAME, "tr")
        print(f"Table {i+1} has {len(rows)} rows")
        
        # Check if table has headers related to credit ratings
        headers = [h.text.lower() for h in table.find_elements(By.TAG_NAME, "th")]
        has_rating_headers = any("rating" in h for h in headers) or any("moody" in h for h in headers) or any("fitch" in h for h in headers)
        
        if has_rating_headers:
            print(f"Table {i+1} has headers related to ratings!")
            target_table = table
            break
        
        if len(rows) > max_rows:
            max_rows = len(rows)
            target_table = table
    
    if target_table is None and len(tables) > 0:
        # If we couldn't identify a specific table, use the first one
        target_table = tables[0]
        print("Using the first table found")
    
    # Extract headers from the target table
    print("Extracting table headers...")
    headers = []
    header_elements = target_table.find_elements(By.TAG_NAME, "th")
    
    for header in header_elements:
        header_text = header.text.strip()
        if header_text:  # Only add non-empty headers
            headers.append(header_text)
        else:
            # If header is empty, use a placeholder
            headers.append(f"Column_{len(headers)+1}")
    
    print(f"Headers: {headers}")
    
    # Extract rows with color information
    print("Extracting table rows with color information...")
    rows = []
    row_elements = target_table.find_elements(By.TAG_NAME, "tr")
    
    for row in row_elements:
        # Skip header row
        if row.find_elements(By.TAG_NAME, "th"):
            continue
            
        # Get all cells in the row
        cell_elements = row.find_elements(By.TAG_NAME, "td")
        
        if cell_elements:
            row_data = []
            color_data = []  # To store color information
            
            for cell in cell_elements:
                # Get text content
                cell_text = cell.text.strip()
                row_data.append(cell_text)
                
                # Check for background color
                try:
                    # Get the background color using JavaScript
                    bg_color = driver.execute_script(
                        "return window.getComputedStyle(arguments[0]).backgroundColor", 
                        cell
                    )
                    
                    # Check if the background has a red or green color
                    if "rgb(255," in bg_color.lower():  # Red has high R value
                        color_data.append("red")
                    elif "rgb(0, 128" in bg_color.lower() or "rgb(0, 255" in bg_color.lower():  # Green has high G value
                        color_data.append("green")
                    else:
                        color_data.append("")
                except:
                    color_data.append("")
                    
                # Alternative: Check for colored elements inside the cell
                colored_elements = cell.find_elements(By.CSS_SELECTOR, 
                                                     ".red, .green, [style*='color:red'], [style*='color:green'], [style*='background-color:red'], [style*='background-color:green']")
                if colored_elements:
                    for elem in colored_elements:
                        style = elem.get_attribute("style") or ""
                        class_name = elem.get_attribute("class") or ""
                        
                        if "red" in style.lower() or "red" in class_name.lower():
                            color_data[-1] = "red"
                        elif "green" in style.lower() or "green" in class_name.lower():
                            color_data[-1] = "green"
            
            # Only add non-empty rows
            if any(row_data):
                # Combine row data with color information
                enhanced_row_data = []
                for i, value in enumerate(row_data):
                    color_info = color_data[i] if i < len(color_data) else ""
                    if color_info:
                        # Add color indicator to the value
                        enhanced_row_data.append(f"{value} [{color_info}]")
                    else:
                        enhanced_row_data.append(value)
                
                rows.append(enhanced_row_data)
    
    print(f"Extracted {len(rows)} data rows")
    
    # Handle case where headers count doesn't match column count
    if rows and len(headers) != len(rows[0]):
        print(f"Warning: Headers count ({len(headers)}) doesn't match column count in first row ({len(rows[0])})")
        
        if len(headers) > len(rows[0]):
            headers = headers[:len(rows[0])]  # Truncate headers
        else:
            # Add generic headers if needed
            while len(headers) < len(rows[0]):
                headers.append(f"Column_{len(headers)+1}")
    
    # Create DataFrame
    if rows:
        df = pd.DataFrame(rows, columns=headers)
        print(f"Created DataFrame with shape {df.shape}")
        
        # Save to CSV
        output_file = "world_credit_ratings_with_colors.csv"
        df.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
        
        # Print preview
        print("Preview of the data:")
        print(df.head())
    else:
        print("No data rows found, cannot create DataFrame")

except Exception as e:
    print(f"An error occurred: {e}")
    
    # Save the page source in case of error
    try:
        with open("error_page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page source to 'error_page_source.html' for debugging")
    except:
        print("Could not save page source")

finally:
    # Close the driver
    driver.quit()
    print("WebDriver closed")

Initializing Chrome WebDriver...


WebDriverException: Message: Service /Users/mac/.wdm/drivers/chromedriver/mac64/135.0.7049.114/chromedriver-mac-arm64/chromedriver unexpectedly exited. Status code was: -9
