In [1]:
# In[1]: Import Required Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import json
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

# In[2]: ASX Energy Futures Scraper Class
class ASXEnergyFuturesScraper:
    """
    A comprehensive scraper for ASX Energy futures data from https://asxenergy.com.au/futures_au
    """
    
    def __init__(self, headless: bool = True, wait_timeout: int = 20):
        """
        Initialize the scraper with Chrome WebDriver
        
        Args:
            headless (bool): Run browser in headless mode (no GUI)
            wait_timeout (int): Maximum time to wait for elements to load
        """
        self.url = "https://asxenergy.com.au/futures_au"
        self.wait_timeout = wait_timeout
        self.driver = None
        self.setup_driver(headless)
        
    def setup_driver(self, headless: bool):
        """Setup Chrome WebDriver with optimal options"""
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        
        try:
            # Initialize the Chrome driver
            self.driver = webdriver.Chrome(options=chrome_options)
            print("✓ Chrome WebDriver initialized successfully")
        except Exception as e:
            print(f"✗ Error initializing WebDriver: {e}")
            print("Please ensure ChromeDriver is installed and in PATH")
            raise

    def load_page(self) -> bool:
        """
        Load the ASX Energy futures page and wait for data to load
        
        Returns:
            bool: True if page loaded successfully, False otherwise
        """
        try:
            print(f"Loading page: {self.url}")
            self.driver.get(self.url)
            
            # Wait for the main content to load
            wait = WebDriverWait(self.driver, self.wait_timeout)
            
            # Wait for tables to be present
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            
            # Additional wait for dynamic content to fully load
            time.sleep(3)
            
            print("✓ Page loaded successfully")
            return True
            
        except Exception as e:
            print(f"✗ Error loading page: {e}")
            return False

    def extract_table_data(self, table_element) -> List[Dict]:
        """
        Extract data from a single table element
        
        Args:
            table_element: Selenium WebElement representing a table
            
        Returns:
            List[Dict]: List of dictionaries containing row data
        """
        data = []
        
        try:
            # Find header row
            header_row = table_element.find_element(By.TAG_NAME, "thead").find_element(By.TAG_NAME, "tr")
            headers = [th.text.strip() for th in header_row.find_elements(By.TAG_NAME, "th")]
            
            # Find data rows
            tbody = table_element.find_element(By.TAG_NAME, "tbody")
            rows = tbody.find_elements(By.TAG_NAME, "tr")
            
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) == len(headers):
                    row_data = {}
                    for i, cell in enumerate(cells):
                        row_data[headers[i]] = cell.text.strip()
                    data.append(row_data)
                    
        except Exception as e:
            print(f"Warning: Error extracting table data: {e}")
            
        return data

    def scrape_all_futures_data(self) -> Dict[str, pd.DataFrame]:
        """
        Scrape all futures data from the page
        
        Returns:
            Dict[str, pd.DataFrame]: Dictionary with section names as keys and DataFrames as values
        """
        if not self.load_page():
            return {}
        
        all_data = {}
        
        try:
            # Find all sections/tables on the page
            tables = self.driver.find_elements(By.TAG_NAME, "table")
            print(f"Found {len(tables)} tables on the page")
            
            # Common section names based on the image provided
            section_names = [
                "Base_Month", "Base_Quarter", "Base_Strip", "Calendar", 
                "Caps_Strip", "Peak_Quarter", "Peak_Strip"
            ]
            
            for i, table in enumerate(tables):
                try:
                    # Try to find section title near the table
                    section_name = f"Table_{i+1}"
                    if i < len(section_names):
                        section_name = section_names[i]
                    
                    # Extract data from the table
                    table_data = self.extract_table_data(table)
                    
                    if table_data:
                        df = pd.DataFrame(table_data)
                        all_data[section_name] = df
                        print(f"✓ Extracted {len(table_data)} rows from {section_name}")
                    
                except Exception as e:
                    print(f"Warning: Error processing table {i+1}: {e}")
                    continue
            
            return all_data
            
        except Exception as e:
            print(f"✗ Error scraping data: {e}")
            return {}

    def get_specific_contracts(self, contract_types: List[str] = None) -> pd.DataFrame:
        """
        Get data for specific contract types
        
        Args:
            contract_types (List[str]): List of contract types to filter
            
        Returns:
            pd.DataFrame: Combined DataFrame with specified contracts
        """
        all_data = self.scrape_all_futures_data()
        
        if contract_types:
            filtered_data = {}
            for contract_type in contract_types:
                if contract_type in all_data:
                    filtered_data[contract_type] = all_data[contract_type]
            all_data = filtered_data
        
        # Combine all data into a single DataFrame
        combined_df = pd.DataFrame()
        
        for section_name, df in all_data.items():
            df['Section'] = section_name
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        
        return combined_df

    def save_data(self, data: Dict[str, pd.DataFrame], output_format: str = 'excel'):
        """
        Save scraped data to files
        
        Args:
            data (Dict[str, pd.DataFrame]): Dictionary of DataFrames to save
            output_format (str): 'excel', 'csv', or 'json'
        """
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        
        if output_format.lower() == 'excel':
            filename = f"asx_energy_futures_{timestamp}.xlsx"
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                for sheet_name, df in data.items():
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
            print(f"✓ Data saved to {filename}")
            
        elif output_format.lower() == 'csv':
            for section_name, df in data.items():
                filename = f"asx_energy_{section_name}_{timestamp}.csv"
                df.to_csv(filename, index=False)
                print(f"✓ Data saved to {filename}")
                
        elif output_format.lower() == 'json':
            filename = f"asx_energy_futures_{timestamp}.json"
            combined_data = {}
            for section_name, df in data.items():
                combined_data[section_name] = df.to_dict('records')
            
            with open(filename, 'w') as f:
                json.dump(combined_data, f, indent=2)
            print(f"✓ Data saved to {filename}")

    def close(self):
        """Close the WebDriver"""
        if self.driver:
            self.driver.quit()
            print("✓ WebDriver closed")

# In[3]: Usage Example - Basic Scraping
def main_example():
    """Example usage of the ASX Energy Futures Scraper"""
    
    # Initialize the scraper
    scraper = ASXEnergyFuturesScraper(headless=True)
    
    try:
        # Scrape all futures data
        print("Starting data scraping...")
        all_data = scraper.scrape_all_futures_data()
        
        # Display summary
        print("\n" + "="*50)
        print("SCRAPING SUMMARY")
        print("="*50)
        
        for section_name, df in all_data.items():
            print(f"{section_name}: {len(df)} rows, {len(df.columns)} columns")
            if not df.empty:
                print(f"  Columns: {list(df.columns)}")
            print()
        
        # Save data in multiple formats
        if all_data:
            scraper.save_data(all_data, 'excel')
            scraper.save_data(all_data, 'csv')
            
            # Display sample data from first section
            if all_data:
                first_section = list(all_data.keys())[0]
                print(f"Sample data from {first_section}:")
                print(all_data[first_section].head())
        
    except Exception as e:
        print(f"✗ Error in main execution: {e}")
    
    finally:
        scraper.close()

# In[4]: Advanced Usage - Specific Contract Types
def scrape_specific_contracts():
    """Example of scraping specific contract types"""
    
    scraper = ASXEnergyFuturesScraper(headless=True)
    
    try:
        # Get data for specific contract types
        contract_types = ["Base_Month", "Peak_Quarter", "Base_Strip"]
        specific_data = scraper.get_specific_contracts(contract_types)
        
        print("Specific contracts data:")
        print(specific_data.head(10))
        
        # Save specific data
        specific_data.to_csv("asx_specific_contracts.csv", index=False)
        print("✓ Specific contracts data saved")
        
    except Exception as e:
        print(f"✗ Error: {e}")
    
    finally:
        scraper.close()

# In[5]: Real-time Monitoring Function
def monitor_futures_data(interval_minutes: int = 15):
    """
    Monitor futures data at regular intervals
    
    Args:
        interval_minutes (int): Interval between scraping sessions in minutes
    """
    import schedule
    from datetime import datetime
    
    def scrape_and_save():
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n[{timestamp}] Starting scheduled scraping...")
        
        scraper = ASXEnergyFuturesScraper(headless=True)
        try:
            data = scraper.scrape_all_futures_data()
            if data:
                scraper.save_data(data, 'excel')
                print(f"[{timestamp}] ✓ Data scraped and saved successfully")
            else:
                print(f"[{timestamp}] ✗ No data retrieved")
        except Exception as e:
            print(f"[{timestamp}] ✗ Error: {e}")
        finally:
            scraper.close()
    
    # Schedule the scraping
    schedule.every(interval_minutes).minutes.do(scrape_and_save)
    
    print(f"Monitoring started - will scrape every {interval_minutes} minutes")
    print("Press Ctrl+C to stop monitoring")
    
    try:
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
    except KeyboardInterrupt:
        print("\nMonitoring stopped")

# In[6]: Run the scraper
if __name__ == "__main__":
    # Uncomment the function you want to run:
    
    main_example()  # Basic scraping example
    # scrape_specific_contracts()  # Specific contracts only
    # monitor_futures_data(15)  # Real-time monitoring every 15 minutes

✓ Chrome WebDriver initialized successfully
Starting data scraping...
Loading page: https://asxenergy.com.au/futures_au
✓ Page loaded successfully
Found 30 tables on the page
  (Session info: chrome=137.0.7151.68); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0x7ff671a7fea5+79173]
	GetHandleVerifier [0x0x7ff671a7ff00+79264]
	(No symbol) [0x0x7ff671839e5a]
	(No symbol) [0x0x7ff671890586]
	(No symbol) [0x0x7ff67189083c]
	(No symbol) [0x0x7ff671882e4c]
	(No symbol) [0x0x7ff6718b89af]
	(No symbol) [0x0x7ff671882d16]
	(No symbol) [0x0x7ff6718b8b80]
	(No symbol) [0x0x7ff6718e100d]
	(No symbol) [0x0x7ff6718b8743]
	(No symbol) [0x0x7ff6718814c1]
	(No symbol) [0x0x7ff671882253]
	GetHandleVerifier [0x0x7ff671d4a2dd+3004797]
	GetHandleVerifier [0x0x7ff671d4472d+2981325]
	GetHandleVerifier [0x0x7ff671d63380+3107360]
	GetHandleVerifier [0x0x7ff671a9aa2e+188622]
	GetH