In [1]:
%pip install selenium webdriver-manager beautifulsoup4 lxml pandas

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
Installing collected packages: requests, webdriver-manager
  Attempting uninstall: requests
    Found existing installation: requests 2.26.0
    Uninstalling requests-2.26.0:
      Successfully uninstalled requests-2.26.0
Successfully installed requests-2.32.5 webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.4 requires pathlib, which is not installed.
anaconda-project 0.10.1 requires ruamel-yaml, which is not installed.


In [2]:
"""
Juniper Networks ACX Series EOL Table Scraper using Selenium
Extracts tables from dynamically loaded JavaScript content
"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

URL = "https://support.juniper.net/support/eol/product/acx_series/"

def _drop_unnamed_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Remove unnamed columns from DataFrame"""
    return df.loc[:, ~df.columns.astype(str).str.match(r"^Unnamed")]

def scrape_acx_tables_selenium(url: str = URL, headless: bool = True, wait_time: int = 10):
    """
    Scrape tables from Juniper Networks EOL page using Selenium
    
    Args:
        url: The URL to scrape
        headless: Run browser in headless mode (True) or visible mode (False)
        wait_time: Time in seconds to wait for tables to load
    
    Returns:
        List of DataFrames containing the scraped tables
    """
    # Setup Chrome options
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"
    )
    
    # Initialize driver
    driver = webdriver.Chrome(options=chrome_options)
    
    try:
        print(f"Loading page: {url}")
        driver.get(url)
        
        # Wait for tables to load
        print(f"Waiting {wait_time} seconds for JavaScript to execute...")
        time.sleep(wait_time)
        
        # Try to explicitly wait for table elements
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            print("✓ Tables detected on page")
        except Exception as e:
            print(f"⚠ Warning: {str(e)}")
            print("Proceeding with available content...")
        
        # Get page source after JavaScript execution
        html = driver.page_source
        soup = BeautifulSoup(html, "lxml")
        
        # Find all tables
        table_nodes = soup.find_all("table")
        print(f"Found {len(table_nodes)} table(s)")
        
        if len(table_nodes) == 0:
            print("\n❌ No tables found!")
            print("The page may use a different loading mechanism.")
            print("Try increasing wait_time or check browser console for errors.")
            return []
        
        # Parse tables using pandas
        dfs = pd.read_html(html)
        
        out = []
        for i, (df, table) in enumerate(zip(dfs, table_nodes), start=1):
            print(f"\nProcessing Table {i}:")
            print(f"  Shape: {df.shape}")
            print(f"  Columns: {list(df.columns)}")
            
            # Clean DataFrame
            df = _drop_unnamed_cols(df)
            
            # Extract hyperlinks from first column
            links = []
            rows = table.select("tr")[1:]  # Skip header row
            for tr in rows:
                td = tr.find("td")
                a = td.find("a") if td else None
                links.append(a["href"] if a and a.has_attr("href") else None)
            
            # Align link list length with df length
            if len(links) != len(df):
                links = (links + [None] * len(df))[:len(df)]
            
            # Insert URL column after first column
            if len(df.columns) > 0:
                first_col = df.columns[0]
                df.insert(1, f"{first_col}_url", links)
            
            out.append(df)
        
        return out
        
    except Exception as e:
        print(f"\n❌ Error during scraping: {str(e)}")
        import traceback
        traceback.print_exc()
        return []
        
    finally:
        driver.quit()
        print("\n✓ Browser closed")


if __name__ == "__main__":
    print("="*70)
    print("JUNIPER NETWORKS ACX SERIES EOL TABLE SCRAPER")
    print("="*70 + "\n")
    
    # Run the scraper
    tables = scrape_acx_tables_selenium(headless=True, wait_time=10)
    
    if tables:
        print(f"\n{'='*70}")
        print("✓ EXTRACTION SUCCESSFUL!")
        print("="*70 + "\n")
        
        # Save each table to CSV
        for idx, df in enumerate(tables, start=1):
            filename = f"juniper_acx_series_table_{idx}.csv"
            df.to_csv(filename, index=False)
            print(f"✓ Saved: {filename} ({df.shape[0]} rows × {df.shape[1]} columns)")
        
        print(f"\n{'='*70}")
        print("DATA PREVIEW")
        print("="*70 + "\n")
        
        # Display preview of each table
        for idx, df in enumerate(tables, start=1):
            print(f"\n--- Table {idx}: {df.shape} ---")
            print(df.head(3).to_string())
            print()
            
    else:
        print("\n❌ EXTRACTION FAILED")
        print("No tables were extracted from the page.")
        print("\nTroubleshooting tips:")
        print("1. Increase wait_time parameter (try 15-20 seconds)")
        print("2. Run with headless=False to see what's loading")
        print("3. Check if the page requires authentication")
        print("4. Inspect browser Network tab for the actual API endpoint")


JUNIPER NETWORKS ACX SERIES EOL TABLE SCRAPER

Loading page: https://support.juniper.net/support/eol/product/acx_series/
Waiting 10 seconds for JavaScript to execute...
✓ Tables detected on page
Found 6 table(s)

Processing Table 1:
  Shape: (1, 8)
  Columns: ['Technical Support Bulletin (TSB)', 'SKU Description', 'EOL Announced', 'Last Order', 'End of Hardware Failure Analysis', 'End of Engineering', 'Last Software Version', 'End of Support']

Processing Table 2:
  Shape: (10, 7)
  Columns: ['Product', 'EOL Announced', 'Last Order', 'Last Date to Convert Warranty', 'Same Day Support Discontinued', 'Next Day Support Discontinued', 'End of Support']

Processing Table 3:
  Shape: (2, 3)
  Columns: [0, 1, 2]

Processing Table 4:
  Shape: (1, 2)
  Columns: [0, 1]

Processing Table 5:
  Shape: (1, 2)
  Columns: [0, 1]

✓ Browser closed

✓ EXTRACTION SUCCESSFUL!

✓ Saved: juniper_acx_series_table_1.csv (1 rows × 9 columns)
✓ Saved: juniper_acx_series_table_2.csv (10 rows × 8 columns)
✓ Saved