In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import pandas as pd
import time
import requests
import logging
import re
import os
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(f"zscaler_job_scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
                        logging.StreamHandler()
                    ])
logger = logging.getLogger(__name__)

# Function to scroll and load all jobs
def scroll_to_load_all(driver, max_scrolls=20, wait_time=1.5):
    """
    Scroll the page to load all content with a maximum number of scrolls
    For Greenhouse-based sites like ZScaler
    """
    scrolls = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    last_job_count = 0
    consecutive_no_change = 0
    
    logger.info("Starting to scroll to load all content...")
    
    while scrolls < max_scrolls:
        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(wait_time)  # Wait time for content to load
        
        # Take screenshot for debugging
        driver.save_screenshot(f"screenshots/zscaler_scroll_{scrolls+1}.png")
        
        # Try to find the "Load More" or similar buttons and click them
        try:
            load_more_buttons = driver.find_elements(By.XPATH, 
                "//button[contains(text(), 'Load More') or contains(text(), 'View More')]")
            if load_more_buttons:
                for button in load_more_buttons:
                    if button.is_displayed() and button.is_enabled():
                        driver.execute_script("arguments[0].click();", button)
                        logger.info("Clicked 'Load More' button")
                        time.sleep(wait_time + 1)  # Extra wait for new content
        except Exception as e:
            logger.info(f"No 'Load More' button found or error clicking it: {e}")
        
        # Check height and job count to determine if we've loaded all content
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        # Use Greenhouse's job opening selector
        job_count = len(driver.find_elements(By.CLASS_NAME, "opening"))
        
        logger.info(f"Scroll {scrolls+1}: Height {last_height} → {new_height}, Jobs found: {job_count}")
        
        # If no change in height and job count, we might have reached the end
        if new_height == last_height and job_count == last_job_count:
            consecutive_no_change += 1
            logger.info(f"No change detected ({consecutive_no_change}/3)")
            if consecutive_no_change >= 3:  # If no change for 3 consecutive scrolls
                logger.info("No more content loading after multiple scrolls. Stopping scroll operation.")
                break
        else:
            consecutive_no_change = 0
            
        last_height = new_height
        last_job_count = job_count
        scrolls += 1
    
    logger.info(f"Completed scrolling after {scrolls} scrolls. Found approximately {last_job_count} job items.")
    return last_job_count

# Function to handle pagination for ZScaler
def handle_pagination(driver, max_pages=10):
    """
    Handle pagination for ZScaler Greenhouse site
    Greenhouse often loads all jobs on a single page, but this handles pagination if present
    """
    page = 1
    all_jobs = []
    
    logger.info("Starting pagination handling for ZScaler...")
    
    while page <= max_pages:
        logger.info(f"Processing page {page}")
        
        # Take screenshot for debugging
        driver.save_screenshot(f"screenshots/zscaler_page_{page}.png")
        
        # Extract current page's jobs
        jobs_on_page = extract_job_listings_zscaler(driver)
        all_jobs.extend(jobs_on_page)
        logger.info(f"Found {len(jobs_on_page)} jobs on page {page}")
        
        # Look for next page button (if any)
        next_button = None
        try:
            next_buttons = driver.find_elements(By.XPATH, 
                "//a[contains(text(), 'Next') or contains(@class, 'next') or contains(@aria-label, 'Next')]")
            
            for button in next_buttons:
                if button.is_displayed() and not "disabled" in button.get_attribute("class"):
                    next_button = button
                    break
        except Exception:
            pass
        
        if not next_button:
            logger.info("No next page button found. Reached last page.")
            break
            
        # Click next page 
        try:
            driver.execute_script("arguments[0].click();", next_button)
            logger.info("Clicked next page button")
            time.sleep(3)  # Wait for page to load
            page += 1
            
            # Wait for job listings to reload
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "opening"))
                )
            except TimeoutException:
                logger.warning("Timed out waiting for jobs to load after pagination")
                break
                
        except Exception as e:
            logger.error(f"Error clicking next page: {e}")
            break
            
    return all_jobs

# Extract job information from ZScaler page
def extract_job_listings_zscaler(driver):
    """
    Extract job listings from ZScaler Greenhouse page
    """
    jobs_data = []
    
    # Greenhouse typically uses 'opening' class for job listings
    try:
        job_elements = driver.find_elements(By.CLASS_NAME, "opening")
        logger.info(f"Found {len(job_elements)} job elements on page")
        
        # Take a screenshot of the found elements (for debugging)
        if len(job_elements) > 0:
            try:
                driver.execute_script("arguments[0].style.border='3px solid red'", job_elements[0])
                driver.save_screenshot("screenshots/zscaler_job_elements_found.png")
                driver.execute_script("arguments[0].style.border=''", job_elements[0])
            except:
                pass
                
        for index, job in enumerate(job_elements):
            try:
                # Extract job title and link
                a_elem = job.find_element(By.TAG_NAME, "a")
                title = a_elem.text.strip()
                link = a_elem.get_attribute("href")
                
                # Extract location
                location_elem = job.find_element(By.CLASS_NAME, "location")
                location = location_elem.text.strip() if location_elem else "Not specified"
                
                # Extract department (if available)
                department = "Not specified"
                try:
                    department_elem = job.find_element(By.CLASS_NAME, "department")
                    department = department_elem.text.strip()
                except:
                    pass
                
                # Add job if we have title and link
                if title and title.strip() and link and link.strip():
                    # Check for duplicate before adding
                    is_duplicate = False
                    for existing_job in jobs_data:
                        if existing_job["Title"] == title and existing_job["Link"] == link:
                            is_duplicate = True
                            break
                    
                    if not is_duplicate:
                        jobs_data.append({
                            "Title": title,
                            "Location": location,
                            "Department": department,
                            "Link": link,
                            "Description": "",  # We'll get descriptions later
                            "Company": "ZScaler"
                        })
                        logger.info(f"Added job: {title} at {location}")
                
            except (StaleElementReferenceException, Exception) as e:
                logger.error(f"Error extracting job details for job {index+1}: {e}")
                continue
                
    except Exception as e:
        logger.error(f"Error finding job elements: {e}")
        driver.save_screenshot("screenshots/zscaler_extract_error.png")
    
    return jobs_data

# Function to get job descriptions
def get_job_descriptions(driver, jobs_data, max_descriptions=100):
    """
    Get job descriptions for ZScaler jobs by visiting their individual pages
    """
    logger.info(f"Getting descriptions for {len(jobs_data)} jobs (up to {max_descriptions})")
    
    # Create a dedicated folder for job description screenshots
    job_desc_folder = 'zscaler_job_description_screenshots'
    os.makedirs(job_desc_folder, exist_ok=True)
    
    # Store current URL to return to afterward
    original_url = driver.current_url
    original_window = driver.current_window_handle
    
    # Process up to max_descriptions
    for i, job in enumerate(jobs_data[:max_descriptions]):
        if not job.get("Link"):
            continue
            
        logger.info(f"Getting description for job {i+1}/{min(len(jobs_data), max_descriptions)}: {job['Title']}")
        
        # Create a clean filename from the job title
        clean_title = re.sub(r'[\\/*?:"<>|]', "", job['Title'])
        clean_title = re.sub(r'\s+', "_", clean_title)
        clean_title = clean_title[:100] if len(clean_title) > 100 else clean_title
        
        # Create a new tab for each job
        try:
            # Open new tab
            driver.execute_script("window.open('about:blank', '_blank');")
            driver.switch_to.window(driver.window_handles[-1])
            
            # Navigate to job details page
            driver.get(job["Link"])
            time.sleep(4)  # Wait for page to load
            
            # Take a screenshot
            screenshot_file = f"{job_desc_folder}/job_{i+1}_{clean_title}.png"
            driver.save_screenshot(screenshot_file)
            
            # Extract description - Greenhouse typically uses 'content' class for job descriptions
            description = ""
            desc_selectors = [".content", "#content", ".description", "#job-description", ".job-description"]
            
            for selector in desc_selectors:
                try:
                    desc_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    if desc_elements:
                        description = desc_elements[0].text.strip()
                        if description:
                            logger.info(f"Got description for '{job['Title']}' ({len(description)} chars)")
                            break
                except Exception as e:
                    logger.debug(f"Selector {selector} failed: {e}")
            
            # Update the job with the description
            if description:
                job["Description"] = description
            
            # Close tab
            driver.close()
            driver.switch_to.window(original_window)
            
        except Exception as e:
            logger.error(f"Error getting description for {job['Title']}: {e}")
            # Make sure we're back to the original window
            try:
                driver.close()
                driver.switch_to.window(original_window)
            except:
                pass
    
    # Return to original page
    try:
        driver.get(original_url)
        time.sleep(2)
    except:
        pass
        
    logger.info(f"Completed fetching descriptions for {min(len(jobs_data), max_descriptions)} jobs")
    return jobs_data

# Function to handle different types of popups
def handle_popups(driver):
    try:
        # Common buttons for accepting cookies, terms, etc.
        popup_selectors = [
            "//button[contains(text(), 'Accept')]", 
            "//button[contains(text(), 'I agree')]",
            "//button[contains(@id, 'accept')]",
            "//button[contains(@class, 'accept')]",
            "//button[contains(text(), 'Continue')]",
            "//button[contains(text(), 'Got it')]",
            "//button[contains(text(), 'Close')]",
            "//button[@aria-label='Close']",
            "//div[contains(@class, 'cookie')]//button",
            "//div[contains(@id, 'consent')]//button"
        ]
        
        for xpath in popup_selectors:
            try:
                buttons = driver.find_elements(By.XPATH, xpath)
                for button in buttons:
                    if button.is_displayed():
                        button.click()
                        logger.info(f"Clicked popup/cookie button with xpath: {xpath}")
                        time.sleep(1)
            except Exception:
                continue
                
        # Handle alerts
        try:
            alert = Alert(driver)
            alert.accept()
            logger.info("Accepted alert popup")
        except:
            pass
            
    except Exception as e:
        logger.warning(f"Error handling popups: {e}")

# Function to validate URL
def is_valid_link(url):
    if not url or not isinstance(url, str) or not url.startswith("http"):
        return False
        
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.status_code < 400  # Accept any non-error status
    except requests.RequestException:
        logger.warning(f"Invalid link: {url}")
        return False

# Main ZScaler job scraper function
def scrape_zscaler_jobs(search_keyword="", max_pages=10, headless=False):
    """
    Scrape ZScaler jobs using Greenhouse job board
    
    Parameters:
    search_keyword (str): Keyword to filter jobs by title
    max_pages (int): Maximum number of pages to scrape
    headless (bool): Whether to run in headless mode
    
    Returns:
    list: List of all job dictionaries found
    """
    options = webdriver.ChromeOptions()
    
    if headless:
        options.add_argument('--headless')
        
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36')
    
    # Create directories for debugging
    os.makedirs('screenshots', exist_ok=True)
    
    driver = None
    jobs_data = []

    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.set_page_load_timeout(30)
        
        # ZScaler's base job URL (Greenhouse job board)
        base_url = "https://boards.greenhouse.io/zscaler"
        logger.info(f"Scraping jobs from ZScaler" + (f", filtering by keyword '{search_keyword}'" if search_keyword else ""))
        
        # Open the ZScaler careers page
        driver.get(base_url)
        driver.save_screenshot("screenshots/zscaler_initial.png")
        
        # Handle popups
        handle_popups(driver)
        
        # Wait for job listings to load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "opening"))
            )
            logger.info("Job listings loaded successfully")
        except TimeoutException:
            logger.warning("Timed out waiting for job listings to load")
            driver.save_screenshot("screenshots/zscaler_timeout.png")
            
        # If there's a search box, use it (some Greenhouse sites have job filters)
        if search_keyword:
            try:
                search_box = driver.find_element(By.ID, "search_keywords")
                if search_box:
                    search_box.clear()
                    search_box.send_keys(search_keyword)
                    search_box.send_keys(Keys.RETURN)
                    time.sleep(3)  # Wait for results to load
                    logger.info(f"Searched for '{search_keyword}'")
                    driver.save_screenshot("screenshots/zscaler_search_results.png")
                    
                    # Wait for search results
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "opening"))
                        )
                    except TimeoutException:
                        logger.warning("Timed out waiting for search results")
            except NoSuchElementException:
                logger.info("No search box found. Will filter results after scraping.")
        
        # Scroll to load all results
        scroll_to_load_all(driver)
        driver.save_screenshot("screenshots/zscaler_after_scroll.png")
        
        # Handle pagination and collect all jobs
        all_jobs = handle_pagination(driver, max_pages=max_pages)
        logger.info(f"Collected {len(all_jobs)} total jobs after pagination")
        
        # If search keyword was provided, filter the results by job title
        if search_keyword and all_jobs:
            filtered_jobs = [job for job in all_jobs if search_keyword.lower() in job["Title"].lower()]
            logger.info(f"Filtered from {len(all_jobs)} to {len(filtered_jobs)} jobs matching '{search_keyword}'")
            all_jobs = filtered_jobs
        
        # Get job descriptions for all collected jobs
        jobs_with_descriptions = get_job_descriptions(driver, all_jobs, max_descriptions=100)
        
        # Save all jobs
        jobs_data = jobs_with_descriptions
        logger.info(f"Saving {len(jobs_data)} jobs found in search results")

    except Exception as e:
        logger.error(f"Error during scraping from ZScaler: {e}")
        if driver:
            driver.save_screenshot("screenshots/zscaler_error.png")

    finally:
        if driver:
            driver.quit()

    return jobs_data

# Main function to scrape and save results to CSV
def main(search_keyword="", max_pages=10, headless=False):
    start_time = time.time()
    logger.info(f"Starting ZScaler job scraper at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Create screenshots directory
    os.makedirs('screenshots', exist_ok=True)
    
    # Create job description screenshots directory
    os.makedirs('zscaler_job_description_screenshots', exist_ok=True)
    
    # Scrape ZScaler jobs
    jobs_data = scrape_zscaler_jobs(search_keyword, max_pages, headless)
    
    # Generate filenames with timestamps
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    keyword_slug = search_keyword.replace(' ', '_') if search_keyword else 'all'
    detailed_filename = f"zscaler_jobs_detailed_{keyword_slug}_{timestamp}.csv"
    simple_filename = f"zscaler_jobs_simple_{keyword_slug}_{timestamp}.csv"
    
    # Save results
    if jobs_data:
        # Create DataFrame
        df = pd.DataFrame(jobs_data)
        
        # Save detailed CSV with descriptions
        df.to_csv(detailed_filename, index=False, encoding='utf-8-sig')
        logger.info(f"Detailed jobs saved to '{detailed_filename}'")
        
        # Create and save a simplified CSV without descriptions
        simple_df = df[['Title', 'Location', 'Department', 'Link']].copy()
        simple_df.to_csv(simple_filename, index=False, encoding='utf-8-sig')
        logger.info(f"Simplified jobs list saved to '{simple_filename}'")
        
        # Print summary
        print("\n" + "="*50)
        print(f"ZScaler Job Scraping Results:")
        print(f"Total jobs found: {len(df)}")
        print(f"Unique locations: {len(df['Location'].unique())}")
        print(f"Sample jobs:")
        print(df[['Title', 'Location']].head())
        print("\nTop locations:")
        print(df['Location'].value_counts().head())
        print(f"\nResults saved to:")
        print(f"- {detailed_filename}")
        print(f"- {simple_filename}")
        print("="*50)
        
        elapsed_time = time.time() - start_time
        logger.info(f"Completed in {elapsed_time:.2f} seconds")
        
        return df
    else:
        logger.warning(f"No jobs found with search keyword '{search_keyword}'")
        print("\nNo jobs found to display.")
        
        elapsed_time = time.time() - start_time
        logger.info(f"Process completed with no results in {elapsed_time:.2f} seconds")
        
        return pd.DataFrame()

if __name__ == "__main__":
    print("ZScaler Job Scraper")
    print("="*50)
    
    # Get user input
    job_title = input("Enter job title to search for (leave empty to get all jobs): ").strip()
    
    try:
        max_pages = int(input("Maximum number of pages to scrape (default 10): ") or "10")
    except ValueError:
        max_pages = 10
        print("Invalid input. Using default of 10 pages.")
    
    headless_mode = input("Run in headless mode? (y/n, default: n): ").strip().lower() == 'y'
    
    print("\nStarting job scraper...")
    print("This may take several minutes depending on the number of jobs and pages.")
    print("Progress will be logged to the console and a log file.")
    
    # Run the scraper
    main(job_title, max_pages, headless_mode)

ZScaler Job Scraper


Enter job title to search for (leave empty to get all jobs):  Machine learning
Maximum number of pages to scrape (default 10):  2
Run in headless mode? (y/n, default: n):  y


2025-03-15 14:19:44,274 - INFO - Starting ZScaler job scraper at 2025-03-15 14:19:44



Starting job scraper...
This may take several minutes depending on the number of jobs and pages.
Progress will be logged to the console and a log file.


2025-03-15 14:19:45,144 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-15 14:19:45,218 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-15 14:19:45,264 - INFO - Driver [/Users/srikar/.wdm/drivers/chromedriver/mac64/134.0.6998.88/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-15 14:19:46,717 - INFO - Scraping jobs from ZScaler, filtering by keyword 'Machine learning'
2025-03-15 14:19:47,914 - INFO - Job listings loaded successfully
2025-03-15 14:19:47,921 - INFO - No search box found. Will filter results after scraping.
2025-03-15 14:19:47,924 - INFO - Starting to scroll to load all content...
2025-03-15 14:19:49,912 - INFO - Scroll 1: Height 18872 → 18872, Jobs found: 242
2025-03-15 14:19:51,890 - INFO - Scroll 2: Height 18872 → 18872, Jobs found: 242
2025-03-15 14:19:51,890 - INFO - No change detected (1/3)
2025-03-15 14:19:53,867 - INFO - Scroll 3: Height 18872 → 18872, Jobs found: 242
2025-03-15 14:19:53,867 - INFO - No change detec


ZScaler Job Scraping Results:
Total jobs found: 2
Unique locations: 1
Sample jobs:
                                 Title                   Location
0  Principal Machine Learning Engineer  San Jose, California, USA
1  Sr. Staff Machine Learning Engineer  San Jose, California, USA

Top locations:
Location
San Jose, California, USA    2
Name: count, dtype: int64

Results saved to:
- zscaler_jobs_detailed_Machine_learning_20250315_142016.csv
- zscaler_jobs_simple_Machine_learning_20250315_142016.csv


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import pandas as pd
import time
import requests
import logging
import re
import os
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(f"zscaler_job_scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
                        logging.StreamHandler()
                    ])
logger = logging.getLogger(__name__)

# Function to scroll and load all jobs
def scroll_to_load_all(driver, max_scrolls=20, wait_time=1.5):
    """
    Scroll the page to load all content with a maximum number of scrolls
    For Greenhouse-based sites like ZScaler
    """
    scrolls = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    last_job_count = 0
    consecutive_no_change = 0
    
    logger.info("Starting to scroll to load all content...")
    
    while scrolls < max_scrolls:
        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(wait_time)  # Wait time for content to load
        
        # Take screenshot for debugging
        driver.save_screenshot(f"screenshots/zscaler_scroll_{scrolls+1}.png")
        
        # Try to find the "Load More" or similar buttons and click them
        try:
            load_more_buttons = driver.find_elements(By.XPATH, 
                "//button[contains(text(), 'Load More') or contains(text(), 'View More')]")
            if load_more_buttons:
                for button in load_more_buttons:
                    if button.is_displayed() and button.is_enabled():
                        driver.execute_script("arguments[0].click();", button)
                        logger.info("Clicked 'Load More' button")
                        time.sleep(wait_time + 1)  # Extra wait for new content
        except Exception as e:
            logger.info(f"No 'Load More' button found or error clicking it: {e}")
        
        # Check height and job count to determine if we've loaded all content
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        # Use Greenhouse's job opening selector
        job_count = len(driver.find_elements(By.CLASS_NAME, "opening"))
        
        logger.info(f"Scroll {scrolls+1}: Height {last_height} → {new_height}, Jobs found: {job_count}")
        
        # If no change in height and job count, we might have reached the end
        if new_height == last_height and job_count == last_job_count:
            consecutive_no_change += 1
            logger.info(f"No change detected ({consecutive_no_change}/3)")
            if consecutive_no_change >= 3:  # If no change for 3 consecutive scrolls
                logger.info("No more content loading after multiple scrolls. Stopping scroll operation.")
                break
        else:
            consecutive_no_change = 0
            
        last_height = new_height
        last_job_count = job_count
        scrolls += 1
    
    logger.info(f"Completed scrolling after {scrolls} scrolls. Found approximately {last_job_count} job items.")
    return last_job_count

# Function to handle pagination for ZScaler
def handle_pagination(driver, max_pages=10):
    """
    Handle pagination for ZScaler Greenhouse site
    Greenhouse often loads all jobs on a single page, but this handles pagination if present
    """
    page = 1
    all_jobs = []
    
    logger.info("Starting pagination handling for ZScaler...")
    
    while page <= max_pages:
        logger.info(f"Processing page {page}")
        
        # Take screenshot for debugging
        driver.save_screenshot(f"screenshots/zscaler_page_{page}.png")
        
        # Extract current page's jobs
        jobs_on_page = extract_job_listings_zscaler(driver)
        all_jobs.extend(jobs_on_page)
        logger.info(f"Found {len(jobs_on_page)} jobs on page {page}")
        
        # Look for next page button (if any)
        next_button = None
        try:
            next_buttons = driver.find_elements(By.XPATH, 
                "//a[contains(text(), 'Next') or contains(@class, 'next') or contains(@aria-label, 'Next')]")
            
            for button in next_buttons:
                if button.is_displayed() and not "disabled" in button.get_attribute("class"):
                    next_button = button
                    break
        except Exception:
            pass
        
        if not next_button:
            logger.info("No next page button found. Reached last page.")
            break
            
        # Click next page 
        try:
            driver.execute_script("arguments[0].click();", next_button)
            logger.info("Clicked next page button")
            time.sleep(3)  # Wait for page to load
            page += 1
            
            # Wait for job listings to reload
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "opening"))
                )
            except TimeoutException:
                logger.warning("Timed out waiting for jobs to load after pagination")
                break
                
        except Exception as e:
            logger.error(f"Error clicking next page: {e}")
            break
            
    return all_jobs

# Extract job information from ZScaler page
def extract_job_listings_zscaler(driver):
    """
    Extract job listings from ZScaler Greenhouse page
    """
    jobs_data = []
    
    # Greenhouse typically uses 'opening' class for job listings
    try:
        job_elements = driver.find_elements(By.CLASS_NAME, "opening")
        logger.info(f"Found {len(job_elements)} job elements on page")
        
        # Take a screenshot of the found elements (for debugging)
        if len(job_elements) > 0:
            try:
                driver.execute_script("arguments[0].style.border='3px solid red'", job_elements[0])
                driver.save_screenshot("screenshots/zscaler_job_elements_found.png")
                driver.execute_script("arguments[0].style.border=''", job_elements[0])
            except:
                pass
                
        for index, job in enumerate(job_elements):
            try:
                # Extract job title and link
                a_elem = job.find_element(By.TAG_NAME, "a")
                title = a_elem.text.strip()
                link = a_elem.get_attribute("href")
                
                # Extract location
                location_elem = job.find_element(By.CLASS_NAME, "location")
                location = location_elem.text.strip() if location_elem else "Not specified"
                
                # Extract department (if available)
                department = "Not specified"
                try:
                    department_elem = job.find_element(By.CLASS_NAME, "department")
                    department = department_elem.text.strip()
                except:
                    pass
                
                # Add job if we have title and link
                if title and title.strip() and link and link.strip():
                    # Check for duplicate before adding
                    is_duplicate = False
                    for existing_job in jobs_data:
                        if existing_job["Title"] == title and existing_job["Link"] == link:
                            is_duplicate = True
                            break
                    
                    if not is_duplicate:
                        jobs_data.append({
                            "Title": title,
                            "Location": location,
                            "Department": department,
                            "Link": link,
                            "Description": "",  # We'll get descriptions later
                            "Company": "ZScaler"
                        })
                        logger.info(f"Added job: {title} at {location}")
                
            except (StaleElementReferenceException, Exception) as e:
                logger.error(f"Error extracting job details for job {index+1}: {e}")
                continue
                
    except Exception as e:
        logger.error(f"Error finding job elements: {e}")
        driver.save_screenshot("screenshots/zscaler_extract_error.png")
    
    return jobs_data

# Function to get job descriptions
def get_job_descriptions(driver, jobs_data, max_descriptions=100):
    """
    Get job descriptions for ZScaler jobs by visiting their individual pages
    """
    logger.info(f"Getting descriptions for {len(jobs_data)} jobs (up to {max_descriptions})")
    
    # Create a dedicated folder for job description screenshots
    job_desc_folder = 'zscaler_job_description_screenshots'
    os.makedirs(job_desc_folder, exist_ok=True)
    
    # Store current URL to return to afterward
    original_url = driver.current_url
    original_window = driver.current_window_handle
    
    # Process up to max_descriptions
    for i, job in enumerate(jobs_data[:max_descriptions]):
        if not job.get("Link"):
            continue
            
        logger.info(f"Getting description for job {i+1}/{min(len(jobs_data), max_descriptions)}: {job['Title']}")
        
        # Create a clean filename from the job title
        clean_title = re.sub(r'[\\/*?:"<>|]', "", job['Title'])
        clean_title = re.sub(r'\s+', "_", clean_title)
        clean_title = clean_title[:100] if len(clean_title) > 100 else clean_title
        
        # Create a new tab for each job
        try:
            # Open new tab
            driver.execute_script("window.open('about:blank', '_blank');")
            driver.switch_to.window(driver.window_handles[-1])
            
            # Navigate to job details page
            driver.get(job["Link"])
            time.sleep(4)  # Wait for page to load
            
            # Take a screenshot
            screenshot_file = f"{job_desc_folder}/job_{i+1}_{clean_title}.png"
            driver.save_screenshot(screenshot_file)
            
            # Extract description - Greenhouse typically uses 'content' class for job descriptions
            description = ""
            desc_selectors = [".content", "#content", ".description", "#job-description", ".job-description"]
            
            for selector in desc_selectors:
                try:
                    desc_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    if desc_elements:
                        description = desc_elements[0].text.strip()
                        if description:
                            logger.info(f"Got description for '{job['Title']}' ({len(description)} chars)")
                            break
                except Exception as e:
                    logger.debug(f"Selector {selector} failed: {e}")
            
            # Update the job with the description
            if description:
                job["Description"] = description
            
            # Close tab
            driver.close()
            driver.switch_to.window(original_window)
            
        except Exception as e:
            logger.error(f"Error getting description for {job['Title']}: {e}")
            # Make sure we're back to the original window
            try:
                driver.close()
                driver.switch_to.window(original_window)
            except:
                pass
    
    # Return to original page
    try:
        driver.get(original_url)
        time.sleep(2)
    except:
        pass
        
    logger.info(f"Completed fetching descriptions for {min(len(jobs_data), max_descriptions)} jobs")
    return jobs_data

# Function to handle different types of popups
def handle_popups(driver):
    try:
        # Common buttons for accepting cookies, terms, etc.
        popup_selectors = [
            "//button[contains(text(), 'Accept')]", 
            "//button[contains(text(), 'I agree')]",
            "//button[contains(@id, 'accept')]",
            "//button[contains(@class, 'accept')]",
            "//button[contains(text(), 'Continue')]",
            "//button[contains(text(), 'Got it')]",
            "//button[contains(text(), 'Close')]",
            "//button[@aria-label='Close']",
            "//div[contains(@class, 'cookie')]//button",
            "//div[contains(@id, 'consent')]//button"
        ]
        
        for xpath in popup_selectors:
            try:
                buttons = driver.find_elements(By.XPATH, xpath)
                for button in buttons:
                    if button.is_displayed():
                        button.click()
                        logger.info(f"Clicked popup/cookie button with xpath: {xpath}")
                        time.sleep(1)
            except Exception:
                continue
                
        # Handle alerts
        try:
            alert = Alert(driver)
            alert.accept()
            logger.info("Accepted alert popup")
        except:
            pass
            
    except Exception as e:
        logger.warning(f"Error handling popups: {e}")

# Function to validate URL
def is_valid_link(url):
    if not url or not isinstance(url, str) or not url.startswith("http"):
        return False
        
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.status_code < 400  # Accept any non-error status
    except requests.RequestException:
        logger.warning(f"Invalid link: {url}")
        return False

# Main ZScaler job scraper function
def scrape_zscaler_jobs(search_keyword="", max_pages=10, headless=False):
    """
    Scrape ZScaler jobs using Greenhouse job board
    
    Parameters:
    search_keyword (str): Keyword to filter jobs by title
    max_pages (int): Maximum number of pages to scrape
    headless (bool): Whether to run in headless mode
    
    Returns:
    list: List of all job dictionaries found
    """
    options = webdriver.ChromeOptions()
    
    if headless:
        options.add_argument('--headless')
        
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36')
    
    # Create directories for debugging
    os.makedirs('screenshots', exist_ok=True)
    
    driver = None
    jobs_data = []

    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.set_page_load_timeout(30)
        
        # ZScaler's base job URL (Greenhouse job board)
        base_url = "https://boards.greenhouse.io/zscaler"
        logger.info(f"Scraping jobs from ZScaler" + (f", filtering by keyword '{search_keyword}'" if search_keyword else ""))
        
        # Open the ZScaler careers page
        driver.get(base_url)
        driver.save_screenshot("screenshots/zscaler_initial.png")
        
        # Handle popups
        handle_popups(driver)
        
        # Wait for job listings to load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "opening"))
            )
            logger.info("Job listings loaded successfully")
        except TimeoutException:
            logger.warning("Timed out waiting for job listings to load")
            driver.save_screenshot("screenshots/zscaler_timeout.png")
            
        # If there's a search box, use it (some Greenhouse sites have job filters)
        if search_keyword:
            try:
                search_box = driver.find_element(By.ID, "search_keywords")
                if search_box:
                    search_box.clear()
                    search_box.send_keys(search_keyword)
                    search_box.send_keys(Keys.RETURN)
                    time.sleep(3)  # Wait for results to load
                    logger.info(f"Searched for '{search_keyword}'")
                    driver.save_screenshot("screenshots/zscaler_search_results.png")
                    
                    # Wait for search results
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "opening"))
                        )
                    except TimeoutException:
                        logger.warning("Timed out waiting for search results")
            except NoSuchElementException:
                logger.info("No search box found. Will filter results after scraping.")
        
        # Scroll to load all results
        scroll_to_load_all(driver)
        driver.save_screenshot("screenshots/zscaler_after_scroll.png")
        
        # Handle pagination and collect all jobs
        all_jobs = handle_pagination(driver, max_pages=max_pages)
        logger.info(f"Collected {len(all_jobs)} total jobs after pagination")
        
        # If search keyword was provided, filter the results by job title
        if search_keyword and all_jobs:
            filtered_jobs = [job for job in all_jobs if search_keyword.lower() in job["Title"].lower()]
            logger.info(f"Filtered from {len(all_jobs)} to {len(filtered_jobs)} jobs matching '{search_keyword}'")
            all_jobs = filtered_jobs
        
        # Get job descriptions for all collected jobs
        jobs_with_descriptions = get_job_descriptions(driver, all_jobs, max_descriptions=100)
        
        # Save all jobs
        jobs_data = jobs_with_descriptions
        logger.info(f"Saving {len(jobs_data)} jobs found in search results")

    except Exception as e:
        logger.error(f"Error during scraping from ZScaler: {e}")
        if driver:
            driver.save_screenshot("screenshots/zscaler_error.png")

    finally:
        if driver:
            driver.quit()

    return jobs_data

# Main function to scrape and save results to CSV
def main(search_keyword="", max_pages=10, headless=False):
    start_time = time.time()
    logger.info(f"Starting ZScaler job scraper at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Create screenshots directory
    os.makedirs('screenshots', exist_ok=True)
    
    # Create job description screenshots directory
    os.makedirs('zscaler_job_description_screenshots', exist_ok=True)
    
    # Scrape ZScaler jobs
    jobs_data = scrape_zscaler_jobs(search_keyword, max_pages, headless)
    
    # Generate filenames with timestamps
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    keyword_slug = search_keyword.replace(' ', '_') if search_keyword else 'all'
    detailed_filename = f"zscaler_jobs_detailed_{keyword_slug}_{timestamp}.csv"
    simple_filename = f"zscaler_jobs_simple_{keyword_slug}_{timestamp}.csv"
    
    # Save results
    if jobs_data:
        # Create DataFrame
        df = pd.DataFrame(jobs_data)
        
        # Save detailed CSV with descriptions
        df.to_csv(detailed_filename, index=False, encoding='utf-8-sig')
        logger.info(f"Detailed jobs saved to '{detailed_filename}'")
        
        # Create and save a simplified CSV without descriptions
        simple_df = df[['Title', 'Location', 'Department', 'Link']].copy()
        simple_df.to_csv(simple_filename, index=False, encoding='utf-8-sig')
        logger.info(f"Simplified jobs list saved to '{simple_filename}'")
        
        # Print summary
        print("\n" + "="*50)
        print(f"ZScaler Job Scraping Results:")
        print(f"Total jobs found: {len(df)}")
        print(f"Unique locations: {len(df['Location'].unique())}")
        print(f"Sample jobs:")
        print(df[['Title', 'Location']].head())
        print("\nTop locations:")
        print(df['Location'].value_counts().head())
        print(f"\nResults saved to:")
        print(f"- {detailed_filename}")
        print(f"- {simple_filename}")
        print("="*50)
        
        elapsed_time = time.time() - start_time
        logger.info(f"Completed in {elapsed_time:.2f} seconds")
        
        return df
    else:
        logger.warning(f"No jobs found with search keyword '{search_keyword}'")
        print("\nNo jobs found to display.")
        
        elapsed_time = time.time() - start_time
        logger.info(f"Process completed with no results in {elapsed_time:.2f} seconds")
        
        return pd.DataFrame()

if __name__ == "__main__":
    print("ZScaler Job Scraper")
    print("="*50)
    
    # Get user input
    job_title = input("Enter job title to search for (leave empty to get all jobs): ").strip()
    
    try:
        max_pages = int(input("Maximum number of pages to scrape (default 10): ") or "10")
    except ValueError:
        max_pages = 10
        print("Invalid input. Using default of 10 pages.")
    
    headless_mode = input("Run in headless mode? (y/n, default: n): ").strip().lower() == 'y'
    
    print("\nStarting job scraper...")
    print("This may take several minutes depending on the number of jobs and pages.")
    print("Progress will be logged to the console and a log file.")
    
    # Run the scraper
    main(job_title, max_pages, headless_mode)

ZScaler Job Scraper


Enter job title to search for (leave empty to get all jobs):  
Maximum number of pages to scrape (default 10):  2
Run in headless mode? (y/n, default: n):  y


2025-03-18 10:08:00,405 - INFO - Starting ZScaler job scraper at 2025-03-18 10:08:00



Starting job scraper...
This may take several minutes depending on the number of jobs and pages.
Progress will be logged to the console and a log file.


2025-03-18 10:08:01,295 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-18 10:08:01,423 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-18 10:08:01,473 - INFO - Driver [/Users/srikar/.wdm/drivers/chromedriver/mac64/134.0.6998.88/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-18 10:08:03,072 - INFO - Scraping jobs from ZScaler
2025-03-18 10:08:04,916 - INFO - Job listings loaded successfully
2025-03-18 10:08:04,921 - INFO - Starting to scroll to load all content...
2025-03-18 10:08:06,887 - INFO - Scroll 1: Height 17644 → 17644, Jobs found: 225
2025-03-18 10:08:08,846 - INFO - Scroll 2: Height 17644 → 17644, Jobs found: 225
2025-03-18 10:08:08,847 - INFO - No change detected (1/3)
2025-03-18 10:08:10,806 - INFO - Scroll 3: Height 17644 → 17644, Jobs found: 225
2025-03-18 10:08:10,806 - INFO - No change detected (2/3)
2025-03-18 10:08:12,758 - INFO - Scroll 4: Height 17644 → 17644, Jobs found: 225
2025-03-18 10:08:12,759 - INFO - No chan


ZScaler Job Scraping Results:
Total jobs found: 225
Unique locations: 49
Sample jobs:
                                               Title                  Location
0  Senior Machine Learning Engineer (New Jersey-b...              Remote - USA
1                          Customer Success Engineer            Remote - Japan
2           Customer Success Engineer - WEST/CENTRAL  Remote - California, USA
3                           Customer Success Manager        Remote - Hong Kong
4                    Customer Success Manager - East    Remote - New York, USA

Top locations:
Location
Bangalore, IND               34
Mohali, IND                  33
San Jose, California, USA    32
Tokyo, JPN                   22
Remote - USA                  9
Name: count, dtype: int64

Results saved to:
- zscaler_jobs_detailed_all_20250318_101543.csv
- zscaler_jobs_simple_all_20250318_101543.csv


In [3]:
df=pd.read_csv('zscaler_jobs_detailed_Machine_Learning_20250317_235434.csv')
df

Unnamed: 0,Title,Location,Department,Link,Description,Company
0,Senior Machine Learning Engineer (New Jersey-b...,Remote - USA,Not specified,https://boards.greenhouse.io/zscaler/jobs/4660...,About Zscaler\nServing thousands of enterprise...,ZScaler
1,Senior Staff Machine Learning Engineer,"Bangalore, IND",Not specified,https://boards.greenhouse.io/zscaler/jobs/4674...,About Zscaler\nServing thousands of enterprise...,ZScaler
2,Principal Machine Learning Engineer,"San Jose, California, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4672...,About Zscaler\nServing thousands of enterprise...,ZScaler
