In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import pandas as pd
import time
import requests
import logging
import re
import os
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(f"zscaler_job_scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
                        logging.StreamHandler()
                    ])
logger = logging.getLogger(__name__)

# Function to scroll and load all jobs
def scroll_to_load_all(driver, max_scrolls=20, wait_time=1.5):
    """
    Scroll the page to load all content with a maximum number of scrolls.
    For Greenhouse-based sites like ZScaler.
    """
    scrolls = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    last_job_count = 0
    consecutive_no_change = 0

    logger.info("Starting to scroll to load all content...")

    while scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(wait_time)
        driver.save_screenshot(f"screenshots/zscaler_scroll_{scrolls+1}.png")

        # Try clicking 'Load More' buttons if available
        try:
            load_more_buttons = driver.find_elements(By.XPATH, 
                "//button[contains(text(), 'Load More') or contains(text(), 'View More')]")
            if load_more_buttons:
                for button in load_more_buttons:
                    if button.is_displayed() and button.is_enabled():
                        driver.execute_script("arguments[0].click();", button)
                        logger.info("Clicked 'Load More' button")
                        time.sleep(wait_time + 1)
        except Exception as e:
            logger.info(f"No 'Load More' button found or error clicking it: {e}")

        new_height = driver.execute_script("return document.body.scrollHeight")
        job_count = len(driver.find_elements(By.CLASS_NAME, "opening"))

        logger.info(f"Scroll {scrolls+1}: Height {last_height} → {new_height}, Jobs found: {job_count}")

        if new_height == last_height and job_count == last_job_count:
            consecutive_no_change += 1
            logger.info(f"No change detected ({consecutive_no_change}/3)")
            if consecutive_no_change >= 3:
                logger.info("No more content loading after multiple scrolls. Stopping scroll operation.")
                break
        else:
            consecutive_no_change = 0

        last_height = new_height
        last_job_count = job_count
        scrolls += 1

    logger.info(f"Completed scrolling after {scrolls} scrolls. Found approximately {last_job_count} job items.")
    return last_job_count

# Function to extract job information from the ZScaler page
def extract_job_listings_zscaler(driver):
    """
    Extract job listings (Title, Location, Department, Link) from ZScaler Greenhouse page.
    """
    jobs_data = []
    
    try:
        job_elements = driver.find_elements(By.CLASS_NAME, "opening")
        logger.info(f"Found {len(job_elements)} job elements on page")
        
        # Optional: Take a screenshot of found job elements for debugging
        if job_elements:
            try:
                driver.execute_script("arguments[0].style.border='3px solid red'", job_elements[0])
                driver.save_screenshot("screenshots/zscaler_job_elements_found.png")
                driver.execute_script("arguments[0].style.border=''", job_elements[0])
            except:
                pass
        
        for index, job in enumerate(job_elements):
            try:
                a_elem = job.find_element(By.TAG_NAME, "a")
                title = a_elem.text.strip()
                link = a_elem.get_attribute("href")
                
                # Extract location
                location = "Not specified"
                try:
                    location_elem = job.find_element(By.CLASS_NAME, "location")
                    location_text = location_elem.text.strip()
                    if location_text and len(location_text) < 50 and "footer" not in location_text.lower():
                        location = location_text
                    logger.info(f"Extracted location for '{title}': '{location}'")
                except NoSuchElementException:
                    logger.info(f"No location found for '{title}', set to 'Not specified'")
                
                department = "Not specified"
                try:
                    department_elem = job.find_element(By.CLASS_NAME, "department")
                    department = department_elem.text.strip()
                except:
                    pass
                
                if title and title.strip() and link and link.strip():
                    is_duplicate = any(existing_job["Title"] == title and existing_job["Link"] == link 
                                     for existing_job in jobs_data)
                    if not is_duplicate:
                        jobs_data.append({
                            "Title": title,
                            "Location": location,
                            "Department": department,
                            "Link": link
                        })
                        logger.info(f"Added job: {title} at {location}")
                
            except (StaleElementReferenceException, Exception) as e:
                logger.error(f"Error extracting job details for job {index+1}: {e}")
                continue
                
    except Exception as e:
        logger.error(f"Error finding job elements: {e}")
        driver.save_screenshot("screenshots/zscaler_extract_error.png")
    
    return jobs_data

# Function to handle pagination for ZScaler
def handle_pagination(driver, max_pages=10):
    """
    Handle pagination for ZScaler Greenhouse site.
    """
    page = 1
    all_jobs = []
    
    logger.info("Starting pagination handling for ZScaler...")
    
    while page <= max_pages:
        logger.info(f"Processing page {page}")
        driver.save_screenshot(f"screenshots/zscaler_page_{page}.png")
        
        jobs_on_page = extract_job_listings_zscaler(driver)
        all_jobs.extend(jobs_on_page)
        logger.info(f"Found {len(jobs_on_page)} jobs on page {page}")
        
        next_button = None
        try:
            next_buttons = driver.find_elements(By.XPATH, 
                "//a[contains(text(), 'Next') or contains(@class, 'next') or contains(@aria-label, 'Next')]")
            for button in next_buttons:
                if button.is_displayed() and "disabled" not in button.get_attribute("class"):
                    next_button = button
                    break
        except Exception:
            pass
        
        if not next_button:
            logger.info("No next page button found. Reached last page.")
            break
            
        try:
            driver.execute_script("arguments[0].click();", next_button)
            logger.info("Clicked next page button")
            time.sleep(3)
            page += 1
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "opening"))
            )
        except Exception as e:
            logger.error(f"Error clicking next page: {e}")
            break
            
    return all_jobs

# Function to handle different types of popups
def handle_popups(driver):
    try:
        popup_selectors = [
            "//button[contains(text(), 'Accept')]", 
            "//button[contains(text(), 'I agree')]",
            "//button[contains(@id, 'accept')]",
            "//button[contains(@class, 'accept')]",
            "//button[contains(text(), 'Continue')]",
            "//button[contains(text(), 'Got it')]",
            "//button[contains(text(), 'Close')]",
            "//button[@aria-label='Close']",
            "//div[contains(@class, 'cookie')]//button",
            "//div[contains(@id, 'consent')]//button"
        ]
        
        for xpath in popup_selectors:
            try:
                buttons = driver.find_elements(By.XPATH, xpath)
                for button in buttons:
                    if button.is_displayed():
                        button.click()
                        logger.info(f"Clicked popup/cookie button with xpath: {xpath}")
                        time.sleep(1)
            except Exception:
                continue
                
        # Handle alert popups if present
        try:
            alert = driver.switch_to.alert
            alert.accept()
            logger.info("Accepted alert popup")
        except Exception:
            pass
            
    except Exception as e:
        logger.warning(f"Error handling popups: {e}")

# Function to validate URL (optional)
def is_valid_link(url):
    if not url or not isinstance(url, str) or not url.startswith("http"):
        return False
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.status_code < 400
    except requests.RequestException:
        logger.warning(f"Invalid link: {url}")
        return False

# Main ZScaler job scraper function
def scrape_zscaler_jobs(search_keyword="", max_pages=10, headless=False):
    """
    Scrape ZScaler jobs using the Greenhouse job board.
    """
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36')
    
    os.makedirs('screenshots', exist_ok=True)
    
    driver = None
    jobs_data = []

    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.set_page_load_timeout(30)
        
        base_url = "https://boards.greenhouse.io/zscaler"
        logger.info(f"Scraping jobs from ZScaler" + (f", filtering by keyword '{search_keyword}'" if search_keyword else ""))
        
        driver.get(base_url)
        driver.save_screenshot("screenshots/zscaler_initial.png")
        
        handle_popups(driver)
        
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "opening"))
            )
            logger.info("Job listings loaded successfully")
        except TimeoutException:
            logger.warning("Timed out waiting for job listings to load")
            driver.save_screenshot("screenshots/zscaler_timeout.png")
            
        # If a search keyword is provided, use the search box if available
        if search_keyword:
            try:
                search_box = driver.find_element(By.ID, "search_keywords")
                if search_box:
                    search_box.clear()
                    search_box.send_keys(search_keyword)
                    search_box.send_keys(Keys.RETURN)
                    time.sleep(3)
                    logger.info(f"Searched for '{search_keyword}'")
                    driver.save_screenshot("screenshots/zscaler_search_results.png")
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "opening"))
                    )
            except NoSuchElementException:
                logger.info("No search box found. Will filter results after scraping.")
        
        scroll_to_load_all(driver)
        driver.save_screenshot("screenshots/zscaler_after_scroll.png")
        
        all_jobs = handle_pagination(driver, max_pages=max_pages)
        logger.info(f"Collected {len(all_jobs)} total jobs after pagination")
        
        # If search keyword filtering wasn't done using the search box, filter the results here
        if search_keyword and all_jobs:
            filtered_jobs = [job for job in all_jobs if search_keyword.lower() in job["Title"].lower()]
            logger.info(f"Filtered from {len(all_jobs)} to {len(filtered_jobs)} jobs matching '{search_keyword}'")
            all_jobs = filtered_jobs
        
        jobs_data = all_jobs
        logger.info(f"Total jobs found: {len(jobs_data)}")
        
    except Exception as e:
        logger.error(f"Error during scraping from ZScaler: {e}")
        if driver:
            driver.save_screenshot("screenshots/zscaler_error.png")

    finally:
        if driver:
            driver.quit()

    return jobs_data

# Main function to scrape and save results to CSV
def main(search_keyword="", max_pages=10, headless=False):
    start_time = time.time()
    logger.info(f"Starting ZScaler job scraper at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
    os.makedirs('screenshots', exist_ok=True)
    
    jobs_data = scrape_zscaler_jobs(search_keyword, max_pages, headless)
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    keyword_slug = search_keyword.replace(' ', '_') if search_keyword else 'all'
    output_filename = f"zscaler_jobs_{keyword_slug}_{timestamp}.csv"
    
    if jobs_data:
        df = pd.DataFrame(jobs_data)
        # Save only the essential columns
        df = df[['Title', 'Location', 'Department', 'Link']]
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        logger.info(f"Jobs saved to '{output_filename}'")
        
        print("\n" + "="*50)
        print(f"ZScaler Job Scraping Results:")
        print(f"Total jobs found: {len(df)}")
        print(f"Unique locations: {len(df['Location'].unique())}")
        print(f"Sample jobs:")
        print(df[['Title', 'Location']].head())
        print("="*50)
        
        elapsed_time = time.time() - start_time
        logger.info(f"Completed in {elapsed_time:.2f} seconds")
        
        return df
    else:
        logger.warning(f"No jobs found with search keyword '{search_keyword}'")
        print("\nNo jobs found to display.")
        elapsed_time = time.time() - start_time
        logger.info(f"Process completed with no results in {elapsed_time:.2f} seconds")
        return pd.DataFrame()

if __name__ == "__main__":
    print("ZScaler Job Scraper")
    print("="*50)
    
    job_title = input("Enter job title to search for (leave empty to get all jobs): ").strip()
    
    try:
        max_pages = int(input("Maximum number of pages to scrape (default 10): ") or "10")
    except ValueError:
        max_pages = 10
        print("Invalid input. Using default of 10 pages.")
    
    headless_mode = input("Run in headless mode? (y/n, default: n): ").strip().lower() == 'y'
    
    print("\nStarting job scraper...")
    print("This may take several minutes depending on the number of jobs and pages.")
    print("Progress will be logged to the console and a log file.")
    
    main(job_title, max_pages, headless_mode)


ZScaler Job Scraper


Enter job title to search for (leave empty to get all jobs):  
Maximum number of pages to scrape (default 10):  
Run in headless mode? (y/n, default: n):  y


2025-03-20 19:50:26,155 - INFO - Starting ZScaler job scraper at 2025-03-20 19:50:26



Starting job scraper...
This may take several minutes depending on the number of jobs and pages.
Progress will be logged to the console and a log file.


2025-03-20 19:50:26,871 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 19:50:26,938 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 19:50:26,991 - INFO - Driver [/Users/srikar/.wdm/drivers/chromedriver/mac64/134.0.6998.90/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-20 19:50:28,664 - INFO - Scraping jobs from ZScaler
2025-03-20 19:50:29,502 - INFO - Job listings loaded successfully
2025-03-20 19:50:29,505 - INFO - Starting to scroll to load all content...
2025-03-20 19:50:31,494 - INFO - Scroll 1: Height 19532 → 19532, Jobs found: 254
2025-03-20 19:50:33,463 - INFO - Scroll 2: Height 19532 → 19532, Jobs found: 254
2025-03-20 19:50:33,464 - INFO - No change detected (1/3)
2025-03-20 19:50:35,455 - INFO - Scroll 3: Height 19532 → 19532, Jobs found: 254
2025-03-20 19:50:35,455 - INFO - No change detected (2/3)
2025-03-20 19:50:37,444 - INFO - Scroll 4: Height 19532 → 19532, Jobs found: 254
2025-03-20 19:50:37,445 - INFO - No chan


ZScaler Job Scraping Results:
Total jobs found: 254
Unique locations: 58
Sample jobs:
                                               Title                  Location
0  Senior Machine Learning Engineer (New Jersey-b...              Remote - USA
1                          Customer Success Engineer            Remote - Japan
2           Customer Success Engineer - WEST/CENTRAL  Remote - California, USA
3                           Customer Success Manager        Remote - Hong Kong
4                    Customer Success Manager - East    Remote - New York, USA


In [2]:
import pandas as pd 
df=pd.read_csv('zscaler_jobs_all_20250320_195046.csv')
df

Unnamed: 0,Title,Location,Department,Link
0,Senior Machine Learning Engineer (New Jersey-b...,Remote - USA,Not specified,https://boards.greenhouse.io/zscaler/jobs/4660...
1,Customer Success Engineer,Remote - Japan,Not specified,https://boards.greenhouse.io/zscaler/jobs/4544...
2,Customer Success Engineer - WEST/CENTRAL,"Remote - California, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4660...
3,Customer Success Manager,Remote - Hong Kong,Not specified,https://boards.greenhouse.io/zscaler/jobs/4650...
4,Customer Success Manager - East,"Remote - New York, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4676...
...,...,...,...,...
249,Senior Product Marketing Manager - Risk Manage...,"San Jose, California, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4482...
250,Senior Sales Engineer - Continuous Threat Expo...,Remote - USA,Not specified,https://boards.greenhouse.io/zscaler/jobs/4684...
251,Senior Sales Engineer - CTEM,"Remote - Ohio, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4684...
252,Senior Sales Engineer - CTEM,"Remote - Michigan, USA",Not specified,https://boards.greenhouse.io/zscaler/jobs/4684...


In [3]:
d=df.drop(columns='Department')
d

Unnamed: 0,Title,Location,Link
0,Senior Machine Learning Engineer (New Jersey-b...,Remote - USA,https://boards.greenhouse.io/zscaler/jobs/4660...
1,Customer Success Engineer,Remote - Japan,https://boards.greenhouse.io/zscaler/jobs/4544...
2,Customer Success Engineer - WEST/CENTRAL,"Remote - California, USA",https://boards.greenhouse.io/zscaler/jobs/4660...
3,Customer Success Manager,Remote - Hong Kong,https://boards.greenhouse.io/zscaler/jobs/4650...
4,Customer Success Manager - East,"Remote - New York, USA",https://boards.greenhouse.io/zscaler/jobs/4676...
...,...,...,...
249,Senior Product Marketing Manager - Risk Manage...,"San Jose, California, USA",https://boards.greenhouse.io/zscaler/jobs/4482...
250,Senior Sales Engineer - Continuous Threat Expo...,Remote - USA,https://boards.greenhouse.io/zscaler/jobs/4684...
251,Senior Sales Engineer - CTEM,"Remote - Ohio, USA",https://boards.greenhouse.io/zscaler/jobs/4684...
252,Senior Sales Engineer - CTEM,"Remote - Michigan, USA",https://boards.greenhouse.io/zscaler/jobs/4684...


In [4]:
for i in range(254):
    links=d['Link'][i]
    print(links)

https://boards.greenhouse.io/zscaler/jobs/4660023007
https://boards.greenhouse.io/zscaler/jobs/4544554007
https://boards.greenhouse.io/zscaler/jobs/4660622007
https://boards.greenhouse.io/zscaler/jobs/4650052007
https://boards.greenhouse.io/zscaler/jobs/4676877007
https://boards.greenhouse.io/zscaler/jobs/4649114007
https://boards.greenhouse.io/zscaler/jobs/4627670007
https://boards.greenhouse.io/zscaler/jobs/4613303007
https://boards.greenhouse.io/zscaler/jobs/4675890007
https://boards.greenhouse.io/zscaler/jobs/4633959007
https://boards.greenhouse.io/zscaler/jobs/4640362007
https://boards.greenhouse.io/zscaler/jobs/4658725007
https://boards.greenhouse.io/zscaler/jobs/4652722007
https://boards.greenhouse.io/zscaler/jobs/4676181007
https://boards.greenhouse.io/zscaler/jobs/4630518007
https://boards.greenhouse.io/zscaler/jobs/4548321007
https://boards.greenhouse.io/zscaler/jobs/4674235007
https://boards.greenhouse.io/zscaler/jobs/4518817007
https://boards.greenhouse.io/zscaler/jobs/4662

In [1]:
import time
import re
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------------------------------------------------------------------
# Helper function to gather text under a heading (until the next heading or end of section)
def get_section_text(driver, heading_element):
    """
    Gathers all text (e.g., paragraphs, lists) between the given heading_element
    and the next section heading.
    """
    text_list = []
    current_elem = heading_element
    
    # Get the parent element of the heading (usually a <p> tag)
    parent = driver.execute_script("return arguments[0].parentElement", current_elem)
    
    # Start from the parent's next sibling
    current_elem = parent
    
    while True:
        # Move to the next sibling
        next_elem = driver.execute_script("return arguments[0].nextElementSibling", current_elem)
        if not next_elem:
            # No more siblings, end of content
            break
            
        # Check if we've reached the next section (usually marked by another <p><strong> structure)
        try:
            if next_elem.tag_name.lower() == "p" and next_elem.find_element(By.TAG_NAME, "strong"):
                strong_text = next_elem.find_element(By.TAG_NAME, "strong").text.strip().lower()
                if "qualifications" in strong_text or "what" in strong_text:
                    break
        except:
            pass
            
        # Check if we've reached a different section type
        if next_elem.tag_name.lower() == "div" and "content-nav" in next_elem.get_attribute("class"):
            break
            
        # Otherwise, gather its text
        text_list.append(next_elem.text.strip())
        current_elem = next_elem
        
    # Join all pieces of text
    return "\n".join(t for t in text_list if t)

# -------------------------------------------------------------------------------------
# Main script
def main():
    # 1. Read your CSV
    df = pd.read_csv('zscaler_jobs_all_20250320_195046.csv')
    print("Original DataFrame:\n", df.head())
    
    # 2. Drop the 'Department' column
    d = df.drop(columns='Department')
    print("\nAfter dropping 'Department':\n", d.head())
    
    # 3. Create new columns to store extracted sections
    d['Minimum Qualifications'] = ""
    d['Preferred Qualifications'] = ""
    
    # 4. Set up Selenium Chrome driver
    options = webdriver.ChromeOptions()
    # Uncomment if you want headless mode:
    # options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        # 5. Process only the first 5 URLs (or fewer if the DataFrame has fewer rows)
        for i in range(min(5, len(d))):
            link = d.loc[i, 'Link']
            print(f"\nProcessing {i+1}/5: {link}")
            
            # Open the job link
            driver.get(link)
            time.sleep(3)  # Adjust if needed, or use WebDriverWait
            
            # 6. Instead of looking for h3 elements, find all <strong> elements
            strong_elements = driver.find_elements(By.TAG_NAME, "strong")
            
            # Placeholders for the extracted text
            min_qual_text = ""
            pref_qual_text = ""
            
            # 7. Check each strong element's text to see if it matches the sections we need
            for strong in strong_elements:
                heading_text = strong.text.strip().lower()
                
                # Check for minimum qualifications heading
                if ("what we're looking for" in heading_text or "minimum qualifications" in heading_text):
                    min_qual_text = get_section_text(driver, strong)
                    print(f"Found minimum qualifications: {min_qual_text[:100]}...")
                    
                # Check for preferred qualifications heading
                elif ("what will make you stand out" in heading_text or "preferred qualifications" in heading_text):
                    pref_qual_text = get_section_text(driver, strong)
                    print(f"Found preferred qualifications: {pref_qual_text[:100]}...")
            
            # Store the results back into the DataFrame
            d.at[i, 'Minimum Qualifications'] = min_qual_text
            d.at[i, 'Preferred Qualifications'] = pref_qual_text
            
    finally:
        # Close the browser when done
        driver.quit()
        
    # 8. Save final DataFrame to a new CSV with a test indicator
    output_filename = "zscaler_jobs_test_5_urls.csv"
    d.to_csv(output_filename, index=False, encoding='utf-8-sig')
    print(f"\nDone! Test results for first 5 URLs saved to '{output_filename}'.")
    print("\nSample of final DataFrame:")
    print(d.head())

# -------------------------------------------------------------------------------------
if __name__ == "__main__":
    main()

Original DataFrame:
                                                Title  \
0  Senior Machine Learning Engineer (New Jersey-b...   
1                          Customer Success Engineer   
2           Customer Success Engineer - WEST/CENTRAL   
3                           Customer Success Manager   
4                    Customer Success Manager - East   

                   Location     Department  \
0              Remote - USA  Not specified   
1            Remote - Japan  Not specified   
2  Remote - California, USA  Not specified   
3        Remote - Hong Kong  Not specified   
4    Remote - New York, USA  Not specified   

                                                Link  
0  https://boards.greenhouse.io/zscaler/jobs/4660...  
1  https://boards.greenhouse.io/zscaler/jobs/4544...  
2  https://boards.greenhouse.io/zscaler/jobs/4660...  
3  https://boards.greenhouse.io/zscaler/jobs/4650...  
4  https://boards.greenhouse.io/zscaler/jobs/4676...  

After dropping 'Department':
     

In [3]:
dp=pd.read_csv('zscaler_jobs_test_5_urls.csv')
dp.head(5)

Unnamed: 0,Title,Location,Link,Minimum Qualifications,Preferred Qualifications
0,Senior Machine Learning Engineer (New Jersey-b...,Remote - USA,https://boards.greenhouse.io/zscaler/jobs/4660...,2+ years experience as a Machine Learning Engi...,"Experience with cloud platforms such as AWS, A..."
1,Customer Success Engineer,Remote - Japan,https://boards.greenhouse.io/zscaler/jobs/4544...,4+ years of prior work experience in similar c...,"Knowledge of Internet and Routing Protocols, a..."
2,Customer Success Engineer - WEST/CENTRAL,"Remote - California, USA",https://boards.greenhouse.io/zscaler/jobs/4660...,5+ years work experience in similar customer f...,Expert (SME) in 2 or more areas among: (Intern...
3,Customer Success Manager,Remote - Hong Kong,https://boards.greenhouse.io/zscaler/jobs/4650...,"5+ years of experience in in Cloud Services, N...","Bachelor’s Degree in Engineering, Information ..."
4,Customer Success Manager - East,"Remote - New York, USA",https://boards.greenhouse.io/zscaler/jobs/4676...,"3+ years experience in a customer-facing role,...",A background in Internet and Security technolo...


In [4]:
dp['Minimum Qualifications'][0]

'2+ years experience as a Machine Learning Engineer or Data Scientist along with a Bachelor’s or advanced degree in Mathematics, Physics, Chemistry, Computer Science, Machine Learning, Statistics, or a related field, with 5+ years of applied experience in machine learning and data modeling.\nProficient in Python, SQL, and ML frameworks (TensorFlow, PyTorch, Scikit-learn), with expertise in statistical modeling techniques like regression, clustering, and decision trees.\nHands-on experience deploying ML models in production using modern tools, combined with strong data manipulation and analysis skills; familiarity with visualization tools like Matplotlib or Tableau.\nDemonstrated problem-solving abilities and capability to work independently on complex tasks.\nA high bias to action and a thirst to learn and grow.'

In [5]:
import time
import re
import pandas as pd
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------------------------------------------------------------------
# Helper function to clean and preprocess text
def preprocess_text(text):
    """
    Clean and preprocess qualification text.
    """
    if not text or pd.isna(text):
        return ""
    
    # Remove multiple newlines and replace with a single newline
    text = re.sub(r'\n+', '\n', text)
    
    # Remove list markers like ':marker' that might be in the text
    text = re.sub(r':\s*marker', '', text)
    
    # Remove any HTML tags that might have been captured
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove any excessive whitespace at the start and end of lines
    lines = [line.strip() for line in text.split('\n')]
    
    # Remove any empty lines
    lines = [line for line in lines if line]
    
    # Join back into a clean string
    cleaned_text = '\n'.join(lines)
    
    # Final overall strip to remove any leading/trailing whitespace
    return cleaned_text.strip()

# -------------------------------------------------------------------------------------
# Helper function to gather text under a heading (until the next heading or end of section)
def get_section_text(driver, heading_element):
    """
    Gathers all text (e.g., paragraphs, lists) between the given heading_element
    and the next section heading.
    """
    text_list = []
    current_elem = heading_element
    
    # Get the parent element of the heading (usually a <p> tag)
    parent = driver.execute_script("return arguments[0].parentElement", current_elem)
    
    # Start from the parent's next sibling
    current_elem = parent
    
    while True:
        # Move to the next sibling
        next_elem = driver.execute_script("return arguments[0].nextElementSibling", current_elem)
        if not next_elem:
            # No more siblings, end of content
            break
            
        # Check if we've reached the next section (usually marked by another <p><strong> structure)
        try:
            if next_elem.tag_name.lower() == "p" and next_elem.find_element(By.TAG_NAME, "strong"):
                strong_text = next_elem.find_element(By.TAG_NAME, "strong").text.strip().lower()
                if "qualifications" in strong_text or "what" in strong_text:
                    break
        except:
            pass
            
        # Check if we've reached a different section type
        if next_elem.tag_name.lower() == "div" and "content-nav" in next_elem.get_attribute("class"):
            break
            
        # Otherwise, gather its text
        text_list.append(next_elem.text.strip())
        current_elem = next_elem
        
    # Join all pieces of text
    raw_text = "\n".join(t for t in text_list if t)
    
    # Clean and preprocess the text
    return preprocess_text(raw_text)

# -------------------------------------------------------------------------------------
# Function to parse list items from HTML content
def extract_list_items(driver, heading_element):
    """
    Extract list items that follow a heading element, with improved structure.
    This provides better parsing of bullet points and list items.
    """
    items = []
    parent = driver.execute_script("return arguments[0].parentElement", heading_element)
    current_elem = parent
    
    while True:
        next_elem = driver.execute_script("return arguments[0].nextElementSibling", current_elem)
        if not next_elem:
            break
            
        # Check if we've reached the next section
        try:
            if next_elem.tag_name.lower() == "p" and next_elem.find_element(By.TAG_NAME, "strong"):
                strong_text = next_elem.find_element(By.TAG_NAME, "strong").text.strip().lower()
                if "qualifications" in strong_text or "what" in strong_text:
                    break
        except:
            pass
            
        # Check if we've reached a different section type
        if next_elem.tag_name.lower() == "div" and "content-nav" in next_elem.get_attribute("class"):
            break
            
        # If it's a list, extract its items properly
        if next_elem.tag_name.lower() == "ul":
            li_elements = next_elem.find_elements(By.TAG_NAME, "li")
            for li in li_elements:
                item_text = li.text.strip()
                if item_text:  # Only add non-empty items
                    items.append(f"• {item_text}")
        else:
            # For non-list elements, just add the text
            text = next_elem.text.strip()
            if text:
                items.append(text)
                
        current_elem = next_elem
    
    # Join and clean
    text = "\n".join(items)
    return preprocess_text(text)

# -------------------------------------------------------------------------------------
# Main script
def main():
    # 1. Read your CSV
    df = pd.read_csv('zscaler_jobs_all_20250320_195046.csv')
    print("Original DataFrame:\n", df.head())
    
    # 2. Drop the 'Department' column
    d = df.drop(columns='Department')
    print("\nAfter dropping 'Department':\n", d.head())
    
    # 3. Create new columns to store extracted sections
    d['Minimum Qualifications'] = ""
    d['Preferred Qualifications'] = ""
    
    # 4. Set up Selenium Chrome driver
    options = webdriver.ChromeOptions()
    # Uncomment if you want headless mode:
    # options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        # 5. Process only the first 5 URLs (or fewer if the DataFrame has fewer rows)
        for i in range(min(5, len(d))):
            link = d.loc[i, 'Link']
            print(f"\nProcessing {i+1}/5: {link}")
            
            # Open the job link
            driver.get(link)
            time.sleep(3)  # Adjust if needed, or use WebDriverWait
            
            # 6. Instead of looking for h3 elements, find all <strong> elements
            strong_elements = driver.find_elements(By.TAG_NAME, "strong")
            
            # Placeholders for the extracted text
            min_qual_text = ""
            pref_qual_text = ""
            
            # 7. Check each strong element's text to see if it matches the sections we need
            for strong in strong_elements:
                heading_text = strong.text.strip().lower()
                
                # Check for minimum qualifications heading
                if ("what we're looking for" in heading_text or "minimum qualifications" in heading_text):
                    # Use the better list extraction method
                    min_qual_text = extract_list_items(driver, strong)
                    print(f"Found minimum qualifications: {min_qual_text[:100]}...")
                    
                # Check for preferred qualifications heading
                elif ("what will make you stand out" in heading_text or "preferred qualifications" in heading_text):
                    # Use the better list extraction method
                    pref_qual_text = extract_list_items(driver, strong)
                    print(f"Found preferred qualifications: {pref_qual_text[:100]}...")
            
            # Store the results back into the DataFrame
            d.at[i, 'Minimum Qualifications'] = min_qual_text
            d.at[i, 'Preferred Qualifications'] = pref_qual_text
            
            # Print a sample of the cleaned data
            print(f"\nCleaned Minimum Qualifications (sample):")
            if min_qual_text:
                print("\n".join(min_qual_text.split("\n")[:3]) + "\n...")
            else:
                print("None found")
                
            print(f"\nCleaned Preferred Qualifications (sample):")
            if pref_qual_text:
                print("\n".join(pref_qual_text.split("\n")[:3]) + "\n...")
            else:
                print("None found")
            
    finally:
        # Close the browser when done
        driver.quit()
        
    # 8. Save final DataFrame to a new CSV with a test indicator
    output_filename = "zscaler_jobs_cleaned_qualifications.csv"
    d.to_csv(output_filename, index=False, encoding='utf-8-sig')
    print(f"\nDone! Cleaned qualifications saved to '{output_filename}'.")
    print("\nSample of final DataFrame:")
    print(d.head())

# -------------------------------------------------------------------------------------
if __name__ == "__main__":
    main()

Original DataFrame:
                                                Title  \
0  Senior Machine Learning Engineer (New Jersey-b...   
1                          Customer Success Engineer   
2           Customer Success Engineer - WEST/CENTRAL   
3                           Customer Success Manager   
4                    Customer Success Manager - East   

                   Location     Department  \
0              Remote - USA  Not specified   
1            Remote - Japan  Not specified   
2  Remote - California, USA  Not specified   
3        Remote - Hong Kong  Not specified   
4    Remote - New York, USA  Not specified   

                                                Link  
0  https://boards.greenhouse.io/zscaler/jobs/4660...  
1  https://boards.greenhouse.io/zscaler/jobs/4544...  
2  https://boards.greenhouse.io/zscaler/jobs/4660...  
3  https://boards.greenhouse.io/zscaler/jobs/4650...  
4  https://boards.greenhouse.io/zscaler/jobs/4676...  

After dropping 'Department':
     