In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException, ElementNotInteractableException
from webdriver_manager.chrome import ChromeDriverManager
import time
import os
import random
import undetected_chromedriver as uc

# Scrape job info from Indeed

In [None]:
# ===== CONFIGURATION =====
# Search parameters
# JOB_TITLE and DAYS_AGO are the two values handed to get_search_url() to build the search URL.
JOB_TITLE = "Data Scientist"
DAYS_AGO = 7
# NOTE(review): WAIT_TIME is not referenced by any cell visible in this notebook.
WAIT_TIME = 2  # seconds to wait for page load

# Browser settings
# NOTE(review): WINDOW_SIZE is not referenced by any visible cell; setup_driver() calls
# driver.maximize_window() rather than passing a window size.
WINDOW_SIZE = "1920,1080"

# ===== WEBDRIVER SETUP =====
def setup_driver(version_main=130):
    """Start an undetected-chromedriver Chrome session and maximize its window.

    Args:
        version_main: Major version of the installed Chrome browser, forwarded
            to ``uc.Chrome``. Defaults to 130, the value this notebook was
            written against; pass the major version of your own Chrome install
            when it differs so the matching driver is downloaded.

    Returns:
        The running ``uc.Chrome`` driver instance.
    """
    options = uc.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-popup-blocking')

    # driver_executable_path=None lets undetected-chromedriver fetch a driver
    # binary itself instead of using a pre-installed one.
    driver = uc.Chrome(
        options=options,
        driver_executable_path=None,
        version_main=version_main,
    )

    driver.maximize_window()
    return driver

def get_search_url(job_title, days):
    """Build the Indeed search URL for a job title posted within ``days`` days.

    Args:
        job_title: Free-text search query, e.g. ``"Data Scientist"``.
        days: Value placed in the ``fromage`` query parameter.

    Returns:
        The search URL as a string. Whitespace between words becomes ``+`` and
        every other reserved character (``+``, ``&``, ``#``, ...) is
        percent-encoded so it cannot corrupt the query string.
    """
    from urllib.parse import quote_plus

    # Collapse runs of whitespace first so each word gap yields exactly one '+'.
    query = quote_plus(" ".join(job_title.split()))
    return f"https://www.indeed.com/jobs?q={query}&fromage={days}"

def scrape_job_card(card, driver):
    """Read one job card into a dict.

    Title, company and link are mandatory; location and salary are filled in
    when present and set to None otherwise. Returns None when a mandatory
    field cannot be located or is empty, or when any exception is raised.
    """
    try:
        # Block until an element with the job-card class exists in the page
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "job_seen_beacon"))
        )

        # Mandatory fields: each has a primary locator and a fallback
        mandatory = {
            'title': {
                'primary': (By.CSS_SELECTOR, "a.jcs-JobTitle"),
                'backup': (By.CSS_SELECTOR, "[class*='title']")
            },
            'company': {
                'primary': (By.CSS_SELECTOR, "[data-testid='company-name']"),
                'backup': (By.CSS_SELECTOR, ".companyName")
            },
            'link': {
                'primary': (By.CSS_SELECTOR, "a.jcs-JobTitle"),
                'backup': (By.CSS_SELECTOR, "a[class*='jobTitle']")
            }
        }

        # Nice-to-have fields
        extras = {
            'location': {
                'primary': (By.CSS_SELECTOR, "[data-testid='text-location']"),
                'backup': (By.CSS_SELECTOR, ".companyLocation")
            },
            'salary': {
                'primary': (By.CSS_SELECTOR, "div[class*='salary-snippet-container']"),
                'backup': (By.CSS_SELECTOR, ".salary-snippet")
            }
        }

        def _lookup(locators):
            """Try the primary locator, then the backup; None when neither matches."""
            for which in ('primary', 'backup'):
                try:
                    return card.find_element(*locators[which])
                except NoSuchElementException:
                    pass
            return None

        record = {}

        # Mandatory fields: stop at the first one that cannot be located
        for name, locators in mandatory.items():
            node = _lookup(locators)
            if node is None:
                print(f"Missing required field: {name}")
                return None
            if node:
                if name == 'link':
                    record[name] = node.get_attribute('href')
                else:
                    record[name] = node.text.strip()

        # A located-but-empty mandatory field also disqualifies the card
        if not all(record.get(name) for name in mandatory.keys()):
            return None

        # Optional fields never disqualify the card
        for name, locators in extras.items():
            node = _lookup(locators)
            record[name] = node.text.strip() if node else None

        print(f"Scraped: {record['title']} at {record['company']}")
        return record

    except Exception as e:
        print(f"Failed to scrape a job card: {str(e)}")
        return None

def scrape_page(driver):
    """Collect every job card on the page currently loaded in ``driver``.

    Returns a DataFrame with columns title, company, location, salary, link
    (empty, but with those columns, when no card could be scraped).
    """
    column_order = ['title', 'company', 'location', 'salary', 'link']

    # Random pause of 2-5 seconds before touching the page
    time.sleep(random.uniform(2, 5))

    # Step down the page in tenths; repeat while the document keeps growing
    page_height = driver.execute_script("return document.body.scrollHeight")
    still_growing = True
    while still_growing:
        for step in range(10):
            driver.execute_script(f"window.scrollTo(0, {step * page_height/10});")
            time.sleep(random.uniform(0.1, 0.3))

        measured_height = driver.execute_script("return document.body.scrollHeight")
        still_growing = measured_height != page_height
        page_height = measured_height

    # Random pause before reading the cards
    time.sleep(random.uniform(1, 3))

    cards = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
    print(f"Found {len(cards)} job cards on the page")

    records = []
    for card in cards:
        # Roughly three times in ten, hover the card first
        if random.random() < 0.3:
            webdriver.ActionChains(driver).move_to_element(card).perform()
            time.sleep(random.uniform(0.5, 1.5))

        record = scrape_job_card(card, driver)
        if record:
            records.append(record)

        # Short random pause between cards
        time.sleep(random.uniform(0.5, 1.5))

    if not records:
        return pd.DataFrame(columns=column_order)

    # Build the frame and put the columns in the fixed order
    return pd.DataFrame(records)[column_order]

def handle_captcha(driver):
    """Ask the user to solve the CAPTCHA by hand, wait for Enter, then pause 5 seconds.

    Always returns True.
    """
    instructions = (
        "\n>>> CAPTCHA detected! Please:",
        "1. Solve the CAPTCHA",
        "2. Wait for the page to fully load",
        "3. Press Enter ONLY after you see the job listings...",
    )
    for line in instructions:
        print(line)

    # Blocks until the user presses Enter
    input()

    # Extra settling time after the user confirms
    time.sleep(5)
    return True

def is_captcha_present(driver):
    """Return True when the URL or the page contains a CAPTCHA marker."""
    # All three probes are evaluated before combining them
    url_mentions_captcha = "captcha" in driver.current_url.lower()
    has_captcha_page_id = len(driver.find_elements(By.ID, "captcha-page")) > 0
    has_captcha_class = len(driver.find_elements(By.CSS_SELECTOR, "[class*='captcha']")) > 0
    return url_mentions_captcha or has_captcha_page_id or has_captcha_class

def scrape_with_captcha_handling(driver, max_retries=3):
    """Run scrape_page, pausing for a manual CAPTCHA solve first when one is showing.

    Each failed attempt (unhandled CAPTCHA or a scraping exception) uses up one
    retry. Returns the scraped DataFrame, or an empty DataFrame once
    ``max_retries`` attempts have failed.
    """
    failures = 0
    while failures < max_retries:
        if is_captcha_present(driver):
            if not handle_captcha(driver):
                print(f"CAPTCHA handling failed, attempt {failures + 1} of {max_retries}")
                failures += 1
                continue
            print("CAPTCHA solved successfully!")

        try:
            page_df = scrape_page(driver)
        except Exception as e:
            print(f"Error during scraping: {str(e)}")
            failures += 1
        else:
            return page_df

    # Every attempt failed
    return pd.DataFrame()

def click_next_page(driver):
    """Click the pagination "next" link and wait for the following page to load.

    Returns True after a successful navigation, False when a CAPTCHA could not
    be handled or any exception occurred (for example, no next link appeared).
    """
    try:
        # A CAPTCHA must be cleared before the pagination control is usable
        if is_captcha_present(driver) and not handle_captcha(driver):
            return False

        next_link = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-testid='pagination-page-next']"))
        )

        # Bring the link to the middle of the viewport with a smooth scroll
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", next_link)
        time.sleep(random.uniform(1, 2))

        # Hover the link before clicking
        webdriver.ActionChains(driver).move_to_element(next_link).perform()
        time.sleep(random.uniform(0.5, 1))

        # Native click first; fall back to a JavaScript click when it is blocked
        try:
            next_link.click()
        except (ElementClickInterceptedException, ElementNotInteractableException):
            driver.execute_script("arguments[0].click();", next_link)

        # Wait until the document reports it has finished loading
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Extra pause once the page has loaded
        time.sleep(random.uniform(3, 5))
        return True

    except Exception as e:
        print(f"Could not navigate to next page: {str(e)}")
        return False

def main(max_jobs=100):
    """Scrape Indeed search results page by page and save them to CSV.

    Opens Indeed, dismisses the cookie banner when one is shown, loads the
    search URL for JOB_TITLE / DAYS_AGO, then scrapes successive result pages
    until at least ``max_jobs`` unique postings (deduplicated on ``link``) are
    collected, a page yields nothing, or no next page can be reached.

    Args:
        max_jobs: Stop paging once this many unique jobs have been collected.
            Defaults to 100.

    Files written (current working directory):
        indeed_jobs_progress.csv     after every page that produced rows
        indeed_jobs_final.csv        at the end, when any job was collected
        indeed_jobs_error_backup.csv when an exception interrupts the run

    The browser is always closed, even when an error occurs.
    """
    driver = setup_driver()
    all_jobs_df = pd.DataFrame(columns=['title', 'company', 'location', 'salary', 'link'])
    page_number = 1

    try:
        # Visit the home page first, with a generous random pause
        driver.get("https://www.indeed.com")
        time.sleep(random.uniform(3, 5))

        # Accept cookies if present. Dismissing the banner is best-effort: a
        # missing or unclickable banner must not abort the whole scrape.
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
            )
            cookie_button.click()
            time.sleep(random.uniform(2, 3))
        except TimeoutException:
            # No clickable cookie banner appeared within 5 seconds
            pass
        except (ElementClickInterceptedException, ElementNotInteractableException) as e:
            print(f"Could not dismiss cookie banner, continuing: {str(e)}")

        url = get_search_url(JOB_TITLE, DAYS_AGO)
        print(f"Searching: {url}")

        driver.get(url)
        time.sleep(random.uniform(3, 5))

        while len(all_jobs_df) < max_jobs:
            print(f"\nScraping page {page_number}...")

            # Check for CAPTCHA before scraping
            if is_captcha_present(driver):
                if not handle_captcha(driver):
                    print("Failed to handle CAPTCHA, ending scraping")
                    break

            # Scrape with CAPTCHA handling
            df = scrape_with_captcha_handling(driver)

            if not df.empty:
                before_dedup = len(all_jobs_df)
                all_jobs_df = pd.concat([all_jobs_df, df], ignore_index=True)

                # Only deduplicate based on the URL
                all_jobs_df = all_jobs_df.drop_duplicates(subset='link', keep='first')
                print(f"Total unique jobs: {len(all_jobs_df)} (removed {before_dedup + len(df) - len(all_jobs_df)} duplicates)")

                # Save progress after each successful page
                all_jobs_df.to_csv('indeed_jobs_progress.csv', index=False)
            else:
                print("No jobs found on this page")
                break

            if not click_next_page(driver):
                print("No more pages available")
                break

            page_number += 1
            time.sleep(random.uniform(3, 5))

        if not all_jobs_df.empty:
            print("\nFinal Results:")
            print(all_jobs_df.head())
            all_jobs_df.to_csv('indeed_jobs_final.csv', index=False)
            print(f"\nData saved to 'indeed_jobs_final.csv' with {len(all_jobs_df)} jobs")
        else:
            print("No jobs data was collected")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Keep whatever was gathered before the failure
        if not all_jobs_df.empty:
            all_jobs_df.to_csv('indeed_jobs_error_backup.csv', index=False)
            print("Partial results saved to 'indeed_jobs_error_backup.csv'")

    finally:
        driver.quit()

if __name__ == "__main__":
    main()

# Analyze job descriptions with Llama

In [1]:
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
def setup_driver(version_main=130):
    """Start an undetected-chromedriver Chrome session and maximize its window.

    NOTE(review): a ``setup_driver`` is also defined in the scraping cell
    earlier in this notebook; whichever cell runs last wins.

    Args:
        version_main: Major version of the installed Chrome browser, forwarded
            to ``uc.Chrome``. Defaults to 130, the value this notebook was
            written against; pass the major version of your own Chrome install
            when it differs so the matching driver is downloaded.

    Returns:
        The running ``uc.Chrome`` driver instance.
    """
    options = uc.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-popup-blocking')

    # driver_executable_path=None lets undetected-chromedriver fetch a driver
    # binary itself instead of using a pre-installed one.
    driver = uc.Chrome(
        options=options,
        driver_executable_path=None,
        version_main=version_main,
    )

    driver.maximize_window()
    return driver

def get_job_description(driver, url, max_retries=3):
    """Load a job posting and return the text of its description element.

    Args:
        driver: Selenium driver used to load the page.
        url: Address of the job posting.
        max_retries: Maximum number of load attempts.

    Returns:
        The text of the ``#jobDescriptionText`` element, or None when every
        attempt failed (page error, element missing after 15 seconds, or empty
        text).
    """
    for attempt in range(max_retries):
        try:
            driver.get(url)
            # Initial wait for page load
            time.sleep(3 + random.uniform(1, 2))

            # Wait for job description
            description_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#jobDescriptionText"))
            )

            # Additional wait to ensure content is loaded
            time.sleep(1)

            text = description_element.text
            if not text:
                raise ValueError("Empty description text")

            return text
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for URL {url}: {str(e)}")
            if attempt < max_retries - 1:
                # Longer delay between retries. No explicit refresh here: the
                # next attempt's driver.get(url) reloads the page anyway, and a
                # refresh raised inside this handler would escape the retry
                # loop instead of being counted as a failed attempt.
                time.sleep(5)
    return None

def setup_llm():
    """Create the Ollama "llama3" client with stdout streaming; None on failure."""
    try:
        # Stream generated tokens to stdout as they arrive
        streaming_callbacks = CallbackManager([StreamingStdOutCallbackHandler()])
        return Ollama(
            model="llama3",
            callback_manager=streaming_callbacks,
            temperature=0.7,
        )
    except Exception as e:
        print(f"Error setting up Ollama: {str(e)}")
        print("Full error details:", e.__class__.__name__)
        return None
    
def analyze_with_llama(description, llm):
    """Ask the LLM to summarise a job description as structured JSON.

    Args:
        description: Full job description text, appended to the prompt.
        llm: Object exposing ``invoke(prompt)`` that returns the model's reply
            as a string.

    Returns:
        The parsed JSON object (a dict) taken from the span between the first
        ``{`` and the last ``}`` of the reply, or None when the reply contains
        no such span, the span is not valid JSON, or the call itself fails.
    """
    import json

    # The template shown to the model must itself be valid JSON; every key
    # therefore carries an empty placeholder value of the expected type.
    prompt = f"""Analyze this job description and return a JSON object with exactly this structure:
    {{
        "technical_skills": {{
            "programming_languages": [],
            "packages_frameworks": [],
            "tools_and_platforms": [],
            "quant_methods": []
        }},
        "requirements": {{
            "minimum_degree": "",
            "educational_field": [],
            "years_experience": "",
            "preferred_expertise": ""
        }},
        "job_level_type": {{
            "career_level": "",
            "work_arrangement": ""
        }},
        "industry_sector": "",
        "domain_expertise": []
    }}

    Job Description:
    {description}"""

    try:
        response = llm.invoke(prompt)
        # Models tend to wrap the JSON in prose, so cut from the first '{' to
        # the last '}' before parsing.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start >= 0 and json_end > json_start:
            return json.loads(response[json_start:json_end])

        print("No valid JSON found in response")
        return None
    except Exception as e:
        print(f"Error analyzing with Ollama: {str(e)}")
        return None
    
def main():
    """Enrich the scraped postings with their description and an LLM analysis.

    Reads ``indeed_jobs_final.csv``, visits every ``link``, stores the job
    description, asks the LLM for a structured analysis and writes the
    extracted fields into new columns. Progress is saved to
    ``enriched_jobs_data.csv`` after each job whose description was fetched,
    and the final table is written to ``analyzed_jobs.csv``.

    The browser is closed even when processing is interrupted by an error.
    The pre-created columns now include ``quant_methods`` (the key the
    analysis actually fills) in place of the never-written ``techniques``.
    """

    def _as_dict(value):
        """Return value when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def _as_text(value):
        """Flatten an LLM-supplied value (list, scalar or None) to one string."""
        if value is None:
            return ''
        if isinstance(value, (list, tuple, set)):
            return ', '.join(str(item) for item in value if item is not None)
        return str(value)

    # Read the CSV file
    df = pd.read_csv('indeed_jobs_final.csv')

    # Initialize the LLM
    llm = setup_llm()
    if llm is None:
        print("Failed to initialize LLaMA. Exiting.")
        return

    # Add new columns for the extracted information
    new_columns = [
        'job_description',
        'programming_languages',
        'packages_frameworks',
        'tools_and_platforms',
        'quant_methods',
        'minimum_degree',
        'educational_field',
        'years_experience',
        'preferred_expertise',
        'career_level',
        'work_arrangement',
        'industry_sector',
        'domain_expertise'
    ]

    for col in new_columns:
        df[col] = None

    # Initialize the webdriver; the finally below guarantees it is closed
    driver = setup_driver()
    try:
        # Process each job URL
        for idx, row in df.iterrows():
            print(f"\nProcessing job {idx + 1} of {len(df)}")
            print(f"URL: {row['link']}")

            # Get the job description
            description = get_job_description(driver, row['link'])

            if description:
                # Store the full description
                df.at[idx, 'job_description'] = description

                # Analyze with LLaMA
                analysis = analyze_with_llama(description, llm)

                if analysis:
                    try:
                        # The model does not always honour the requested
                        # types, so every section is coerced before use.
                        analysis = _as_dict(analysis)
                        tech_skills = _as_dict(analysis.get('technical_skills'))
                        requirements = _as_dict(analysis.get('requirements'))
                        job_level = _as_dict(analysis.get('job_level_type'))

                        df.at[idx, 'programming_languages'] = _as_text(tech_skills.get('programming_languages'))
                        df.at[idx, 'packages_frameworks'] = _as_text(tech_skills.get('packages_frameworks'))
                        df.at[idx, 'tools_and_platforms'] = _as_text(tech_skills.get('tools_and_platforms'))
                        df.at[idx, 'quant_methods'] = _as_text(tech_skills.get('quant_methods'))

                        df.at[idx, 'minimum_degree'] = _as_text(requirements.get('minimum_degree'))
                        df.at[idx, 'educational_field'] = _as_text(requirements.get('educational_field'))
                        df.at[idx, 'years_experience'] = _as_text(requirements.get('years_experience'))
                        df.at[idx, 'preferred_expertise'] = _as_text(requirements.get('preferred_expertise'))

                        df.at[idx, 'career_level'] = _as_text(job_level.get('career_level'))
                        df.at[idx, 'work_arrangement'] = _as_text(job_level.get('work_arrangement'))

                        df.at[idx, 'industry_sector'] = _as_text(analysis.get('industry_sector'))
                        df.at[idx, 'domain_expertise'] = _as_text(analysis.get('domain_expertise'))

                    except Exception as e:
                        print(f"Error processing analysis for job {idx + 1}: {str(e)}")

                # Save progress after each job
                df.to_csv('enriched_jobs_data.csv', index=False)
                print(f"Successfully processed job {idx + 1}")
            else:
                print(f"Failed to process job {idx + 1}")

            # Random delay between requests
            time.sleep(random.uniform(2, 4))
    finally:
        # Close the browser
        driver.quit()

    # Final save
    df.to_csv('analyzed_jobs.csv', index=False)
    print("\nProcessing complete! Check analyzed_jobs.csv for results.")

if __name__ == "__main__":
    main()

  llm = Ollama(
  llm = Ollama(



Processing job 1 of 105
URL: https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0C-qoG3_Og7rWvzN3HicE2rE9tMKr3pHcMFbL63Yv9qxS21M6OuqGulc9yJ2bP6suxAbuVv5GXij1RzyExnuvyXXcGEWX-n9nAw1ihd7s1Q96lYWde9AfHiKjbHcK91nDQ_3ni3wZc_jqwg5HiBv_v5HOVxnaVqVmjYKD93xcfnqXNKFmhKENgEHJanOm4EblNLuiVaLNudSqHQnKUOr3iW79LgQTKM6WCZUkd6oCFIWAne52t5YLf3zOGUeZAYSGUW4ILr_-wldv_ojI2JuYEJ5Z150PbpSosrnES-LLBiY21juTUtBrZYR9EekQ5qGj9yPGCuN9YlsjFTbsFve5oX_o4mH6kOk2OgSfnjHOWW5_fjRgEHCSrdF3-oGkQNUBYcvOhymysQu9B1-P42tlhhXnjLAp1XulhFxd2W3Im-ENPL90D_x-IfRUiH-SRM66UXmJbJZInnPiTmDThDXJf6QjJftBCcvAqdT1u_RvBDu4FsQQUbS5dsVrtcu7teYHcqLUwe8xs1MclcVZwRxGNA4mf96x-9pruMVQIg5C3uqYs5AZRzbQYD1FFcQZ2-zI9AY4i3XSO07pqsfePuP_UQEheca1mPHOJnF48cLGyXNX92zagAJzeMYtptyJgYh_XJFM2DoyPFUtsTwgHqQRqmhr3TKi4HRvn4dupvgzi8D0AAm4n99afxxduiBmPY70crn4su-w3LRztuRB13zxRrZGZ3ss6Zy6dkwdjJUZHYfmPHJbdWyRwSAZ6w4aZNsXbpiKBmuLqjhXkonkHcK99Eo-Vk-CMoywpFteinH9qnO4GH8Ipsuut0XZhjm0nsN2Zn6L1HLDR8133O74NXaADtcbtk332seVSZLMaFyfc_pfKEhxM7SJesJmo2LyaoSQRt_FSZy0Q5kWCLg-mzRjU1

  response = llm(prompt)


Here is the JSON object with the required structure:

{
"technical_skills": {
"programming_languages": ["SQL", "Python", "R"],
"ml_frameworks": [],
"cloud_platforms": [],
"big_data_tools": []
},
"education": {
"minimum_degree": "Bachelor's degree",
"preferred_fields": ["Data science", "Actuarial science", "Related field"]
},
"experience": {
"years_required": "Five or more years",
"specific_experience": ["Data science", "Actuarial science", "Data analysis"]
},
"job_level_type": {
"career_level": "",
"work_arrangement": "Hybrid and remote work arrangements"
},
"industry_sector": "Property-and-casualty insurance",
"domain_expertise": ["Commercial and personal lines"],
"business_problems": []
}

Note that some fields in the original job description, such as "Compensation overview" and "Culture and Total Rewards", are not relevant to the technical skills, education, experience, or job level type, so I did not include them in this JSON object.Successfully processed job 1

Processing job 2 of

# Clean the data

In [2]:
import pandas as pd
import ast
import re

def extract_location_components(location):
    """Split an Indeed location string into city, state and work type.

    Prefixes such as "Remote in " or "Hybrid work in " set the work type and
    are removed before parsing; a location that consists only of
    "Remote"/"Hybrid" (optionally followed by "work" or "position") is
    classified accordingly instead of being reported as "On-site". ZIP codes
    are dropped and the first "City, ST" pair (two capital letters) is used.

    Args:
        location: Raw location text, or NaN/None/"" when unknown.

    Returns:
        pandas Series with keys ``city``, ``state`` and ``work_type``. All
        three are None for missing input. When no "City, ST" pair is found,
        ``city`` holds the cleaned text and ``state`` is None.
    """
    # Handle NaN or empty values
    if pd.isna(location) or location == '':
        return pd.Series({'city': None, 'state': None, 'work_type': None})

    # Default when no remote/hybrid marker is found
    work_type = 'On-site'

    # Clean the location string first
    location = location.strip()
    lowered = location.lower()

    # Prefix variations that mark a remote or hybrid role
    remote_patterns = [
        r'remote\s+in\s+',
        r'remote\s+work\s+in\s+',
        r'remote\s+-\s+',
        r'remote\s+position\s+in\s+',
        r'remote\s+from\s+'
    ]

    hybrid_patterns = [
        r'hybrid\s+work\s+in\s+',
        r'hybrid\s+in\s+',
        r'hybrid\s+-\s+',
        r'hybrid\s+position\s+in\s+'
    ]

    # Check and clean remote patterns
    if any(re.search(pattern, lowered) for pattern in remote_patterns):
        work_type = 'Remote'
        for pattern in remote_patterns:
            location = re.sub(pattern, '', location, flags=re.IGNORECASE)

    # Check and clean hybrid patterns
    elif any(re.search(pattern, lowered) for pattern in hybrid_patterns):
        work_type = 'Hybrid'
        for pattern in hybrid_patterns:
            location = re.sub(pattern, '', location, flags=re.IGNORECASE)

    # Bare marker with no place attached, e.g. "Remote". Only the work type is
    # corrected; the text itself is left alone so the returned city is the
    # same as before.
    elif re.fullmatch(r'(?:remote|hybrid)(?:\s+(?:work|position))?', lowered):
        work_type = 'Remote' if lowered.startswith('remote') else 'Hybrid'

    # Remove ZIP codes if present
    location = re.sub(r'\s+\d{5}(?:-\d{4})?', '', location)

    # City, State pattern
    city_state_pattern = r'([^,]+),\s*([A-Z]{2})'

    # Try to match city and state pattern
    match = re.search(city_state_pattern, location)
    if match:
        city = match.group(1).strip()
        state = match.group(2).strip()
        return pd.Series({'city': city, 'state': state, 'work_type': work_type})

    # If no match, return original cleaned location as city and None for state
    return pd.Series({'city': location.strip(), 'state': None, 'work_type': work_type})

def clean_data(df):
    """Derive analysis-ready columns from the enriched job postings.

    Works on a copy, so the frame passed in is left untouched and the cell can
    be re-run safely.

    Columns added or replaced:
        city, state, location_work_type  parsed from ``location``
        programming_languages            list of trimmed, non-empty names
        salary_min, salary_max           floats from a "$a - $b" range
        salary_avg                       midpoint of the range
        is_remote, is_hybrid             1/0 flags from ``work_arrangement``
        min_years                        first number in ``years_experience``

    Args:
        df: DataFrame with at least the columns location,
            programming_languages, salary, work_arrangement and
            years_experience.

    Returns:
        A new DataFrame with the columns above added.
    """
    df = df.copy()

    # Extract city, state, and work type from location
    location_components = df['location'].apply(extract_location_components)
    df['city'] = location_components['city']
    df['state'] = location_components['state']
    df['location_work_type'] = location_components['work_type']

    # Turn the comma-separated language string into a list, trimming each name
    # and dropping blanks so a missing value becomes [] rather than [''].
    df['programming_languages'] = (
        df['programming_languages']
        .fillna('')
        .astype(str)
        .apply(lambda text: [part.strip() for part in text.split(',') if part.strip()])
    )

    # Extract salary range. astype(str) keeps the .str accessor usable even
    # when the column was read back as all-NaN floats.
    salary_bounds = df['salary'].astype(str).str.extract(r'\$(\d+,?\d*)\s*-\s*\$(\d+,?\d*)')
    df['salary_min'] = salary_bounds[0].str.replace(',', '', regex=False).astype(float)
    df['salary_max'] = salary_bounds[1].str.replace(',', '', regex=False).astype(float)
    df['salary_avg'] = (df['salary_min'] + df['salary_max']) / 2

    # Clean work arrangement
    work_arrangement = df['work_arrangement'].astype(str)
    df['is_remote'] = work_arrangement.str.contains('remote', case=False, na=False).astype(int)
    df['is_hybrid'] = work_arrangement.str.contains('hybrid', case=False, na=False).astype(int)

    # Extract minimum years of experience. The column may come back from CSV
    # as numbers, hence the cast; expand=False yields a Series, not a frame.
    df['min_years'] = (
        df['years_experience']
        .astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    )

    return df


# Load the table that the analysis step saves as 'analyzed_jobs.csv' and derive the cleaned columns.
df = pd.read_csv('analyzed_jobs.csv')
df_clean = clean_data(df)
# Bare last expression: the notebook renders the cleaned frame as this cell's output.
df_clean
# Export for Tableau is currently disabled.
#df_clean.to_csv('tableau_ready_jobs.csv', index=False)

Unnamed: 0,title,company,location,salary,link,job_description,programming_languages,ml_frameworks,cloud_platforms,big_data_tools,...,business_problems,city,state,location_work_type,salary_min,salary_max,salary_avg,is_remote,is_hybrid,min_years
0,Data Scientist III,Western National Group & Umialik Insurance,"Edina, MN 55435 \n(Pentagon area)","$100,100 - $150,200 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Description:\n\nWho are we?\nWestern National ...,"[SQL, Python, R]",,,,...,,Edina,MN,On-site,100100.0,150200.0,125150.0,1,1,
1,Data Scientist,SAP,"San Ramon, CA","$101,300 - $172,200 a year",https://www.indeed.com/rc/clk?jk=38e72807e5701...,"We help the world run better\n\nAt SAP, we ena...",[Python],"Tensorflow, Keras, scikit-learn","GCP, AWS, Azure",,...,,San Ramon,CA,On-site,101300.0,172200.0,136750.0,0,0,
2,Data Scientist,Insurity,Remote,"$65,000 - $120,000 a year",https://www.indeed.com/rc/clk?jk=5d623413f1266...,Who We Are\nInsurity’s vision is all about emp...,[],,,"statistical modeling software, data visualizat...",...,,Remote,,Remote,65000.0,120000.0,92500.0,0,0,1.0
3,"Data Scientist, New College Grad - 2025",Visa,"Hybrid work in Foster City, CA",,https://www.indeed.com/rc/clk?jk=7813733a5b300...,Company Description\n\nVisa is a world leader ...,[],"Python, SAS, R, Stata",,"SQL, Tableau",...,"Optimization and improvement of product, marke...",Foster City,CA,Hybrid,,,,0,1,2.0
4,Data Scientist (Remote),BD,"Remote in San Diego, CA",,https://www.indeed.com/rc/clk?jk=da617daea875a...,Job Description Summary\nBD is one of the larg...,"[Python, R]",,AWS,,...,"Improving clinical outcomes and processes, Opt...",San Diego,CA,Remote,,,,1,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Data Scientist II - Talent & DEI - Hybrid,Redfin,"Hybrid work in Seattle, WA 98101","$97,100 - $145,600 a year",https://www.indeed.com/rc/clk?jk=1d2f54e83b778...,This position is a hybrid role requiring emplo...,[R],,,,...,,Seattle,WA,Hybrid,97100.0,145600.0,121350.0,1,1,2.0
101,Data Scientist II (Sleep),WHOOP,"Boston, MA",,https://www.indeed.com/rc/clk?jk=70f88cde36be7...,WHOOP is an advanced health and fitness wearab...,[Python],,,,...,"Analyzing sleep data, Developing meaningful in...",Boston,MA,On-site,,,,0,0,2.0
102,Ai And Data Opportunity Seeker,Solix Technologies,"La Jolla, CA 92037",,https://www.indeed.com/rc/clk?jk=6198ca83e2a2b...,We would like to extend an exclusive invitatio...,[],,,,...,,La Jolla,CA,On-site,,,,0,0,
103,Data Scientist II,Black Eagle Defense,"Wahiawā, HI 96786","$115,000 - $125,000 a year",https://www.indeed.com/rc/clk?jk=ef9e92ca21eba...,"Job Description\n\nSALARY RANGE $115,000 - $12...",[Python],,,,...,,Wahiawā,HI,On-site,115000.0,125000.0,120000.0,0,0,3.0


In [23]:
# One-off scrape of the first results page.
# NOTE(review): process_data and display_results are not defined in any cell visible
# in this notebook; on a fresh kernel this cell would fall into the except branch
# with a NameError unless they are defined elsewhere. Confirm.
driver = setup_driver()
search_url = get_search_url(JOB_TITLE, DAYS_AGO)

try:
    # Navigate to page
    driver.get(search_url)
    
    # Scrape data
    jobs_data = scrape_page(driver)
    
    # Process results
    df = process_data(jobs_data)
    
    # Display results
    display_results(df)
    
except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Always close the browser, after a short pause, whether or not scraping succeeded
    time.sleep(5)  # Keep browser open briefly
    driver.quit()

Found 0 job cards on the page
No jobs were scraped successfully


In [None]:
# Paginated scrape: walk every results page and collect one dict per job card.
# NOTE(review): relies on `driver` and `search_url` left in the kernel by an earlier
# cell. The cell directly above quits its driver in `finally`, so a fresh
# driver probably has to be created before this runs. Confirm.
driver.get(search_url)

# Initialize list to store job data
jobs_data = []
page = 1

try:
    while True:
        # Wait for job cards to load
        time.sleep(2)  # Simple delay to ensure page loads
        
        # Get all job cards on current page
        job_cards = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
        
        for card in job_cards:
            try:
                # Any missing sub-element raises NoSuchElementException, which skips the whole card
                job_data = {
                    'title': card.find_element(By.CLASS_NAME, "jobTitle").text,
                    'company': card.find_element(By.CSS_SELECTOR, "[data-testid='company-name']").text,
                    'location': card.find_element(By.CSS_SELECTOR, "[data-testid='text-location']").text,
                    'date_posted': card.find_element(By.CLASS_NAME, "date").text,
                    # NOTE(review): href is read from the "jobTitle" element itself; verify that
                    # element is the anchor, otherwise this value will be None.
                    'link': card.find_element(By.CLASS_NAME, "jobTitle").get_attribute("href")
                }
                jobs_data.append(job_data)
            except NoSuchElementException:
                continue
        
        print(f"Scraped page {page} ({len(jobs_data)} jobs so far)")
        
        # Try to go to next page
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "[aria-label='Next Page']")
            # A disabled button ends the loop
            if not next_button.is_enabled():
                break
            next_button.click()
            page += 1
            time.sleep(2)
        except NoSuchElementException:
            # No "Next Page" control at all: this was the last page
            break
            
except Exception as e:
    print(f"An error occurred: {str(e)}")
    
finally:
    # The browser is closed whether the loop ended normally or with an error
    driver.quit()


# Convert to DataFrame
df = pd.DataFrame(jobs_data)

# Clean date posted
df['date_posted'] = df['date_posted'].str.lower().replace('posted', '', regex=True).str.strip()
df['date_posted'] = df['date_posted'].replace({'today': '0 days ago', 'just': '0 days ago'})

# Extract number of days
df['days_ago'] = df['date_posted'].str.extract('(\d+)').astype(float)

# Sort by days ago
df = df.sort_values('days_ago')