# Scrape data from Indeed

### Import necessary modules

In [1]:
import re 
import time
import requests
import math
import random
from csv import writer
from bs4 import BeautifulSoup
from lxml import etree as et
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

### Function definitions

In [2]:
# Initialize WebDriver

def initialize_driver():
    """Start a Chrome session and return its WebDriver handle.

    The chromedriver binary is resolved through
    ``ChromeDriverManager().install()`` and Chrome is launched with a
    default, unmodified ``ChromeOptions`` object.
    """
    chrome_options = webdriver.ChromeOptions()
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    
# Function to get DOM from the given URL
def get_dom(url):
    """Load *url* in the shared browser and return the page as an lxml tree.

    The page source is parsed with BeautifulSoup and then re-parsed by
    ``lxml.etree.HTML`` so that callers can run XPath queries on the result.

    Args:
        url: Address to open with the module-level ``driver``.

    Returns:
        The lxml element tree of the loaded page, or ``None`` when the
        browser raised ``WebDriverException``. In that case the module-level
        ``driver`` is replaced by a fresh one from ``initialize_driver()``
        and the page is NOT retried, so callers must handle ``None``.
    """
    global driver
    try:
        driver.get(url)
        # Random 2-5 second pause after navigation, before reading the source.
        time.sleep(random.uniform(2, 5))
        page_content = driver.page_source
        product_soup = BeautifulSoup(page_content, 'html.parser')
        return et.HTML(str(product_soup))
    except WebDriverException as e:
        print("WebDriver disconnected, restarting the browser:", e)
        try:
            driver.quit()
        except Exception as quit_error:
            # The session is already broken; a failing quit() must not stop
            # us from installing a replacement driver below.
            print("Ignoring error while closing the old browser:", quit_error)
        driver = initialize_driver()
        return None

# Functions to extract job details (job link, description, title, company name, location, and salary)
def get_job_link(job):
    """Return the href of the first ``h2/a`` anchor found under *job*.

    Falls back to the string 'Not available' when the lookup raises for any
    reason (for example, when no anchor matches).
    """
    try:
        hrefs = job.xpath('./descendant::h2/a/@href')
        link = hrefs[0]
    except Exception:
        link = 'Not available'
    return link

def get_job_desc(job_link):
    """Open *job_link* and return the text of its job description.

    All text nodes under the element with id ``jobDescriptionText`` are
    joined with single spaces and stripped. Returns 'Not available' when no
    text is found or when querying the page raises (including the case where
    ``get_dom`` returned ``None``).
    """
    job_dom = get_dom(job_link)
    try:
        text_nodes = job_dom.xpath('//*[@id="jobDescriptionText"]//text()')
        if not text_nodes:
            return 'Not available'
        return " ".join(text_nodes).strip()
    except Exception:
        return 'Not available'

def get_job_title(job):
    """Return the first text node of ``h2/a/span`` under *job*.

    Any failure while querying or indexing yields 'Not available'.
    """
    try:
        return job.xpath('./descendant::h2/a/span/text()')[0]
    except Exception:
        return 'Not available'

def get_company_name(job):
    """Return the text of the first ``span[data-testid="company-name"]`` in *job*.

    Any failure while querying or indexing yields 'Not available'.
    """
    try:
        matches = job.xpath('.//span[@data-testid="company-name"]/text()')
        return matches[0]
    except Exception:
        return 'Not available'

def get_company_location(job):
    """Return the text of the first ``div[data-testid="text-location"]`` in *job*.

    Any failure while querying or indexing yields 'Not available'.
    """
    try:
        matches = job.xpath('.//div[@data-testid="text-location"]/text()')
        return matches[0]
    except Exception:
        return 'Not available'

def get_salary(job_link):
    """Open *job_link* and return the salary / job-type text shown on it.

    All text nodes under the element with id ``salaryInfoAndJobType`` are
    joined with single spaces and stripped. Returns 'Not available' when no
    text is found or when querying the page raises (including the case where
    ``get_dom`` returned ``None``).
    """
    job_dom = get_dom(job_link)
    try:
        text_nodes = job_dom.xpath('//*[@id="salaryInfoAndJobType"]//text()')
        if not text_nodes:
            return 'Not available'
        return " ".join(text_nodes).strip()
    except Exception:
        return 'Not available'

# Function to estimate the total number of pages for a specific search
# NOTE: the triple-quoted block below is a bare string literal, not live code.
# It holds an earlier get_total_pages that took no driver argument and parsed
# the count by splitting the text on '+'. Nothing inside it is executed.
'''
def get_total_pages(job_keyword, location_keyword, base_url):
    url = f"{base_url}/jobs?q={job_keyword}&l={location_keyword}"
    driver.get(url)
    try:
        job_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "jobsearch-JobCountAndSortPane-jobCount")]'))
        )
        job_count_text = job_count_element.text
        job_count = int(job_count_text.split('+')[0].replace(',', '').strip())  # Handle commas and extra spaces
        jobs_per_page = 15
        total_pages = math.ceil(job_count / jobs_per_page)
        return total_pages
    except Exception as e:
        print(f"Error extracting job count: {e}")
        return 0
'''


def get_total_pages(driver, job_keyword, location_keyword, base_url, jobs_per_page=15):
    """Estimate how many result pages a keyword/location search has.

    Opens the search URL, waits up to 10 seconds for the job-count element,
    extracts every digit from its text and divides the resulting number by
    ``jobs_per_page``, rounding up.

    Args:
        driver: Selenium WebDriver used to load the search page.
        job_keyword: Value inserted as the ``q`` query parameter.
        location_keyword: Value inserted as the ``l`` query parameter.
        base_url: Site root, e.g. ``https://it.indeed.com``.
        jobs_per_page: Listings assumed per result page (default 15).

    Returns:
        The estimated page count, or 0 when the count is zero, cannot be
        parsed, or any error occurs while loading or reading the page.
    """
    url = f"{base_url}/jobs?q={job_keyword}&l={location_keyword}"
    try:
        # Navigation is inside the try so a failed page load follows the same
        # "report and return 0" path as a missing job-count element.
        driver.get(url)
        job_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "jobsearch-JobCountAndSortPane-jobCount")]'))
        )
        job_count_text = job_count_element.text
        print(f"Job count text: {job_count_text}")

        # Joining all digit groups keeps numbers written with thousands
        # separators intact (e.g. "1,234" or "1.234" -> 1234).
        digit_groups = re.findall(r'\d+', job_count_text)
        if not digit_groups:
            print(f"Error extracting job count: no digits in {job_count_text!r}")
            return 0
        job_count = int(''.join(digit_groups))
        print(f"Parsed job count: {job_count}")

        # If job count is 0, there is nothing to paginate.
        if job_count == 0:
            print("No jobs found.")
            return 0

        # Calculate total number of pages, rounding up.
        return math.ceil(job_count / jobs_per_page)

    except Exception as e:
        print(f"Error extracting job count: {e}")
        return 0

### Job keyword and country definitions

In [3]:
# Search terms for the job query; spaces are already written as '+'.
job_search_keywords = [
    'Data+Analyst',
    'Data+Scientist',
    'Product+Analyst',
    'BI+Analyst',
]

# Per-country settings: the Indeed site root and the cities to search there.
country_configs = dict(
    USA=dict(
        base_url='https://www.indeed.com',
        cities=['New+York, NY', 'Los+Angeles, CA', 'Chicago, IL'],
    ),
    France=dict(
        base_url='https://www.indeed.fr',
        cities=['Paris (75)', 'Marseille (13)', 'Lyon (69)'],
    ),
    Italy=dict(
        base_url='https://it.indeed.com',
        cities=['Roma, Lazio', 'Milano, Lombardia', 'Napoli, Campania'],
    ),
    Sweden=dict(
        base_url='https://se.indeed.com',
        cities=['Stockholm', 'Göteborg', 'Malmö'],
    ),
)

### Run this cell to select a country

In [12]:
# Show the available countries as a 1-based menu.
country_names = list(country_configs)
print("Select a country from the following list:")
for i, country in enumerate(country_names):
    print(f"{i + 1}. {country}")

# Keep asking until the answer is a number that maps to a listed country.
# Without the range check, entering 0 would give index -1 and silently pick
# the last country; non-numeric input would stop the cell with a traceback.
while True:
    raw_choice = input("Enter the number of your choice: ")
    try:
        country_choice = int(raw_choice) - 1
    except ValueError:
        print(f"'{raw_choice}' is not a number. Please enter a value between 1 and {len(country_names)}.")
        continue
    if 0 <= country_choice < len(country_names):
        break
    print(f"Please enter a value between 1 and {len(country_names)}.")

selected_country = country_names[country_choice]
print(f"You have selected: {selected_country}")

# Fetch the country-specific configurations
selected_country_config = country_configs[selected_country]
base_url = selected_country_config['base_url']
location_search_keywords = selected_country_config['cities']

Select a country from the following list:
1. USA
2. France
3. Italy
4. Sweden


Enter the number of your choice:  3


You have selected: Italy


### Main script
Scrapes every keyword/city combination for the selected country and saves the results to a CSV file.

In [13]:
# Initialize WebDriver
driver = initialize_driver()

# try/finally so the browser is closed even when scraping stops with an error.
try:
    # Open CSV file to write job listings data
    with open(f'indeed_jobs_{selected_country}.csv', 'w', newline='', encoding='utf-8') as f:
        theWriter = writer(f)
        heading = ['page', 'country', 'job_link', 'search_keyword', 'search_location', 'job_title', 'company_name', 'company_location', 'salary', 'job_description']
        theWriter.writerow(heading)

        # Loop through job keywords and locations for the selected country
        for job_keyword in job_search_keywords:
            for location_keyword in location_search_keywords:
                print(f"Searching for: {job_keyword} in {location_keyword} ({selected_country})")

                # Get total number of pages for the job search
                total_pages = get_total_pages(driver, job_keyword, location_keyword, base_url)
                print(f"Total pages found in {location_keyword} in {selected_country}: {total_pages}")

                for page_no in range(total_pages):  # Modify range to get more or fewer pages if needed
                    print(f"Fetching page {page_no + 1} for {job_keyword} in {location_keyword} ({selected_country})")
                    # NOTE(review): `start` advances by 10 per page while
                    # get_total_pages assumes 15 listings per page - confirm
                    # the intended step.
                    url = f"{base_url}/jobs?q={job_keyword}&l={location_keyword}&start={page_no * 10}"
                    page_dom = get_dom(url)

                    # get_dom returns None after a browser failure (it has
                    # already restarted the driver); skip this page instead of
                    # crashing on None.xpath.
                    if page_dom is None:
                        print(f"Skipping page {page_no + 1}: the page could not be loaded")
                        continue

                    # Extract jobs from the current page
                    jobs = page_dom.xpath('//div[@class="job_seen_beacon"]')
                    print(f"Jobs found on page {page_no + 1}: {len(jobs)}")

                    for job in jobs:
                        relative_link = get_job_link(job)
                        job_title = get_job_title(job)
                        company_name = get_company_name(job)
                        company_location = get_company_location(job)

                        if relative_link == 'Not available':
                            # No link on the card: do not build a bogus URL
                            # from the placeholder and do not load any page.
                            job_link = 'Not available'
                            salary = 'Not available'
                            job_desc = 'Not available'
                        else:
                            job_link = base_url + relative_link
                            salary = get_salary(job_link)
                            job_desc = get_job_desc(job_link)

                        record = [page_no + 1, selected_country, job_link, job_keyword, location_keyword, job_title, company_name, company_location, salary, job_desc]
                        theWriter.writerow(record)  # Write the record to CSV
                        time.sleep(random.uniform(2, 5))  # Sleep for a random time between 2 to 5 seconds
finally:
    # Close the web browser. `driver` is looked up here, so this closes the
    # current browser even if get_dom replaced it after a failure.
    driver.quit()

Searching for: Data+Analyst in Roma, Lazio (Italy)
Job count text: Più di 100 annunci
Parsed job count: 100
Total pages found in Roma, Lazio in Italy: 7
Fetching page 1 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 1: 15
Fetching page 2 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 2: 15
Fetching page 3 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 3: 15
Fetching page 4 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 4: 15
Fetching page 5 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 5: 15
Fetching page 6 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 6: 15
Fetching page 7 for Data+Analyst in Roma, Lazio (Italy)
Jobs found on page 7: 13
Searching for: Data+Analyst in Milano, Lombardia (Italy)
Job count text: Più di 300 annunci
Parsed job count: 300
Total pages found in Milano, Lombardia in Italy: 20
Fetching page 1 for Data+Analyst in Milano, Lombardia (Italy)
Jobs found on page 1: 15
Fetching page 2 for Data+An