In [1]:
#         salary_scrape = browser.find_elements("css selector", "")
#         result = []
#         for salary in salary_scrape:
#             url = l.get_attribute("href")
#             name = l.find_element("css selector", "span:nth-child(3)").text
#             result.append({"name": name, "link": url})

In [2]:
import glob
import os
import re
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup

In [3]:
def name(offer_panel):
    """Return the position title of a job offer.

    Looks up ``h3.posting-title__position`` inside ``offer_panel`` and returns
    the stripped text of the first match, or an empty string when nothing
    matches.
    """
    headings = offer_panel.select("h3.posting-title__position")
    if not headings:
        return ""
    first_heading = headings[0]
    return first_heading.text.strip()

In [4]:
def company(offer_panel):
    """Return the company name of a job offer.

    Uses the first ``h4.company-name`` element found in ``offer_panel``; its
    text is stripped of surrounding whitespace. Returns an empty string when
    no such element exists.
    """
    matches = offer_panel.select("h4.company-name")
    return matches[0].text.strip() if matches else ""

In [5]:
def technology(offer_panel):
    """Return the offer's technology/category labels as one string.

    Every ``span`` whose ``data-cy`` attribute equals
    ``category name on the job offer listing`` contributes its stripped text;
    the pieces are joined with single spaces. Yields an empty string when no
    span matches.
    """
    labels = []
    for tag in offer_panel.select("span[data-cy='category name on the job offer listing']"):
        labels.append(tag.text.strip())
    return " ".join(labels)

In [6]:
def location(offer_panel):
    """Extract the city and country of a job offer.

    The first ``nfj-posting-item-city`` element inside ``offer_panel`` is
    inspected:

    * text containing ``ZDALN`` (case-insensitive) -> ``ZDALNA`` / ``N/A``;
    * text of the form ``"<city>, <country>"`` -> the first two
      comma-separated parts, stripped;
    * otherwise the city is taken from the first nested ``span`` and the
      country is set to ``"PL"``. When there is no nested ``span`` the
      element's own stripped text is used as the city instead of raising
      ``IndexError``.

    Returns:
        dict with keys ``"city"`` and ``"country"``; both are empty strings
        when the panel has no location element.
    """
    city = ""
    country = ""
    loc_elements = offer_panel.select("nfj-posting-item-city")
    if loc_elements:
        loc_element = loc_elements[0]
        loc = loc_element.text.strip()
        if "ZDALN" in loc.upper():
            city = "ZDALNA"
            country = "N/A"
        else:
            city_country = loc.split(",")
            if len(city_country) > 1:
                city = city_country[0].strip()
                country = city_country[1].strip()
            else:
                city_span = loc_element.select("span")
                # Fall back to the element's own text when there is no
                # nested <span>; indexing an empty result used to crash here.
                city = city_span[0].text.strip() if city_span else loc
                country = "PL"

    return {
        "city": city,
        "country": country,
    }

In [7]:
def salary(offer_panel):
    """Extract the salary range and currency of a job offer.

    The text of the first ``nfj-posting-item-salary`` element is expected to
    look like ``"<low> - <high> <currency>"`` (an en dash is accepted as the
    separator). Digits may be grouped with any whitespace, e.g. ``"10 000"``.

    Returns:
        dict with keys ``"low"`` (float), ``"high"`` (float) and ``"curr"``
        (str). When the element is missing or its text does not match the
        expected format, the defaults ``0.0``, ``0.0`` and ``"PLN"`` are
        returned.
    """
    s_low = 0.0
    s_high = 0.0
    s_curr = "PLN"
    salary_elements = offer_panel.select("nfj-posting-item-salary")
    if salary_elements:
        sal = salary_elements[0].text.strip().replace("–", "-").replace("\xa0", "")
        # Each amount must start with a digit, and the currency must consist
        # of letters only so it cannot swallow the last digit of the upper
        # bound when no currency follows (e.g. "10000-15000").
        m = re.match(r"\s*(\d[\d\s]*)\s*-\s*(\d[\d\s]*)\s*([^\W\d_]+)", sal)
        if m:
            try:
                # Drop thousands separators: float("10 000") would raise.
                low = float(re.sub(r"\s+", "", m.group(1)))
                high = float(re.sub(r"\s+", "", m.group(2)))
            except ValueError:
                # Unparseable amounts: keep the defaults, best effort.
                pass
            else:
                # Assign together so a partial parse never leaks out.
                s_low = low
                s_high = high
                s_curr = m.group(3).strip()
    return {
        "low": s_low,
        "high": s_high,
        "curr": s_curr,
    }

In [8]:
def scrape_single_offer(offer_panel, job):
    """Collect all scraped fields of one offer into a single record.

    Args:
        offer_panel: parsed element of a single offer, passed on to the
            field extractors ``name``, ``company``, ``technology``,
            ``location`` and ``salary``.
        job: value stored unchanged under the ``"job"`` key.

    Returns:
        dict with keys ``name``, ``company``, ``technology``, ``job``,
        ``location`` and ``salary`` (in that order).
    """
    offer = {}
    offer["name"] = name(offer_panel)
    offer["company"] = company(offer_panel)
    offer["technology"] = technology(offer_panel)
    offer["job"] = job
    offer["location"] = location(offer_panel)
    offer["salary"] = salary(offer_panel)
    return offer

In [9]:
def scrape_offers(html_file_path: str):
    """Parse one saved listing page and return the offers found in it.

    The job label attached to every offer is the file name without its
    directory and extension (``..\\data\\raw\\data analyst.html`` ->
    ``data analyst``). It is derived with ``os.path`` so it works for any
    directory and path separator, not only the ``..\\data\\raw\\`` prefix.

    Args:
        html_file_path: path to a UTF-8 encoded HTML file.

    Returns:
        list of offer dicts as produced by ``scrape_single_offer``.

    Side effects:
        Prints the number of offers found and the number decoded.
    """
    job = os.path.splitext(os.path.basename(html_file_path))[0]
    with open(html_file_path, "r", encoding = "UTF-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    offers = []
    offer_panels = soup.select("a.posting-list-item")
    print(f"{len(offer_panels)} job offers found")
    for offer_panel in offer_panels:
        offer_data = scrape_single_offer(offer_panel, job)
        # Skip empty results so the two printed counts can differ if the
        # single-offer scraper ever returns nothing for a panel.
        if offer_data:
            offers.append(offer_data)
    print(f"{len(offers)} job offers decoded")
    return offers

In [10]:
# Scrape every saved listing page under ../data/raw and export the combined
# offers to a dated, semicolon-separated CSV under ../data/interim.
# Paths are built with os.path.join so the cell is not tied to Windows
# backslash separators.
raw_pattern = os.path.join("..", "data", "raw", "*.html")

offers = []
# glob returns files in arbitrary order; sort for a reproducible row order.
for html_file in sorted(glob.glob(raw_pattern)):
    print(f"{html_file}")
    offers += scrape_offers(html_file)
    print("")

# Nested "location"/"salary" dicts become location_city, salary_low, ... columns.
df = pd.json_normalize(offers, sep="_")
date_part = datetime.today().strftime("%Y_%m_%d")
interim_dir = os.path.join("..", "data", "interim")
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(interim_dir, exist_ok=True)
file_path = os.path.join(interim_dir, f"job_offers_{date_part}.csv")
df.to_csv(file_path, sep=";", encoding = "UTF-8", index=False)
print(f"Data frame saved to {file_path}")

..\data\raw\data analyst.html
20 job offers found
20 job offers decoded

..\data\raw\data engineer.html
20 job offers found
20 job offers decoded

..\data\raw\data scientist.html
20 job offers found
20 job offers decoded

Data frame saved to ..\data\interim\job_offers_2024_08_05.csv
