In [0]:
%pip install beautifulsoup4

Collecting beautifulsoup4
  Obtaining dependency information for beautifulsoup4 from https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl.metadata
  Downloading beautifulsoup4-4.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Obtaining dependency information for soupsieve>1.2 from https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl.metadata
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.5-py3-none-any.whl (105 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m102.4/105.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0

In [0]:
from bs4 import BeautifulSoup
import requests
import csv
import time
from datetime import datetime

In [0]:
# Lista de comunidades autónomas de España
comunidades = [
    "Andalucía", "Aragon", "Asturias", "Baleares", "Las Palmas, Canarias, España",
    "Cantabria", "Castilla-La Mancha", "Castilla y León", "Barcelona",
    "Comunidad Valenciana", "Extremadura", "Galicia", "Madrid",
    "Murcia", "Navarra", "Euskadi", "La Rioja, La Rioja, España"
]

comunidades



['Andalucía',
 'Aragon',
 'Asturias',
 'Baleares',
 'Las Palmas, Canarias, España',
 'Cantabria',
 'Castilla-La Mancha',
 'Castilla y León',
 'Barcelona',
 'Comunidad Valenciana',
 'Extremadura',
 'Galicia',
 'Madrid',
 'Murcia',
 'Navarra',
 'Euskadi',
 'La Rioja, La Rioja, España']

In [0]:
def retrieve_job_urls(base_url, max_retries=3):
    job_urls = []
    start = 0

    while True:
        url = f"{base_url}&start={start}"
        print(f"🔎 Scraping page: {url} ...")

        retries = 0
        response = None

        while retries < max_retries:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    break
                else:
                    print(f"⚠️ Error {response.status_code}, reintento {retries+1}/{max_retries}")
            except requests.RequestException as e:
                print(f"⚠️ Error de red: {e}, reintento {retries+1}/{max_retries}")

            retries += 1
            time.sleep(2)

        if not response or response.status_code != 200:
            print(f"❌ No se pudo acceder a {url} después de {max_retries} intentos.")
            break

        html = response.text.strip()
        if not html:
            print("✅ No hay más resultados, paro aquí.")
            break

        soup = BeautifulSoup(html, "html.parser")
        job_url_elements = soup.select("a.base-card__full-link")

        if not job_url_elements:
            print("✅ No se encontraron más ofertas en esta página.")
            break

        for job_url_element in job_url_elements:
            job_url = job_url_element["href"]
            job_urls.append(job_url)

        start += 10
        time.sleep(1)

    return list(set(job_urls))

In [0]:
def scrape_job(job_url, max_retries=3):
    retries = 0
    response = None

    while retries < max_retries:
        try:
            response = requests.get(job_url, timeout=10)
            if response.status_code == 200:
                break
            else:
                print(f"⚠️ Error {response.status_code} en {job_url}, reintento {retries+1}/{max_retries}")
        except requests.RequestException as e:
            print(f"⚠️ Error de red en {job_url}: {e}, reintento {retries+1}/{max_retries}")

        retries += 1
        time.sleep(2)

    if not response or response.status_code != 200:
        print(f"❌ No se pudo acceder a {job_url} después de {max_retries} intentos.")
        return None

    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    title_element = soup.select_one("h1")
    title = title_element.get_text().strip() if title_element else None

    company_element = soup.select_one('[data-tracking-control-name="public_jobs_topcard-org-name"]')
    company_name = company_element.get_text().strip() if company_element else None
    company_url = company_element["href"] if company_element and company_element.has_attr("href") else None

    location_element = soup.select_one(".topcard__flavor--bullet")
    location = location_element.get_text().strip() if location_element else None

    applicants_element = soup.select_one(".num-applicants__caption")
    applicants = applicants_element.get_text().strip() if applicants_element else None

    salary_element = soup.select_one(".salary")
    salary = salary_element.get_text().strip() if salary_element else None

    description_element = soup.select_one(".description__text .show-more-less-html")
    description = description_element.get_text().strip() if description_element else None

    # Seleccionamos todos los <li> que contienen los criterios
    criteria_items = soup.select(".description__job-criteria-item")
    criteria_dict = {}

    # Mapeo de campos
    field_map = {
        "Seniority level": "Nivel de antigüedad",
        "Employment type": "Tipo de empleo",
        "Job function": "Función laboral",
        "Industries": "Sectores",
    }

    for item in criteria_items:
        name_element = item.select_one(".description__job-criteria-subheader")
        value_element = item.select_one(".description__job-criteria-text--criteria")
        if name_element and value_element:
            raw_key = name_element.get_text().strip()
            value = value_element.get_text().strip()

            # Normalizamos usando el map
            key = field_map.get(raw_key, raw_key)
            criteria_dict[key] = value
            print(f"🔍 {key}: {value}")

    # Ahora criteria_dict tendrá todos los campos

    # Construcción final del job
    job = {
        "url": job_url,
        "title": title,
        "company_name": company_name,
        "company_url": company_url,
        "location": location,
        "applications": applicants,
        "salary": salary,
        "description": description,
        "Nivel de antigüedad": criteria_dict.get("Nivel de antigüedad"),
        "Tipo de empleo": criteria_dict.get("Tipo de empleo"),
        "Función laboral": criteria_dict.get("Función laboral"),
        "Sectores": criteria_dict.get("Sectores")
    }
    return job


In [0]:
# Lista de comunidades autónomas de España
comunidades = [
    "Andalucía", "Aragon", "Asturias", "Baleares", "Las Palmas, Canarias, España",
    "Cantabria", "Castilla-La Mancha", "Castilla y León", "Barcelona",
    "Comunidad Valenciana", "Extremadura", "Galicia", "Madrid",
    "Murcia", "Navarra", "Euskadi", "La Rioja, La Rioja, España"
]

# comunidades = [
#     "Murcia", "Navarra",
# ]


In [0]:
from urllib.parse import quote, unquote
tipo_publicacion = dbutils.widgets.get("tipo_publicacion")
empleo = dbutils.widgets.get("empleo")
empleo = quote(empleo)

print(tipo_publicacion)
print(empleo)

In [0]:
all_jobs = []


for comunidad in comunidades:
    print(f"\n🌍 Buscando ofertas en {comunidad}...\n")
    # f_TPR=r2592000 Mensual
    # f_TPR=r86400 Diario

    base_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={empleo}&location={comunidad}&f_TPR={tipo_publicacion}&trk=public_jobs_jobs-search-bar_search-submit"
    job_urls = retrieve_job_urls(base_url)
    
    print(f"✅ {len(job_urls)} ofertas encontradas en {comunidad}\n")

    for job_url in job_urls:
        print(f"➡️ Scraping {job_url}")
        job = scrape_job(job_url)
        if job:
            all_jobs.append(job)
        print("✅ Job scraped\n")

# Guardar en CSV
today = datetime.now().strftime("%Y-%m-%d")
file_name = f"jobs_{today}.csv"

# Definimos todos los fieldnames, incluyendo los campos de criterios
fieldnames = [
    "url", "title", "company_name", "company_url", "location", "applications", "salary", "description",
    "Nivel de antigüedad", "Tipo de empleo", "Función laboral", "Sectores"
]

with open(file_name, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for job in all_jobs:
        # Aseguramos que todos los criterios estén presentes en el diccionario, aunque falten
        for key in ['Nivel de antigüedad', 'Sectores', 'Función laboral', 'Tipo de empleo']:
            if key not in job:
                job[key] = None
        writer.writerow(job)

print(f"🎉 {len(all_jobs)} ofertas guardadas en {file_name}\n")



🌍 Buscando ofertas en Andalucía...

🔎 Scraping page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Andalucía&f_TPR=r86400&trk=public_jobs_jobs-search-bar_search-submit&start=0 ...
🔎 Scraping page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Andalucía&f_TPR=r86400&trk=public_jobs_jobs-search-bar_search-submit&start=10 ...
🔎 Scraping page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Andalucía&f_TPR=r86400&trk=public_jobs_jobs-search-bar_search-submit&start=20 ...
🔎 Scraping page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Andalucía&f_TPR=r86400&trk=public_jobs_jobs-search-bar_search-submit&start=30 ...
🔎 Scraping page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Andalucía&f_TPR=r8640

In [0]:
spark.conf.set("fs.azure.account.key.masterhxc001sta.dfs.core.windows.net", "EiVxnncfGqoVlo/NFjFZkPk3CsXHwpfNlgynSLp7zOFbvnc6PBeReMsJi+YOEqsUyTKVUXwe9HrX+AStcTyh/w==")

In [0]:
import pandas as pd
import pyspark.pandas as ps
from datetime import datetime

# Suponiendo que ya tienes la lista all_jobs
# Convertimos a DataFrame de Pandas
df = pd.DataFrame(all_jobs)

# Convertimos a Spark DataFrame
spark_df = ps.from_pandas(df).to_spark()

# Nombre dinámico para el output
today = datetime.now().strftime("%Y-%m-%d")
output_path = f"abfss://ofertas-empleo@masterhxc001sta.dfs.core.windows.net/Staging/jobs_{today}/"

# Guardar directamente en Blob Storage en formato CSV
(spark_df
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(output_path)
)

print(f"✅ Datos guardados en {output_path}")


✅ Datos guardados en abfss://ofertas-empleo@masterhxc001sta.dfs.core.windows.net/Staging/jobs_2025-09-08/
