In [5]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import requests
import pandas as pd
import asyncio
import aiohttp

In [5]:
def job_offers_wtj(
        job_title: str = "data analyst",
        pages: int = 1
):
    """
    ---
    Web scraping WTJ
    ---
    Lance un navigateur et créée un dataframe contenant les informations de
    chaque offre d'emploi pour un nombre de pages définis sur le site
    Welcome To The Jungle.
    ---
    Paramètres :
    - job_title: str : Le nom du métier pour lequel rechercher des offres.
    - pages: int : Le nombre de pages sur lesquels rechercher les offres.
    ---
    Retourne :
    - Le df contenant les informations de toute les offres récupérées.
    """
    # Instanciation de la liste contenant les liens pour les requêtes APIs.
    api_links = []
    # Lien de l'API de Welcome To The Jungle pour récupérer les données.
    api_link = f"https://api.welcometothejungle.com/api/v1/organizations"
    job = job_title.lower().replace(" ", "+")
    # Instanciation du driver Firefox.
    driver = webdriver.Firefox()
    # Instanciation du dataframe final.
    full_df = pd.DataFrame()
    # Nom des colonnes à garder dans le dataframe final.
    cols_to_keep = [
        "name",
        "salary_period",
        "experience_level",
        "apply_url",
        "contract_duration_min",
        "office.city",
        "office.address",
        "office.district",
        "office.latitude",
        "office.longitude",
        "office.zip_code",
        "profession.category.fr",
        "profession.name.fr"
        "name",
        "education_level",
        "application_fields.mode",
        "application_fields.name",
        "description",
        "organization.average_age",
        "organization.creation_year",
        "organization.default_language",
        "organization.description",
        "organization.industry",
        "organization.nb_employee",
        "contract_type",
        "salary_min",
        "salary_max",
        "education_level",
        "remote"
    ]
    try:
        for i in range(1, pages+1):
            url = f"https://www.welcometothejungle.com/fr/jobs?refinementList%5Boffices.country_code%5D%5B%5D=FR&query={job}&page={i}"
            # Ouvre chaque page sur le navigateur.
            driver.get(url)
            try:
                # Récupère le lien de chaque offre d'emploi sur la page.
                contents = WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".sc-6i2fyx-0.gIvJqh"))
                )
                for content in contents:
                    link = content.get_attribute("href")
                    end_link = re.findall(r"/companies(.+)", link)[0]
                    full_link = api_link + end_link
                    # Rajoute le lien de chaque offre à la liste.
                    api_links.append(full_link)
            except Exception as e:
                print(f"Error scraping page {i} : {e}")
    finally:
        driver.quit()
    # Pour chaque lien de la liste, fait une requête API et stocke les informations dans un dataframe.
    for link_ in api_links:
        r = requests.get(link_)
        df = pd.json_normalize(
            r.json()["job"]
        )
        full_df = pd.concat([full_df, df], ignore_index=True)
    # Instanciation de la liste des colonnes à drop.
    cols_to_drop = [col for col in full_df.columns if col not in cols_to_keep]
    df = full_df.drop(columns=cols_to_drop)
    return df, api_links

In [6]:
df, api_links = job_offers_wtj("data analyst", 1)

In [9]:
api_links[0]

'https://api.welcometothejungle.com/api/v1/organizations/margo/jobs/margo-analytics-data-engineer-h-f_paris?q=d29961294f3e8be5cef30329cc49055a&o=579036&p=true'

In [8]:
r = requests.get(api_links[0])
r.json()["job"]

{'recruitment_process': None,
 'status': 'published',
 'profession': {'category': {'cs': 'Tech',
   'en': 'Tech',
   'es': 'Tecnología',
   'fr': 'Tech',
   'po': 'Tech',
   'sk': 'Technológie'},
  'name': {'cs': 'Data Engineering',
   'en': 'Data Engineering',
   'es': 'Ingeniería de datos',
   'fr': 'Data Engineering',
   'po': 'Data Engineering',
   'sk': 'Data Engineering'}},
 'is_default': False,
 'contract_duration_min': None,
 'name': 'Margo Analytics - Data Engineer - H/F',
 'contract_type': 'FULL_TIME',
 'videos': [{'external_reference': '/M8Hd_oolT3I',
   'image': {'large': {'url': 'https://cdn-images.welcometothejungle.com/7QhWyhyMc0_IG67p9uZ1eDQmY3ElOLqUF4tNU9Svnek/rs:auto:1500::/q:85/czM6Ly93dHRqLXByb2R1Y3Rpb24vdXBsb2Fkcy92aWRlby9pbWFnZS80NTA0LzE2OTEwNS9tYXJnb192aWRlb193a3JQZDAxLnBuZw'},
    'medium': {'url': 'https://cdn-images.welcometothejungle.com/RkG58C8qk7MUPp_6WhEKHXa4cDypiOZhI550L9rt304/rs:auto:980::/q:85/czM6Ly93dHRqLXByb2R1Y3Rpb24vdXBsb2Fkcy92aWRlby9pbWFnZS80NTA0

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
def clean_html(text):
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text(separator=" ")
    cleaned_text = cleaned_text.replace("\xa0", " ")
    return cleaned_text

df["description"] = df["description"].apply(clean_html)
df["organization.description"] = df["organization.description"].apply(clean_html)

In [7]:
df.to_csv("WTT_offers.csv")

In [8]:
df = pd.read_csv("WTT_offers.csv")

In [9]:
df

Unnamed: 0,salary_max,salary_period,contract_type,published_at,name,education_level,updated_at,description,contract_duration_min,apply_url,...,office.district,office.latitude,office.longitude,office.zip_code,profession.category.fr,organization.average_age,organization.creation_year,organization.default_language,organization.description,organization.industry
0,,none,FULL_TIME,2023-11-24T15:39:05.887634Z,Margo Analytics - Data Engineer - H/F,BAC_5,2023-11-24T15:39:05.900748Z,Margo Analytics est l'entité experte de Margo...,,https://jobs.lever.co/margo-group/dfd990e5-a46...,...,Paris,48.8797,2.32381,75008.0,Tech,30.0,2005,fr,"Plus qu’un simple groupe de conseil IT, MARGO ...","Software, IT / Digital"
1,,none,INTERNSHIP,2023-12-19T14:02:47.012338Z,Stage - Data Analyst F/H,BAC_4,2023-12-19T14:02:47.012007Z,Vos mission “Cette mission vous donnera l’oppo...,6.0,http://app.mytalentplug.com/redirection.aspx?o...,...,Rhône,45.7514,4.87592,69003.0,Tech,44.0,1994,fr,Orange a besoin de vous pour conquérir de nouv...,"Connected Objects, Big Data, Electronics / Tel..."
2,,none,INTERNSHIP,2023-12-19T13:41:13.692013Z,Data analyst et business analyst - Stage,BAC_5,2023-12-19T13:41:13.701908Z,Véritable interface entre l’équipe technique e...,6.0,,...,Paris,48.87602,2.33944,75009.0,Tech,30.0,2017,fr,"Lancée en 2017 par une infirmière, libheros es...","SaaS / Cloud Services, Health, Home Care Servi..."
3,2000.0,monthly,INTERNSHIP,2023-12-19T11:00:56.681677Z,Data Analyst - Internship - Paris,BAC_4,2023-12-19T11:00:56.690318Z,About Stockly Stockly is a retail-tech company...,5.0,,...,Paris,48.86857,2.3457,75002.0,Tech,28.0,2018,fr,🔭 Stockly is a tech-retail company solving...,"Software, E-commerce"
4,,none,INTERNSHIP,2023-12-19T10:31:50.713030Z,Intern - Sales Operations and Data Analyst,,2023-12-19T10:37:19.080261Z,Mission You will be part of the Operations tea...,6.0,https://careers.gitguardian.com/jobs/3461341-i...,...,Paris,48.8664,2.34351,75002.0,Support,33.0,2017,en,GitGuardian is a global post-series B cybersec...,"IT / Digital, SaaS / Cloud Services, Cyber Sec..."
5,,none,INTERNSHIP,2023-12-19T10:34:56.955436Z,Stagiaire Data Analyst (H/F) en Stage,,2023-12-19T10:34:56.974415Z,Dans le cadre de sa campagne de stage AXA recr...,,https://recrutement.axa.fr/nos-offres-emploi/2...,...,Hauts-de-Seine,48.896581,2.223766,92727.0,Tech,41.0,1985,fr,Avec 6 000 recrutements par an en France rejoi...,"Banking, Insurance, FinTech / InsurTech"
6,,none,FULL_TIME,2023-12-19T10:38:57.473651Z,Data Privacy Analyst F/H,,2023-12-19T10:38:57.491705Z,"Data Privacy Specialist F/H Paris, France Le c...",,https://recrutement.axa.fr/nos-offres-emploi/2...,...,Paris,48.87513,2.33789,75009.0,Support,41.0,1985,fr,Avec 6 000 recrutements par an en France rejoi...,"Banking, Insurance, FinTech / InsurTech"
7,,none,FULL_TIME,2023-12-19T10:37:34.871011Z,Investment & Finance Data Analyst,,2023-12-19T10:37:34.880402Z,AXA IM est un gestionnaire d'actifs internatio...,,https://recrutement.axa.fr/nos-offres-emploi/2...,...,Hauts-de-Seine,48.87725,2.24416,92400.0,Audit / Finance / Assurance,41.0,1985,fr,Avec 6 000 recrutements par an en France rejoi...,"Banking, Insurance, FinTech / InsurTech"
8,,none,FULL_TIME,2023-12-19T10:38:46.735385Z,Data Analyst (F/H),BAC_5,2023-12-19T10:38:46.752093Z,Dans un contexte multi-sites avec plus de 4 mi...,,https://recrutement.axa.fr/nos-offres-emploi/2...,...,Hauts-de-Seine,48.869798,2.219033,,Relation client,41.0,1985,fr,Avec 6 000 recrutements par an en France rejoi...,"Banking, Insurance, FinTech / InsurTech"
9,,none,INTERNSHIP,2023-12-19T10:38:33.102804Z,Data Analyst Transformation (H/F) en stage,,2023-12-19T10:38:33.111629Z,Dans le cadre de sa campagne de stage AXA recr...,,https://recrutement.axa.fr/nos-offres-emploi/2...,...,Hauts-de-Seine,48.896581,2.223766,92727.0,Tech,41.0,1985,fr,Avec 6 000 recrutements par an en France rejoi...,"Banking, Insurance, FinTech / InsurTech"
