In [3]:
import re
from urllib.parse import quote_plus

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
def job_offers_wtj(
        job_title: str = "data analyst",
        pages: int = 1
) -> pd.DataFrame:
    """
    ---
    Web scraping WTJ
    ---
    Opens a Firefox browser, collects the job-offer links listed on the
    Welcome To The Jungle search result pages, then queries the site's API
    once per offer and gathers the answers in a single dataframe.
    ---
    Parameters:
    - job_title: str : Job title to search offers for.
    - pages: int : Number of search result pages to go through.
    ---
    Returns:
    - A dataframe with one row per retrieved offer, restricted to the
      columns listed in `cols_to_keep`. It is empty when no offer could be
      retrieved.
    ---
    Notes:
    - A page or an offer that fails is reported with `print` and skipped,
      so the offers already collected are not lost.
    """
    # Base URL of the Welcome To The Jungle API used to fetch offer details.
    api_link = "https://api.welcometothejungle.com/api/v1/organizations"
    # URL-encode the search terms (spaces become "+", other reserved
    # characters are percent-escaped) so any job title yields a valid query.
    job = quote_plus(job_title.lower())
    # Columns kept in the final dataframe.
    cols_to_keep = [
        "name",
        "salary_period",
        "experience_level",
        "apply_url",
        "contract_duration_min",
        "office.city",
        "office.address",
        "office.district",
        "office.latitude",
        "office.longitude",
        "office.zip_code",
        "profession.category.fr",
        "profession.name.fr",
        "education_level",
        "application_fields.mode",
        "application_fields.name",
        "description",
        "organization.average_age",
        "organization.creation_year",
        "organization.default_language",
        "organization.description",
        "organization.industry",
        "organization.nb_employee",
        "contract_type",
        "salary_min",
        "salary_max",
        "remote"
    ]
    # API links of every offer found on the visited pages.
    api_links = []
    driver = webdriver.Firefox()
    try:
        for i in range(1, pages + 1):
            url = f"https://www.welcometothejungle.com/fr/jobs?refinementList%5Boffices.country_code%5D%5B%5D=FR&query={job}&page={i}"
            # Open the search result page in the browser.
            driver.get(url)
            try:
                # Wait (up to 20 s) for the offer cards, then read their links.
                # NOTE(review): the selector looks like a generated class
                # name and may change when the site is redeployed - confirm.
                contents = WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".sc-6i2fyx-0.gIvJqh"))
                )
                for content in contents:
                    link = content.get_attribute("href")
                    # Skip only the cards without a usable "/companies..."
                    # href instead of giving up on the rest of the page.
                    match = re.search(r"/companies(.+)", link or "")
                    if match is None:
                        continue
                    api_links.append(api_link + match.group(1))
            except Exception as e:
                print(f"Error scraping page {i} : {e}")
    finally:
        # Always close the browser, even if scraping failed.
        driver.quit()

    # One API request per offer; each answer becomes a one-row dataframe.
    frames = []
    for link_ in api_links:
        try:
            # The timeout prevents one unresponsive request from hanging the run.
            r = requests.get(link_, timeout=30)
            r.raise_for_status()
            frames.append(pd.json_normalize(r.json()["job"]))
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f"Error fetching offer {link_} : {e}")
    if not frames:
        return pd.DataFrame()
    # Concatenate once rather than growing the dataframe inside the loop.
    full_df = pd.concat(frames, ignore_index=True)
    kept = [col for col in full_df.columns if col in cols_to_keep]
    return full_df[kept]

In [5]:
# Scrape the first result page of "data analyst" offers.
df = job_offers_wtj(job_title="data analyst", pages=1)

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
def clean_html(text):
    """Strip the HTML markup from ``text`` and return the plain text.

    Parameters
    ----------
    text : str
        HTML fragment to clean. Missing values (``None`` or float NaN) are
        accepted and returned unchanged.

    Returns
    -------
    str
        The text content of the fragment, with the pieces found in different
        tags joined by a single space and non-breaking spaces (``\\xa0``)
        replaced by regular spaces. Missing values are returned as-is.
    """
    # Missing cells are not markup: pass them through untouched instead of
    # handing them to the parser. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return text
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text(separator=" ")
    return cleaned_text.replace("\xa0", " ")

# Strip the HTML markup from both free-text columns.
for html_col in ("description", "organization.description"):
    df[html_col] = df[html_col].apply(clean_html)

In [7]:
# Save the offers. The positional index carries no information, and writing it
# adds an "Unnamed: 0" column when the file is read back, so it is left out.
df.to_csv("WTT_offers.csv", index=False)

In [8]:
# Show the offers dataframe as the cell output.
# NOTE(review): the saved output contains an "Unnamed: 0" column that none of
# the visible cells creates - presumably `df` was reloaded from the CSV in a
# cell that is not shown; confirm by re-running the notebook from a fresh kernel.
df

Unnamed: 0,contract_type,name,description,salary_max,apply_url,salary_period,experience_level,salary_min,education_level,remote,...,organization.creation_year,organization.default_language,organization.description,organization.industry,office.address,office.city,office.district,office.latitude,office.longitude,office.zip_code
0,FULL_TIME,Margo Analytics - Data Engineer - H/F,Margo Analytics est l'entité experte de Margo...,,https://jobs.lever.co/margo-group/dfd990e5-a46...,none,,,BAC_5,partial,...,2005,fr,"Plus qu’un simple groupe de conseil IT, MARGO ...","Software, IT / Digital","1, Rue de Saint-Pétersbourg, Paris",Paris,Paris,48.8797,2.32381,75008.0
1,FULL_TIME,Manager - Data & Analytics Engineering,The Data team at Welcome to the Jungle: Part o...,,,none,,,,fulltime,...,2015,en,Welcome to the Jungle’s mission is simple: to ...,"Media, Recruitment","24, Rue du Mail, Paris",Paris,Paris,48.86695,2.34314,75002.0
2,INTERNSHIP,Stage - Data Analyst sur le SIRH F/H,Votre profil “Nous serons heureux de t’accueil...,1600.0,http://app.mytalentplug.com/redirection.aspx?o...,monthly,LESS_THAN_6_MONTHS,600.0,BAC_2,no,...,1994,fr,Orange a besoin de vous pour conquérir de nouv...,"Connected Objects, Big Data, Electronics / Tel...",,Cesson-Sévigné,Ille-et-Vilaine,48.123778,-1.605273,35510.0
3,FULL_TIME,Data Analyst @eXalt Lille,eXalt Lille recherche son/sa nouveau/elle Dat...,40.0,,yearly,3_TO_4_YEARS,37.0,BAC_5,punctual,...,2018,fr,"eXalt est une formidable aventure humaine, une...",IT / Digital,"19, Rue d'Amiens, Lille",Lille,Nord,50.63388,3.06418,59800.0
4,FULL_TIME,Senior Data Analyst,À propos des Data Analysts 🦸 Rattaché à l'équi...,,https://careers.ornikar.com/jobs/3317886-senio...,none,,,,partial,...,2013,fr,Ornikar’s team dedicates its mission to transf...,"Mobility, FinTech / InsurTech, EdTech",WeWork Boulevard de la Villette,Paris,Paris,48.88436,2.36684,75019.0
5,FULL_TIME,Senior Data Analyst - Assurance,À propos de l'équipe Data 🦸 Les Data Analysts ...,,https://careers.ornikar.com/jobs/3187506-senio...,none,5_TO_7_YEARS,,,partial,...,2013,fr,Ornikar’s team dedicates its mission to transf...,"Mobility, FinTech / InsurTech, EdTech",WeWork Boulevard de la Villette,Paris,Paris,48.88436,2.36684,75019.0
6,INTERNSHIP,Data Analyst - Internship - Paris,About Stockly Stockly is a retail-tech company...,2000.0,,monthly,,1500.0,BAC_4,punctual,...,2018,fr,🔭 Stockly is a tech-retail company solving...,"Software, E-commerce","8, Rue du Sentier, Paris",Paris,Paris,48.86857,2.3457,75002.0
7,INTERNSHIP,Data Analyst - Stage - Paris,Travailler chez papernest : définition. Cer...,,https://careers.papernest.com/jobs/3240543-dat...,none,,,,partial,...,2015,en,We are papernest Our ambition? To be the one a...,"Mobile Apps, Commercial Real Estate, Residenti...",157 boulevard Macdonald,Paris,Paris,48.898163,2.378094,75019.0
8,FULL_TIME,Data Analyst H/F,En lien avec l’augmentation de son portefeuill...,42000.0,,yearly,3_TO_4_YEARS,37000.0,BAC_5,punctual,...,2008,fr,Plateforme Saas de sécurisation des risques B2...,SaaS / Cloud Services,"20, Boulevard Eugène Deruelle, Lyon",Lyon,Rhône,45.76282,4.85396,69003.0
9,FULL_TIME,Senior ESG Data Scientist/Analyst,"En forte phase de croissance, WeeFin renforce ...",,,none,,,,partial,...,2018,fr,🚀 WeeFin est une fintech à impact créée en 201...,"Strategy, Change Management, FinTech / InsurTech","45, Rue des Petites Écuries, Paris",Paris,Paris,48.87384,2.34963,75010.0
