# Crawling Apec webpage (www.apec.fr) with selenium

In [1]:
# importing modules:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [2]:
# Creating the the executable link path
link_path = Service('/Users/elhadji/Desktop/Python_Labs/chromedriver')
# Creating the “driver”
driver =  webdriver.Chrome(service=link_path)

## Create a function to generate a list urls of webpages

In [3]:
# define a function to build the url
def build_ful_apecurl(keyword_list, pages_numb):
    # Add %20 join strings in keywordlist
    str_keyword = '%20'.join(keyword_list)
    # set apec web page as variable
    root_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles="
    # url page
    url_page = '&page='
    # empty list store full url or each wbepage
    apec_url_list = []
    # full url by contenating in the following list
    for n in pages_numb:
        apec_url_list.append(root_url+str_keyword+url_page+str(n))
    return apec_url_list

In [4]:
# Keyword to seach : "data scientist" as list two items
keyword_list = ['data', 'scientist']
# page numbers list 
pages_numb = [n for n in range(5)]

In [5]:
# Build the list of five pages
my_apec_links = build_ful_apecurl(keyword_list, pages_numb)
# print to see 
my_apec_links

['https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data%20scientist&page=0',
 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data%20scientist&page=1',
 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data%20scientist&page=2',
 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data%20scientist&page=3',
 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data%20scientist&page=4']

In [6]:
# Extratct jobs title with driver.find_elements By.TAG_NAME()
def job_title_extractor_tagname(url_list, tag):
    """
    Take an url list and a tag and return all jobs the given webpage
    """
    all_jobs = []
    # loop thorought urls and extract job title by h2 tag
    for url in url_list:
        driver.get(url)
        all_h2 = driver.find_elements(By.TAG_NAME,tag)
        all_jobs += [element.text for element in all_h2]
    # return all_jobs
    return all_jobs
# Probem : It crawl any h2 element in the welpage including job titles 

In [7]:
# job title tag : h2
tag = "h2"
# get all job titles
job_titles = job_title_extractor_tagname(my_apec_links, tag)
# check the lenght
len(job_titles) 
# Include all h2 elements (job title + other section titles) 

95

In [8]:
# Extratct element with driver.find_elements by.CSS_SELECTOR() >  class_name
def job_title_extractor_class(url_list, class_name):
    """
    Take an url list and a class name to return all jobs from all webpages
    """
    all_jobs = []
    for url in url_list:
        driver.get(url)
        all_class = driver.find_elements(By.CSS_SELECTOR,class_name)
        all_jobs += [element.text for element in all_class]

    return all_jobs

In [9]:
# Class name valie take two as follow when using by.CSS_SELECTOR()
class_name = ".card-title.fs-16"
job_titles = job_title_extractor_class(my_apec_links, class_name) # better than TAG_NAME(), more specific !!!
# check the lenght
len(job_titles)

100

In [10]:
print(job_titles[3])

DATA SCIENTIST F/H


In [11]:
# find element by XPATH
def job_company_extractor(url_list, class_xpath):
    """
    Take an url and an Xpath to return all job company in a webpage
    """
    all_company = []
    for url in url_list:
        driver.get(url)
        all_comp = driver.find_elements(By.XPATH, class_xpath)
        all_company += [element.text for element in all_comp]
    #driver.quit()
    return all_company

In [12]:
# define company name xpath
class_xpath = "//*[@class='card-offer__company mb-10']" # company name
# get all the companies
job_company = job_company_extractor(my_apec_links, class_xpath)
# check the lenght
len(job_company)

100

In [13]:
print(job_company[3])

STUDIEL PARTICIPATIONS


In [14]:
# defien salary xpath 
xpath = '//ul[@class="details-offer"]' # one year salary
# get all the details
job_salary = job_company_extractor(my_apec_links,xpath)
# check the lenght
len(job_salary)

100

In [16]:
print(job_salary[3])

35 - 43 k€ brut annuel


In [21]:
# job description class name
class_name = ".card-offer__description.mb-15" # salary
job_descrition = job_title_extractor_class(my_apec_links, class_name) # using CSS_SELECTOR
# check the lenght
len(job_descrition)

100

In [18]:
print(job_descrition[3])

Dans le cadre d'une embauche, vous interviendrez en tant que data scientist. Vous analyserez de gros volumes de données de type série temporelle, développerez et déploierez des modèles prédictifs, et assurerez la communication avec le client. Vous contribuer également aux...


In [22]:
# Job type, location and date 
class_name = ".details-offer.important-list"
type_loc_date = job_title_extractor_class(my_apec_links, class_name) # using CSS_SELECTOR
# check the lenght
len(type_loc_date)

100

In [20]:
print(type_loc_date[3])

CDI
Blagnac - 31
16/11/2021


In [23]:
type_loc_date[3]

'CDI\nBlagnac - 31\n16/11/2021'

## Build csv

In [35]:
def build_apec_csv(keyword_list,class_tags,pages_numb):
    """
    Function that creates a dict with all data gathered from the website 
    """
    ################# code goes here #####################################################
    # builf uls bebpages list
    my_apec_links = build_ful_apecurl(keyword_list, pages_numb)
    
    # Four job items to extract
    job_titles = job_title_extractor_class(my_apec_links, class_tags[0]) # class name
    job_company = job_company_extractor(my_apec_links, class_tags[1]) # xpath
    job_salary = job_company_extractor(my_apec_links, class_tags[2]) # xpath
    job_descrition = job_title_extractor_class(my_apec_links, class_tags[3]) # class name
    type_loc_date = job_title_extractor_class(my_apec_links, class_tags[4]) # class_name
    
    # create a dictionaye to stucture the data crawled
    my_dict = {"Job title":job_titles, "Company":job_company, "Salary":job_salary, "Description":job_descrition, "Job infos":type_loc_date}
    
    # create a data frame
    data = pd.DataFrame(my_dict)
    
    # store data in csv file
    data.to_csv('/Users/elhadji/Desktop/Python_Labs/aspec_ds_jobs_offers.csv', sep=";")
    return None

In [36]:
# make keyword list
keyword_list = ['data', 'scientist']
# page numbers list 
pages_numb = [n for n in range(21)]
# class lists
class_tags = [".card-title.fs-16", "//*[@class='card-offer__company mb-10']",
              '//ul[@class="details-offer"]', ".card-offer__description.mb-15", ".details-offer.important-list"]

In [37]:
# let's build a apec offers data 
build_apec_csv(keyword_list,class_tags,pages_numb)

## exporting crawled data to csv file

In [59]:
df = pd.read_csv('/Users/elhadji/Desktop/Python_Labs/aspec_ds_jobs_offers.csv', sep=";")

In [53]:
df.head(15) # 15 firts offers

Unnamed: 0.1,Unnamed: 0,Job title,Company,Salary,Description,Job infos
0,0,Data Scientist F/H,FULL DATA MANAGEMENT,35 - 45 k€ brut annuel,"Dans le cadre de notre développement, nous rec...",CDI\nLille - 59\n16/11/2021
1,1,Data Scientist F/H,ROBERT WALTERS FRANCE,45 - 50 k€ brut annuel,"Notre client, groupe spécialisé dans l'assuran...",CDI\nToulon - 83\n10/11/2021
2,2,Data Scientist F/H,TALENTS RH,45 - 50 k€ brut annuel,"TALENTS RH, société de recrutement spécialisée...",CDI\nLille - 59\n18/11/2021
3,3,DATA SCIENTIST F/H,STUDIEL PARTICIPATIONS,35 - 43 k€ brut annuel,"Dans le cadre d'une embauche, vous interviendr...",CDI\nBlagnac - 31\n16/11/2021
4,4,Data Scientist F/H,REGIONSJOB,A négocier,"""Recrutez au delà des compétences."" PERSUADERS...",CDI\nLyon 01 - 69\n26/11/2021
5,5,DATA SCIENTIST F/H,PREM CANAGARADJA,A négocier,"D’une manière générale, vous serez en charge d...",CDI\nToulouse - 31\n19/11/2021
6,6,Data Scientist F/H,YSANCE,A négocier,"En tant que Data Scientist, vous contribuerez ...",CDI\nLevallois-Perret - 92\n21/11/2021
7,7,Data Scientist F/H,METEOJOB,A négocier,A la recherche de nouvelles affinités professi...,CDI\nÉcully - 69\n10/11/2021
8,8,Data Scientist F/H,METEOJOB,A négocier,Rattaché(e) à la Direction Générale Exécutive ...,CDI\nClichy - 92\n23/11/2021
9,9,Data Scientist F/H,SAS PROXIEL,A partir de 40 k€ brut annuel,Nous recherchons un Data Scientist (F/H) pour ...,CDI\nNice - 06\n09/11/2021
