

Installation et import des librairies


In [17]:
import requests
import time
import random
#import json
import math
import pandas as pd
from bs4 import BeautifulSoup

Initialisation

In [31]:
# Seed the pseudo-random generator from system entropy (used to rotate proxies and headers).
random.seed()

# --- Pagination and throttling ------------------------------------------------
JOB_PER_BATCH = 25        # number of job offers returned per result page
SLEEPING_DELAY = 1        # pause (seconds) before each request, to avoid HTTP 429 Too Many Requests
HARD_SLEEPING_DELAY = 30  # extra pause (seconds) after an HTTP 429 response

# --- Search endpoints ---------------------------------------------------------
# The guest API cannot return more than 1000 offers (response 400 once start >= 1000).
# The plan is therefore to split the search by city, using only the last 24h for
# Paris and the last week for the other cities.
count_url = (
    'https://fr.linkedin.com/jobs/search'
    '?keywords={}&location={}&f_TPR={}'
    '&distance=10&position=1&pageNum=0'
)
base_url = (
    'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search'
    '?keywords={}&location={}&sortBy=DD&f_TPR={}'
    '&distance=10&position=1&pageNum=0&start={}'
)

# --- Search criteria ----------------------------------------------------------
KEYWORDS = 'data'   # job search keywords
LOCATION = 'Paris'  # job location
LOCATION_LIST = [
    'Paris', 'Marseille', 'Lyon', 'Toulouse', 'Nice', 'Nantes', 'Montpellier',
    'Strasbourg', 'Bordeaux', 'Lille', 'Rennes', 'Reims', 'Toulon',
    'Saint-Étienne', 'Le Havre', 'Grenoble', 'Dijon', 'Angers', 'Nîmes',
    'Clermont-Ferrand', 'Aix-en-Provence', 'Le Mans', 'Brest', 'Tours',
    'Amiens', 'Limoges', 'Annecy',
]
TPR = 'r86400'  # r86400 = last 24h, r604800 = last week, r2592000 = last month
sortBy = 'DD'   # DD = sort by date, R = sort by relevance

# --- Progress counters (incremented by the fetch helpers) ---------------------
ct_batch, ct = 0, 0

Chargement des proxies et des headers

In [32]:
# Read the proxy list (semicolon-separated file with `ip` and `port` columns),
# forcing the port to text so it can be concatenated with the address.
df_proxies = pd.read_csv(
    '/workspaces/codespaces-jupyter/data/linkedin_job_analysis/proxies.csv',
    sep=';',
).astype({'port': 'str'})

# "<ip>:<port>" for each row, then the full proxy URL handed to requests.
df_proxies['url'] = df_proxies['ip'] + ":" + df_proxies['port']
proxies = ('https://' + df_proxies['url']).tolist()

In [20]:
# Browser User-Agent strings rotated between requests.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
)

# One header dict per User-Agent, ready to pass as `headers=` to requests.
headers = [{'User-Agent': agent} for agent in _USER_AGENTS]

In [43]:
# Query the public search page once to learn how many offers match, then derive
# the number of result pages to fetch.
# NOTE(review): only an 'http' proxy entry is supplied while the target URL is
# https; requests selects the proxy from the URL scheme, so confirm that the
# proxy is actually used.
proxy = {'http': random.choice(proxies)}
header = random.choice(headers)
first_url = count_url.format(KEYWORDS, LOCATION, TPR)
# A timeout prevents the cell from hanging forever on an unresponsive server.
response = requests.get(first_url, proxies=proxy, headers=header, timeout=30)
html = BeautifulSoup(response.text, 'html.parser')

# Default to 0 so that the cells below still run (and fetch nothing) when the
# counter cannot be read, instead of failing later with a NameError.
job_count = 0
try:
  count_text = html.find('span', {'class': 'results-context-header__job-count'}).text
  # Keep digits only so that decorated counters such as "1,000+" still parse.
  job_count = int(''.join(ch for ch in count_text if ch.isdigit()))
except (AttributeError, ValueError) as e:
  # AttributeError: the counter span is absent; ValueError: it holds no digit.
  print('Exception : ', e)
batch_count = math.ceil(job_count / JOB_PER_BATCH)

print(f'Sur l\'url {first_url}')
print(f'Nombre d\'offre à récupérer : {job_count}')

Sur l'url https://fr.linkedin.com/jobs/search?keywords=data&location=Paris&f_TPR=r86400&distance=10&position=1&pageNum=0
Nombre d'offre à récupérer : 85


In [44]:
def get_job_batch_html(start, max_retries=10, timeout=30):
  """
    Fetch one page of job offers (JOB_PER_BATCH cards) as parsed HTML.

    Parameters:
      start       -- offset of the first offer of the page in the result list
      max_retries -- maximum number of attempts before giving up
      timeout     -- per-request timeout in seconds

    Returns the BeautifulSoup document of the page. When every attempt fails
    (network error, repeated 429, or a permanent error such as the 400 returned
    once start >= 1000) an empty document is returned, so callers simply find
    no job card in it instead of waiting forever.

    Side effect: increments the global counter ct_batch on success.
  """
  global ct_batch

  url = base_url.format(KEYWORDS, LOCATION, TPR, start)

  for _ in range(max_retries):
    # Throttle every attempt to stay under the rate limit.
    time.sleep(SLEEPING_DELAY)
    # NOTE(review): only an 'http' proxy entry is supplied while the URL is
    # https; requests selects the proxy from the URL scheme, so confirm that
    # the proxy is actually used.
    proxy = {'http': random.choice(proxies)}
    header = random.choice(headers)

    try:
      response = requests.get(url, proxies=proxy, headers=header, timeout=timeout)
    except requests.RequestException as e:
      # Network-level failure (connection, timeout, bad proxy...): try again.
      print('error : ', e)
      continue

    if response.status_code == 200:
      ct_batch += 1
      print("Batch done : " + str(ct_batch))
      return BeautifulSoup(response.text, 'html.parser')

    if response.status_code == 429:
      # Rate limited: back off for longer before the next attempt.
      print("Got response status 429")
      time.sleep(HARD_SLEEPING_DELAY)
    else:
      print("Nothing with code "+ str(response.status_code))

  print("Giving up on " + url + " after " + str(max_retries) + " attempts")
  return BeautifulSoup('', 'html.parser')

def get_job_dict(html):
  """
    Extract the job offers contained in one raw HTML result page.

    Parameters:
      html -- parsed page exposing find_all(); each job card is a div carrying
              a data-entity-urn attribute.

    Returns a column-oriented dict (one list per field, all of equal length)
    so that it converts directly to a DataFrame. Fields: id, title, company,
    url, location, date, description. The description is always "" here: it is
    not present on the result page and must be fetched per offer afterwards.

    A missing title, company, location or date is stored as "" (with a message)
    rather than aborting the whole page. A card without a link is skipped,
    since its detail page could not be fetched afterwards.
  """

  job_dict = {'id':[], 'title':[], 'company':[], 'url':[], 'location':[], 'date':[], 'description':[]}

  def text_of(node, field, job_ref):
    # Stripped text of an element, or "" (with a message) when it is absent.
    if node is None:
      print('Missing ' + field + ' for job :' + job_ref)
      return ""
    return node.text.strip()

  for job in html.find_all('div', {'data-entity-urn': True}):
    # The URN looks like "urn:li:jobPosting:<id>"; the id is its last segment.
    job_id = job['data-entity-urn'].split(":")[-1]

    link = job.find('a', {'class': 'base-card__full-link'})
    try:
      job_url = link['href']
    except (TypeError, KeyError):
      # TypeError: no link element; KeyError: link without href.
      print('Missing url for job :' + job_id)
      continue

    title = text_of(job.find('h3'), 'title', job_url)
    company = text_of(job.find('a', {'class': 'hidden-nested-link'}), 'company', job_url)
    location = text_of(job.find('span', {'class': 'job-search-card__location'}), 'location', job_url)

    try:
      date = job.find('time')['datetime']
    except (TypeError, KeyError):
      # TypeError: no <time> element; KeyError: no datetime attribute.
      date = ""
      print('Missing date for job :' + job_url)

    job_dict['id'].append(job_id)
    job_dict['title'].append(title)
    job_dict['company'].append(company)
    job_dict['url'].append(job_url)
    job_dict['location'].append(location)
    job_dict['date'].append(date)
    job_dict['description'].append("")

  return job_dict

def get_job_details_html(url, max_retries=10, timeout=30):
  """
    Fetch the raw HTML of one job offer detail page.

    Parameters:
      url         -- address of the job offer page
      max_retries -- maximum number of attempts before giving up
      timeout     -- per-request timeout in seconds

    Returns the BeautifulSoup document of the page. When every attempt fails
    (network error, repeated 429, or a permanent status such as 404) an empty
    document is returned instead of retrying forever.

    Side effect: increments the global counter ct on success.
  """
  global ct

  for _ in range(max_retries):
    # Throttle every attempt to stay under the rate limit.
    time.sleep(SLEEPING_DELAY)
    # NOTE(review): only an 'http' proxy entry is supplied; requests selects
    # the proxy from the URL scheme, so confirm it is used for https URLs.
    proxy = {'http': random.choice(proxies)}
    header = random.choice(headers)

    try:
      response = requests.get(url, proxies=proxy, headers=header, timeout=timeout)
    except requests.RequestException as e:
      # Network-level failure or malformed URL: try again.
      print('Exception : ', e)
      continue

    if response.status_code == 200:
      ct += 1
      print("Job done : " + str(ct))
      return BeautifulSoup(response.text, 'html.parser')

    if response.status_code == 429:
      # Rate limited: back off for longer before the next attempt.
      print("Got response status 429")
      time.sleep(HARD_SLEEPING_DELAY)
    else:
      # Same message as the batch fetcher, so unexpected statuses are visible.
      print("Nothing with code "+ str(response.status_code))

  print("Giving up on " + str(url) + " after " + str(max_retries) + " attempts")
  return BeautifulSoup('', 'html.parser')

def get_job_description(html):
  """
    Pull the description text out of the raw HTML of a job offer page.

    Looks up the div with class show-more-less-html__markup and returns its
    text with newlines between text fragments. When that lookup result has no
    get_text (typically None because the div is absent), "No description" is
    printed and the lookup result is returned unchanged.
  """
  markup = html.find('div', {'class': 'show-more-less-html__markup'})
  try:
    text = markup.get_text(separator="\n")
  except AttributeError:
    print("No description")
    return markup
  return text

In [45]:
def retrieve_n_batch(ct_batch):
  """
    Fetch ct_batch consecutive result pages and merge their job offers.

    Page number i starts at offset i * JOB_PER_BATCH. Returns one
    column-oriented dict (id, title, company, url, location, date,
    description) holding the offers of all pages in fetch order.
  """
  columns = ('id', 'title', 'company', 'url', 'location', 'date', 'description')
  merged = {column: [] for column in columns}

  for page_index in range(ct_batch):
    page_html = get_job_batch_html(page_index * JOB_PER_BATCH)
    page_jobs = get_job_dict(page_html)

    # Append this page's values to each column.
    for column in columns:
      merged[column] += page_jobs[column]

  return merged

def retrieve_description(job_dict):
  """
    Fill job_dict['description'] in place, one detail-page request per offer.

    For each url in job_dict['url'], the detail page is downloaded and its
    description stored at the same position. Returns None.
  """
  for position in range(len(job_dict['url'])):
    detail_page = get_job_details_html(job_dict['url'][position])
    job_dict['description'][position] = get_job_description(detail_page)

In [46]:
# Fetch batch_count result pages and merge their offers into one column-oriented dict.
jobs = retrieve_n_batch(batch_count)

Batch done : 1
Batch done : 2
Batch done : 3


In [52]:
#with open("/workspaces/codespaces-jupyter/data/linkedin_job_analysis/part_1_job_list.json", "w") as outfile:
#    json.dump(jobs, outfile)

In [None]:
# Fill jobs['description'] in place; this issues one request per offer url.
retrieve_description(jobs)

In [49]:
# Convert the column-oriented dict into a DataFrame (one row per offer) and preview the first rows.
df_job = pd.DataFrame.from_dict(jobs)
df_job.head()

Unnamed: 0,id,title,company,url,location,date,description
0,3520314506,Analyste de données Service Clients (F/H),Louis Vuitton,https://fr.linkedin.com/jobs/view/analyste-de-...,"Paris, Île-de-France, France",2023-03-12,"\n\nAbout The Job\nDepuis plus de 150 ans, les..."
1,3516932318,Data Analyst,Jobs via eFinancialCareers,https://fr.linkedin.com/jobs/view/data-analyst...,"Paris, Île-de-France, France",2023-03-12,\n Vos responsabilités comprendront not...
2,3524834557,Data Analyst / Data Scientist F/H en Alternance,Carrefour,https://fr.linkedin.com/jobs/view/data-analyst...,"Massy, Île-de-France, France",2023-03-12,\n\nA propos de nous:\nCréateur de l’hypermarc...
3,3524734366,Pmo Ou Chef De Projet Transformation Digitale ...,Merck Génériques,https://fr.linkedin.com/jobs/view/pmo-ou-chef-...,"Sèvres, Île-de-France, France",2023-03-12,\n Dans un souci d’accessibilité et de ...
4,3373273124,Data Scientist H/F,Capgemini Engineering,https://fr.linkedin.com/jobs/view/data-scienti...,"Vélizy-Villacoublay, Île-de-France, France",2023-03-12,\n Notre offre\nTESSELLA est le World C...


In [53]:
# Save the offers as a comma-separated, UTF-16 encoded CSV file, without the index column.
df_job.to_csv('/workspaces/codespaces-jupyter/data/linkedin_job_analysis/part_1_job_list.csv', sep=',', encoding='utf-16', index=False)

In [54]:
# Read the saved file back with the same separator and encoding, and preview it to check the export.
df_test = pd.read_csv('/workspaces/codespaces-jupyter/data/linkedin_job_analysis/part_1_job_list.csv', sep=',', encoding='utf-16')
df_test.head()

Unnamed: 0,id,title,company,url,location,date,description
0,3520314506,Analyste de données Service Clients (F/H),Louis Vuitton,https://fr.linkedin.com/jobs/view/analyste-de-...,"Paris, Île-de-France, France",2023-03-12,"\n\nAbout The Job\nDepuis plus de 150 ans, les..."
1,3516932318,Data Analyst,Jobs via eFinancialCareers,https://fr.linkedin.com/jobs/view/data-analyst...,"Paris, Île-de-France, France",2023-03-12,\n Vos responsabilités comprendront not...
2,3524834557,Data Analyst / Data Scientist F/H en Alternance,Carrefour,https://fr.linkedin.com/jobs/view/data-analyst...,"Massy, Île-de-France, France",2023-03-12,\n\nA propos de nous:\nCréateur de l’hypermarc...
3,3524734366,Pmo Ou Chef De Projet Transformation Digitale ...,Merck Génériques,https://fr.linkedin.com/jobs/view/pmo-ou-chef-...,"Sèvres, Île-de-France, France",2023-03-12,\n Dans un souci d’accessibilité et de ...
4,3373273124,Data Scientist H/F,Capgemini Engineering,https://fr.linkedin.com/jobs/view/data-scienti...,"Vélizy-Villacoublay, Île-de-France, France",2023-03-12,\n Notre offre\nTESSELLA est le World C...
