
Étapes de scraping du site [The Muse](https://www.themuse.com/).\
On utilise au maximum les JSON récupérés dans les appels à l'API du site pour avoir des données structurées.

### Il y a 3 étapes principales. 

1.  requête sur un type de job, par exemple : "data engineer".
    - method: get
    - url : https://www.themuse.com/api/search-renderer/jobs?
    - params : ctsEnabled=false&query=Data+Engineer&preference=krcbqorfvz&limit=20&timeout=5000
    - récupération du JSON qui présente les différentes offres. Données conservées :
        - job_title
        - company.short_name
        - short_title
        - posted_at
        - cursor (le dernier cursor est utile pour la pagination) --> start_after = dernier cursor
        - has_more (utile pour la pagination)

2.  récupération de chaque job dans le JSON reçu et requête pour obtenir le HTML de chaque job
    - method: get
    - url : https://www.themuse.com/jobs/
    - params: [hit.company.short_name]/[hit.short_title]

3.  dans le HTML, récupérer le JSON
    - dans la balise `<script id="__NEXT_DATA__" type="application/json"></script>`

### données conservées

Pour l'instant, on conserve les données suivantes :


In [5]:
import aiohttp
from string import Template
from typing import List

import os

# Base URL of the scraped site; the paths below are appended to it.
site_url: str = "https://www.themuse.com"
# Path of the JSON search endpoint that lists job offers.
search_url: str = "/api/search-renderer/jobs"
# Path template of a single job page, filled with the company and job slugs.
job_url: Template = Template("/jobs/$company/$job_name")
# Endpoint of the proxy provider API.
proxies_url: str = "https://api.buyproxies.org"
# The provider key can be supplied through the BUYPROXIES_API_KEY environment
# variable so that it does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to
# source control; it is kept only for backward compatibility and should be
# rotated and removed once the environment variable is in place.
proxies_api_key: str = (
    os.environ.get("BUYPROXIES_API_KEY") or "5c38699e3dd06dc81f32a2ce7e8bb091"
)
# Proxy URLs; starts empty and is filled by the proxy-fetching cell.
proxies: List[str] = []

In [6]:
# appel des proxies

from pprint import pprint
from typing import Dict
import aiohttp

headers = {
     "Accept": "plain/text"
}
params:Dict =dict(
    a="showProxies",
    pid=184389,
    key=proxies_api_key,
    port=12345
)

async with aiohttp.ClientSession() as session:
    try:
        async with session.get(url=proxies_url, params=params) as resp:
            if resp.status == 200:
                proxies_text = await resp.text()
                proxies = proxies_text.split('\n')
                proxies = [f'https://{proxy.split(':')[2]}:{proxy.split(':')[3]}@{proxy.split(':')[0]}:{proxy.split(':')[1]}'
                      for proxy in proxies
                      if proxy ]
    except Exception as exc :
            print(exc)

# Pool of browser User-Agent strings; request_jobs_api picks one at random
# for each search request.
# NOTE(review): several entries are exact duplicates (the Linux Chrome 103 and
# Windows Chrome 103 strings each appear three times), so a uniform random
# pick is weighted towards them — confirm whether that is intended.
user_agents: List[str]= [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Windows; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
    "Mozilla/5.0 (Windows NT 10.0; Windows; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Windows; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36"
  ]


In [7]:
from typing import Dict, List
import re
import random

async def request_jobs_api(
    query: str,
    next_page: str | None = None,
    limit: int = 20,
) -> Dict | None:
    """Query the site's job-search endpoint and return the decoded JSON.

    When paginating, the cursor of the last job of the previous page is sent
    as the ``start_after`` parameter.

    Args:
        query (str): free-text search, e.g. "Data engineer". Runs of
            whitespace are collapsed to single spaces.
        next_page (str | None, optional): cursor after which the results
            should start. Defaults to None (first page).
        limit (int, optional): maximum number of results requested per page.
            Defaults to 20.

    Returns:
        Dict | None: the JSON payload returned by the endpoint, or None when
        the response status is not 200 or the request raised.
    """
    # Spaces are left as spaces: aiohttp encodes query values itself, so a
    # pre-inserted "+" would be sent as the literal "%2B" instead of a
    # word separator.
    normalized_query: str = re.sub(r'\s+', ' ', query).strip()
    search_params: Dict = {
        "ctsEnabled": "false",
        "query": normalized_query,
        "preference": "krcbqorfvz",
        "limit": limit,
        "timeout": 5000,
    }
    if next_page:
        search_params["start_after"] = next_page

    # NOTE: the fetched proxies are not used yet; only the User-Agent rotates.
    user_agent: str = random.choice(user_agents)
    headers = {"User-Agent": user_agent, "Accept": "application/json"}
    url: str = f"{site_url}{search_url}"

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url=url, params=search_params, headers=headers) as resp:
                if resp.status == 200:
                    return await resp.json()
                # Make non-200 answers visible instead of failing silently.
                print(f"request_jobs_api: unexpected HTTP status {resp.status}")
        except Exception as exc:
            # Best effort: report the problem and let the caller see None.
            print(exc)

    return None




In [8]:
# extraction des données à conserver dans les jobs
# on peut utiliser glom pour l'extraction des json
from typing import Tuple
from glom import glom
from dataclasses import dataclass, field

# creation des Entités qui structurent les données et permettent une conservation en base
@dataclass
class JobUrl:
    """Reference to one job page, with its site-relative URL.

    ``url`` is not accepted by the constructor: it is derived from
    ``company`` and ``job_name`` right after initialisation.
    """

    # Company slug used in the job page path.
    company: str
    # Job slug used in the job page path.
    job_name: str
    # Relative path of the job page, computed in __post_init__.
    url: str = field(init=False)
    # Flag defaulting to False; not modified anywhere in this class.
    processed: bool = False

    def __post_init__(self) -> None:
        """Fill ``url`` from the module-level ``job_url`` template."""
        placeholders = {"company": self.company, "job_name": self.job_name}
        self.url = job_url.safe_substitute(placeholders)
        

async def scrap_jobs(response_api: Dict) -> Dict:
    """Extract the job references and pagination data from a search response.

    Args:
        response_api (Dict): payload returned by the job-search endpoint.
            A None or empty payload, or one without hits, is tolerated.

    Returns:
        Dict: a dictionary with three keys:
            - ``has_more``: value of the response's ``has_more`` field
              (False when there are no hits to continue from);
            - ``next_page``: cursor of the last hit in the order returned by
              the API, or None when there are no hits;
            - ``job_urls``: list of JobUrl, ordered by decreasing score.
    """
    hits: List[Dict] = (response_api or {}).get('hits') or []
    if not hits:
        # Nothing to extract and no cursor to paginate from.
        return dict(has_more=False, next_page=None, job_urls=[])

    job_specs = {
        "company": "hit.company.short_name",
        "job_name": "hit.short_title",
        "score": "score",
        "cursor": "cursor"
    }
    job_list: List[Dict] = [glom(job, job_specs) for job in hits]

    # The pagination cursor must be the one of the last hit as returned by
    # the API, so it is read before the list is re-ordered by score.
    next_page: str = job_list[-1].get('cursor')

    ranked_jobs: List[Dict] = sorted(job_list, key=lambda k: k['score'], reverse=True)
    job_urls: List[JobUrl] = [
        JobUrl(company=job.get('company'), job_name=job.get('job_name'))
        for job in ranked_jobs
    ]

    has_more: bool = response_api.get('has_more', False)

    return dict(has_more=has_more, next_page=next_page, job_urls=job_urls)


In [11]:
# appel de la methode une 1ere fois
from pprint import pprint

joburls_ls: List[JobUrl] = []
count: int = 0
next_page: str | None= None
query: str = "Data engineer"
has_more: bool = True

while has_more :
# while count < 10:

    job_api_reponse: List[Dict] = await request_jobs_api(query=query, next_page=next_page)
    scrap_result: Dict = await scrap_jobs(job_api_reponse)

    has_more: str = scrap_result.get('has_more')
    next_page: str = scrap_result.get('next_page')
    joburls_ls.extend(scrap_result.get('job_urls'))

    print(has_more)
    print(next_page)

    # simulation has_more=False apres 5 iterations
    count+=1
    if count > 2:
        has_more = False

print(len(joburls_ls))
joburls_ls

True
84.74229,1727281535000,04b3be51-2b3d-45a6-90ad-ed17a47db205
True
65.37807,1690931340000,2262703c-b2b6-4ade-9bc3-7022f659b9e0
True
64.38387,1727125877000,d53ec78a-3681-4671-8536-db0a4ffc43af
60


[JobUrl(company='arcadia', job_name='data-engineer-950ad7', url='/jobs/arcadia/data-engineer-950ad7', processed=False),
 JobUrl(company='leidos', job_name='data-engineer-d8bdf7', url='/jobs/leidos/data-engineer-d8bdf7', processed=False),
 JobUrl(company='appfire', job_name='data-engineer', url='/jobs/appfire/data-engineer', processed=False),
 JobUrl(company='constellationbrands', job_name='data-engineer', url='/jobs/constellationbrands/data-engineer', processed=False),
 JobUrl(company='coinbase', job_name='data-engineer-0b1092', url='/jobs/coinbase/data-engineer-0b1092', processed=False),
 JobUrl(company='appfire', job_name='data-engineer-85af17', url='/jobs/appfire/data-engineer-85af17', processed=False),
 JobUrl(company='kyndryl', job_name='data-engineer-f6b05a', url='/jobs/kyndryl/data-engineer-f6b05a', processed=False),
 JobUrl(company='atlassian', job_name='data-engineer-ii', url='/jobs/atlassian/data-engineer-ii', processed=False),
 JobUrl(company='arcadia', job_name='senior-data

In [12]:
# extract the details of each job



