In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import os
import re
from pathlib import Path
from dotenv import load_dotenv

# Populate os.environ from a local .env file, if one is present (python-dotenv).
load_dotenv()

# Two-character postal-code prefixes; the spider only keeps crèches whose
# postal code starts with one of these.
departements = ["11", "74", "91"]

# Fail early, with an explicit message, when the input directory is not
# configured. An explicit check (rather than a bare `assert`) still runs under
# `python -O` and reports a missing and an empty value the same way.
if not os.environ.get("MONENFANT_HTML_DIR"):
    raise RuntimeError(
        "MONENFANT_HTML_DIR must be set (environment or .env file) to the "
        "directory containing the .html files to scrape"
    )

def get_html_file_list() -> "list[str]":
    """Return ``file://`` URIs for the ``.html`` entries of ``MONENFANT_HTML_DIR``.

    The directory is read from the ``MONENFANT_HTML_DIR`` environment variable
    and resolved to an absolute path first, so a relative setting still yields
    valid URIs. Entries are visited in sorted order to make the result
    reproducible from one run to the next. Each URI is printed as it is
    collected.

    Returns:
        The URIs of all directory entries whose name ends with ``.html``.

    Raises:
        KeyError: if ``MONENFANT_HTML_DIR`` is not set.
        OSError: if the directory cannot be listed.
    """
    root_dir = Path(os.environ['MONENFANT_HTML_DIR']).resolve()
    html_file_list = []
    for entry in sorted(root_dir.iterdir()):
        if entry.name.endswith(".html"):
            # as_uri() produces a well-formed file:// URI and percent-encodes
            # characters (spaces, accents, ...) that plain concatenation leaves raw.
            html_file_uri = entry.as_uri()
            print(html_file_uri)
            html_file_list.append(html_file_uri)
    return html_file_list


# Raw string so that `\d` is a regex token rather than an (invalid) string
# escape. The leading greedy `.*` makes the group capture the LAST run of five
# digits on the line.
_RE_CODE_POSTAL = re.compile(r'^.*(?P<codepostal>\d{5}).*$')


def get_code_postal(adresse: str) -> str:
    """Extract the five-digit postal code from an address string.

    Args:
        adresse: Address text expected to contain a five-digit postal code.

    Returns:
        The last sequence of five consecutive digits found in ``adresse``.

    Raises:
        ValueError: if ``adresse`` contains no five-digit sequence.
    """
    resultat = _RE_CODE_POSTAL.match(adresse)
    if resultat is None:
        raise ValueError(f"No 5-digit postal code found in address: {adresse!r}")
    return resultat.group('codepostal')


# Module-level accumulators filled by CrechesSpider.parse while crawling.

# Records (dicts) of the crèches kept by the spider, and their ids.
CRECHE_DATA_SCRAPED_1 = []
CRECHE_IDS = []

# Ids the spider flagged as duplicates / as belonging to another département.
CRECHE_DUPLICATE_IDS = []
CRECHE_BAD_DEPARTEMENT = []


class CrechesSpider(scrapy.Spider):
    """Spider extracting crèche records from locally saved HTML pages.

    Every ``div.panel`` directly under ``div.formulaire-content-liste`` is
    treated as one crèche. Nothing is yielded; results are accumulated in
    module-level lists:

    * ``CRECHE_DATA_SCRAPED_1`` / ``CRECHE_IDS``: crèches kept, i.e. the first
      occurrence of an id whose postal code starts with one of ``departements``;
    * ``CRECHE_DUPLICATE_IDS``: ids seen again after having been kept;
    * ``CRECHE_BAD_DEPARTEMENT``: ids whose postal code prefix is not in
      ``departements``.
    """

    name = "monenfant"

    # Evaluated once, when the class body is executed.
    start_urls = get_html_file_list()

    def parse(self, response):
        """Collect the crèches listed in one HTML page into the module-level lists.

        Args:
            response: the Scrapy response for one of ``start_urls``.
        """
        for creche in response.css("div.formulaire-content-liste > div.panel"):
            creche_id = creche.xpath("@id").get()
            adresse_brute = creche.css(
                "div.addresse-infos-structures-container > span::text"
            ).get()

            if adresse_brute is None:
                # Without an address there is no postal code to check; skip this
                # entry instead of letting an exception abort the rest of the page.
                self.logger.error("Creche %s has no address; skipped", creche_id)
                continue

            code_postal = get_code_postal(adresse_brute)
            # get_code_postal returns the LAST five-digit run, so cut at its last
            # occurrence: the street part is whatever precedes it.
            adresse_rue = adresse_brute.rpartition(code_postal)[0].strip()

            # Evaluate both conditions BEFORE mutating CRECHE_IDS; otherwise an id
            # that was just added would immediately be counted as a duplicate.
            is_duplicate = creche_id in CRECHE_IDS
            bad_departement = code_postal[0:2] not in departements

            if is_duplicate:
                CRECHE_DUPLICATE_IDS.append(creche_id)
            if bad_departement:
                CRECHE_BAD_DEPARTEMENT.append(creche_id)
            if is_duplicate or bad_departement:
                continue

            creche_data = {
                "id": creche_id,
                "nom": creche.css("div.panel-head h5.panel-title::text").get(),
                "adresse": adresse_rue,
                "code_postal": code_postal,
                "telephone": creche.css(".infos-structures-column-phone a.infos-structures-lien::text").get(),
                "courriel": creche.css("div[name='infos-mail'] a.infos-structures-lien::text").get()
            }

            CRECHE_DATA_SCRAPED_1.append(creche_data)
            CRECHE_IDS.append(creche_id)
                
# Run the spider in-process. Only ERROR-level log records are shown.
# NOTE(review): the "items.json" feed only receives items yielded by the
# spider; the counts printed below come from the module-level lists instead.
process = CrawlerProcess(
    settings={
        "FEEDS": {
            "items.json": {"format": "json"},
        },
        "LOG_LEVEL": "ERROR"
    }
)

process.crawl(CrechesSpider)
# Blocks until the crawl is finished (Scrapy's default for CrawlerProcess.start).
process.start()

# Adjacent f-strings are concatenated at compile time; a single-quoted string
# literal cannot span several physical lines.
print(
    f"Scraped {len(CRECHE_DATA_SCRAPED_1)} creches.\n"
    f"{len(CRECHE_DUPLICATE_IDS)} duplicate ids\n"
    f"{len(CRECHE_BAD_DEPARTEMENT)} creches in the wrong departement"
)





In [None]:
# Retrieve additional details for each scraped crèche from the monenfant.fr "get_structure_details" endpoint


import requests
import time
from datetime import datetime

# Upper bound (seconds) on each HTTP call, so a stalled connection cannot hang
# the whole cell.
REQUEST_TIMEOUT_SECONDS = 30

CRECHE_IDS_LEN = len(CRECHE_IDS)
# Reset on every run of the cell so re-running it does not duplicate rows.
CRECHE_DATA_SCRAPED_2 = []

print(f"Retrieving the data of {CRECHE_IDS_LEN} crèches...")

# `now` is also used by later cells to timestamp the output file names.
now = datetime.now()
today = now.strftime("%d/%m/%Y")

# The URL and headers are the same for every crèche: build them once.
url = 'https://monenfant.fr/web/guest/que-recherchez-vous?p_p_id=fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=%2Frecherche%2Frechercher&p_p_cacheability=cacheLevelPage&_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_cmd=get_structure_details'

headers = {
    'User-Agent': 'data.inclusion@beta.gouv.fr',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'fr,en-US;q=0.7,en;q=0.3',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'https://monenfant.fr',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Referer': 'https://monenfant.fr/que-recherchez-vous/mode-d-accueil',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-GPC': '1'
}

# A Session reuses the underlying connection across requests.
with requests.Session() as session:
    session.headers.update(headers)

    for i, creche_id in enumerate(CRECHE_IDS):
        data = {
            '_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_id': creche_id,
            '_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_dureeRecherche': '345',
            '_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_dateDebutRecherche': today
        }

        http_response = session.post(url, data=data, timeout=REQUEST_TIMEOUT_SECONDS)
        # Surface HTTP errors explicitly instead of failing later while
        # decoding an error page as JSON.
        http_response.raise_for_status()
        response = http_response.json()

        formatted_creche_data = {
            "id": creche_id,
            "avip": response['avip'],
            "presentation_resume": response['details']['presentation']['structureProjet'],
            "horaires_ouverture": response['details']['infosPratiques']['jourHoraire'],
            "commune": response['ville'],
            "site_web": response['details']['website'],
            "labels_nationaux": 'AVIP' if response['avip'] else "",
            "latitude": response['latitude'],
            "longitude": response['longitude'],
            "source": "monenfant",
            "lien_source": f"https://monenfant.fr/que-recherchez-vous/{creche_id}",
            "date_maj": response['derniereModifDate']
        }

        CRECHE_DATA_SCRAPED_2.append(formatted_creche_data)
        print(str(i + 1), response['nom'])
        # Pause between requests (presumably to go easy on the remote server).
        time.sleep(0.4)


In [None]:
import pandas as pd

# One string-typed frame per scraping stage, each indexed by crèche id.
df_1 = pd.DataFrame(CRECHE_DATA_SCRAPED_1, dtype=str).set_index("id")
df_2 = pd.DataFrame(CRECHE_DATA_SCRAPED_2, dtype=str).set_index("id")

# Persist both stages; file names carry the timestamp taken in the retrieval cell.
df_1.to_parquet(f'./scraped_from_html_{now.isoformat()}.parquet')
df_2.to_parquet(f'./scraped_from_api_{now.isoformat()}.parquet')
print("done")

In [None]:
# Combine both stages on their shared "id" index, keeping every row of df_1,
# then export the merged table as CSV.
df = df_1.join(df_2, how="left")
df.to_csv(f'./monenfant_{now.isoformat()}.csv')