In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from pathlib import Path
import re
import dateparser
import trafilatura

# When True, the spider rewrites formation links to the local snapshot files
# and disables the download delay.
# NOTE(review): this flag is False while URL below points at a local file --
# confirm this mixed configuration is intended.
TESTING_WITH_LOCAL_FILES = False

# Root folder of the HTML snapshots saved on disk.
_LOCAL_SNAPSHOT_ROOT = "file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/"

# Entry page of the crawl. The live search page is kept here for reference:
# URL = "https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D="
URL = _LOCAL_SNAPSHOT_ROOT + "structure-list.html"

# Live HTML (don't use too much to avoid being banned!)
# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'

# Local HTML
structure_base_url = _LOCAL_SNAPSHOT_ROOT + "structures/"
formation_base_path = _LOCAL_SNAPSHOT_ROOT + "services/"

# Structure with branches ("antennes") and formations: https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries
# Structure without branch and without formation: https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi
# Formation: https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle

def html_to_markdown(s: "str | list[str] | None") -> "str | None":
    """Extract the readable text of an HTML fragment with trafilatura.

    Args:
        s: An HTML fragment, a list of HTML fragments (joined with ``<br/>``),
            or None. None and empty fragments inside a list are skipped.

    Returns:
        ``s`` unchanged when it is None or an empty string, ``""`` when the
        list holds no usable fragment, otherwise the result of
        ``trafilatura.extract`` on the fragment wrapped in ``<html>`` tags.
    """
    if isinstance(s, list):
        # Drop None/empty fragments so that join() cannot fail on a missing node.
        s = "<br/>".join(fragment for fragment in s if fragment)
    if not s:
        # Nothing to extract: avoid feeding an empty document to trafilatura.
        return s
    return trafilatura.extract(trafilatura.load_html("<html>" + s + "</html>"))

def clean_adresse(adresses: "list[scrapy.Selector]") -> list[dict[str, str]]:
    """Turn address nodes into flat address records.

    Each element of ``adresses`` must expose ``xpath('text()').getall()``
    returning the text chunks of one address. Chunks are classified by their
    first characters:

    * a chunk starting with five digits is read as ``"<code postal> - <commune>"``;
    * any other chunk starting with a digit is taken as the street address;
    * every non-empty chunk is part of the full address.

    Args:
        adresses: Iterable of selector-like objects, one per address.

    Returns:
        One dict per address with the keys
        ``structure_service_adresse_entiere``, ``structure_service_adresse``,
        ``structure_service_code_postal`` and ``structure_service_commune``.
        Fields that could not be found are empty strings.
    """
    lieux = []
    for adresse in adresses:
        stripped_chunks = (chunk.strip() for chunk in adresse.xpath('text()').getall())
        # Whitespace-only text nodes would otherwise add empty ", " segments.
        parts = [part for part in stripped_chunks if part]
        clean_lieu = {
            # join() avoids the dangling ", " left by repeated concatenation.
            "structure_service_adresse_entiere": ", ".join(parts),
            "structure_service_adresse": "",
            "structure_service_code_postal": "",
            "structure_service_commune": "",
        }
        for part in parts:
            if re.match(r'^\d{5}', part):
                pieces = part.split(" - ")
                clean_lieu["structure_service_code_postal"] = pieces[0].strip()
                # The commune may be absent: do not fail on a bare postal code.
                clean_lieu["structure_service_commune"] = (
                    pieces[1].strip() if len(pieces) > 1 else ""
                )
            elif re.match(r'^\d', part):
                clean_lieu["structure_service_adresse"] = part
        lieux.append(clean_lieu)
    return lieux

def strip(maybe_string):
    """Strip surrounding whitespace from a value that may not be a string.

    Args:
        maybe_string: Any value, typically the result of a selector ``.get()``.

    Returns:
        ``""`` for None, the stripped text for a string, and any other value
        unchanged.
    """
    if maybe_string is None:
        return ""
    if isinstance(maybe_string, str):
        return maybe_string.strip()
    return maybe_string


class AlphaSpider(scrapy.Spider):
    """Scrape formations listed on reseau-alpha.org together with their structure.

    The crawl goes listing page -> formation page -> structure page. One item
    is emitted per (formation, lieu) pair; each item carries the service
    fields plus the ``structure_*`` fields of the owning structure.
    """

    name = "alpha"
    custom_settings = {
        # No throttling is needed when reading local files.
        "DOWNLOAD_DELAY": 0 if TESTING_WITH_LOCAL_FILES else 0.5
    }

    def start_requests(self):
        """Yield the request for the listing page given by ``URL``."""
        yield scrapy.Request(url=URL, callback=self.parse)

    def parse(self, response):
        """Follow every formation link found on the listing page."""
        formations_links = response.css('div#div-accordion-formation > div.contact-content a.readon')

        if TESTING_WITH_LOCAL_FILES:
            # Local snapshots are stored flat, named after the last URL segment.
            for slug in formations_links.xpath('@href').getall():
                next_page = formation_base_path + slug.split("/")[-1]
                yield scrapy.Request(next_page, callback=self.parse_formation)
        else:
            for a in formations_links:
                yield response.follow(a, callback=self.parse_formation)

        # for slug in dataslugs:
        #     next_page = structure_base_url + slug
        #     yield scrapy.Request(next_page, callback=self.parse_structure)

    def _parse_date_maj(self, raw_text, separator):
        """Convert a ``"<label><separator><date>"`` text into an ISO date string.

        Args:
            raw_text: Text scraped from the page, or None when the node is missing.
            separator: Separator between the label and the date.

        Returns:
            The ISO-formatted date, or ``""`` when the text is missing or
            cannot be parsed by ``dateparser``.
        """
        if not raw_text:
            return ""
        date_text = raw_text.split(separator)[-1].strip()
        if not date_text:
            return ""
        parsed = dateparser.parse(date_text)
        if parsed is None:
            # dateparser returns None instead of raising on unknown formats.
            self.logger.warning("Unparseable date: %r", date_text)
            return ""
        return parsed.isoformat()

    def parse_formation(self, response):
        """Build the service record of a formation page, one per lieu.

        Each record is passed through ``meta["item"]`` to ``parse_structure``,
        which completes it with the structure fields and yields it.
        """
        # Downloading HTML content
        # page = response.url.split("/")[-1]
        # filename = f"services/{page}"
        # Path(filename).write_bytes(response.body)

        formation_entete = response.css('div.entete')
        formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')
        formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')
        formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')
        formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')
        formation_lieux_horaires = response.css('div#lieux-formation')

        # SERVICE
        item = {}

        # Id
        item["id"] = response.url.split("/")[-1]

        # Name
        service_nom_1 = strip(response.css("div.titre-element > strong::text").get())
        service_nom_2 = strip(response.css("a.underline.red-alpha + div::text").get())
        item["nom"] = f"{service_nom_1} ({service_nom_2})"

        # Last update date
        item["date_maj"] = self._parse_date_maj(
            response.css("a.underline.red-alpha + div + div::text").get(), ":"
        )

        # Description
        contenu_objectif_public = formation_contenu_col1.css(".row").getall()
        informations_pratiques_html = formation_informations_pratiques.get()
        if informations_pratiques_html is not None:
            # append(), not +=: extending a list with a string adds it one
            # character at a time.
            contenu_objectif_public.append(informations_pratiques_html)
        # Descriptions are very long and make the other fields hard to check
        # while testing.
        if not TESTING_WITH_LOCAL_FILES:
            item["presentation_detail"] = html_to_markdown(contenu_objectif_public)

        # Link to the source
        item["lien_source"] = response.url

        # Email (the href is a "mailto:" link)
        item["courriel"] = strip(formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()).split(":")[-1]

        # Address
        clean_lieux = clean_adresse(formation_lieux_horaires.css("div.adresse"))

        # Phone
        item["telephone"] = ""

        # Contact first and last name
        item["contact_nom_prenom"] = ""

        # Themes
        item["thematiques"] = ["apprendre-francais--suivre-formation"]
        if service_nom_2 == "Français à visée professionnelle":
            item["thematiques"].append("apprendre-francais--accompagnement-insertion-pro")
        if service_nom_2 == "Français à visée sociale et communicative":
            item["thematiques"].append("apprendre-francais--communiquer-vie-tous-les-jours")

        # Hard coded fields
        item["zone_diffusion_type"] = "departement"
        item["zone_diffusion_code"] = "91"
        item["zone_diffusion_nom"] = "Essonne"
        item["types"] = ["formation"]
        item["cumulable"] = True
        item["contact_public"] = True
        item["modes_accueil"] = ["en-presentiel"]

        # STRUCTURE
        structure_link = formation_entete.css("div.titre-element ~ a.underline.red-alpha")
        structure_href = structure_link.xpath("@href").get()
        if structure_href is None:
            # Without a structure link there is no structure page to follow,
            # so no record can be completed for this formation.
            self.logger.warning("No structure link found on %s", response.url)
            return
        # Structure id
        item["structure_id"] = structure_href.split("/")[-1]

        # One service row/record per lieu
        for lieu in clean_lieux:
            self.logger.debug("lieu: %s", lieu)
            # dont_filter: the same structure page is requested once per lieu.
            yield from response.follow_all(
                structure_link,
                callback=self.parse_structure,
                meta={"item": item | lieu},
                dont_filter=True,
            )

    def parse_structure(self, response):
        """Complete the record received in ``meta["item"]`` with structure fields."""
        # Copy so that requests sharing the same meta dict do not mutate each other.
        item = dict(response.meta["item"])

        # filename = f"structures/{structure_id}"
        # Path(filename).write_bytes(response.body)

        # Name
        item["structure_nom"] = strip(response.css('div#structure > strong::text').get())

        # Last update date
        item["structure_date_maj"] = self._parse_date_maj(
            response.css('div.structures-dates > div:nth-child(2)').xpath('text()').get(), " : "
        )

        # Address
        # On the website, a structure has as many addresses as it has lieux
        # for its services. Some services are offered at every address of the
        # structure, some are not.

        # Phone
        telephone = response.css('div.lieu div.telephone > a::attr(href)').get()
        if isinstance(telephone, str):
            # Phone numbers are prefixed with "tel:".
            telephone = telephone.strip().removeprefix("tel:")
        else:
            telephone = ""
        item["structure_telephone"] = telephone

        # Website
        item["structure_site_web"] = strip(response.css('div.lieu div.facebook::text').get())

        # Source link
        item["structure_lien_source"] = response.url

        # Labels
        item["structure_labels_autres"] = ["reseau-alpha"]

        # Themes
        item["structure_thematiques"] = ["apprendre-francais--suivre-formation"]

        yield item

    
# Feed exports: every scraped item is written to both a JSON and a CSV file,
# each overwritten on every run.
_FEED_EXPORTS = {
    "alpha.json": {
        "format": "json",
        "overwrite": True,
        "ensure_ascii": False,
        'encoding': 'utf8',
        'store_empty': False,
    },
    "alpha.csv": {
        "format": "csv",
        "overwrite": True,
        'encoding': 'utf8',
    },
}

# Run the spider in-process; start() blocks until the crawl is finished.
process = CrawlerProcess(settings={"FEEDS": _FEED_EXPORTS})
process.crawl(AlphaSpider)
process.start()

In [None]:
import pandas as pd

# Load the CSV export with every column read as text and no index column,
# then print the column summary.
_read_options = {"dtype": str, "index_col": None}
df = pd.read_csv('./alpha.csv', **_read_options)
df.info()