# In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from pathlib import Path
import re

# Listing page that AlphaSpider requests first; the structure slugs are read
# from it.
# Live search-results page (disabled; kept so it can be switched back on):
# URL = "https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D="
# Local copy of the listing page (absolute file:// path; adjust to your checkout).
URL = 'file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/structure-list.html'

# Prefix to which a structure slug is appended to build the URL of that
# structure's page. Exactly one of the two definitions below should be active.

# Live HTML (don't use too much!)
# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'

# Local HTML (absolute file:// path; adjust to your checkout)
structure_base_url = "file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/structures/"

class AlphaSpider(scrapy.Spider):
    """Scrape the structure listing page and every structure page it links to.

    The crawl starts at ``URL``, issues one request per structure slug found
    on that page, and yields one item (a plain dict) per structure page.
    """

    name = "alpha"

    # An address line is recognised by its leading digit (presumably the
    # street number); a line opening with five digits is taken to be the
    # postal-code line. Both patterns are applied to already-stripped text.
    _STARTS_WITH_DIGIT = re.compile(r"\d")
    _POSTAL_CODE = re.compile(r"\d{5}")

    def start_requests(self):
        """Yield the initial request for the structure listing page."""
        yield scrapy.Request(url=URL, callback=self.parse)

    def parse(self, response):
        """Follow every structure referenced on the listing page.

        Each ``h3`` directly under ``div#div-accordion-structure`` carries a
        ``data-slug`` attribute; appending that slug to
        ``structure_base_url`` gives the URL of the structure's own page.
        """
        slugs = response.css(
            "div#div-accordion-structure > h3::attr(data-slug)"
        ).getall()

        for slug in slugs:
            yield scrapy.Request(
                structure_base_url + slug, callback=self.parse_structure
            )

    def parse_structure(self, response):
        """Extract one item from a structure page.

        Yields a dict with the keys ``structure_id`` (last path segment of
        the response URL), ``structure_name``, ``code_postal``, ``adresse``,
        ``adresse_entière`` (every stripped address line, in page order),
        ``site_web`` and ``telephone``. Fields whose element is missing from
        the page are emitted as empty strings instead of raising.
        """
        slug = response.url.split("/")[-1]
        # To keep an offline copy of the page (presumably how the local
        # "structures/" directory was populated), uncomment:
        # Path(f"structures/{slug}").write_bytes(response.body)

        # Strip before matching: the raw text nodes may carry surrounding
        # whitespace that would otherwise defeat the leading-digit tests.
        address_lines = [
            part.strip()
            for part in response.css("div.lieu div.adresse::text").getall()
        ]
        self.logger.debug("Address lines for %s: %r", slug, address_lines)

        code_postal = adresse = ""
        for line in address_lines:
            if self._POSTAL_CODE.match(line):
                code_postal = line[:5]
            elif self._STARTS_WITH_DIGIT.match(line):
                adresse = line

        # The link target is expected to look like "tel:0123456789"; drop the
        # scheme only when it is really there so a bare number is not
        # truncated.
        telephone = (
            response.css("div.lieu div.telephone > a::attr(href)")
            .get(default="")
            .strip()
        )
        if telephone[:4].lower() == "tel:":
            telephone = telephone[4:]

        structure_name = (
            response.css("div#structure > strong::text").get(default="").strip()
        )
        # NOTE(review): the "site_web" field is read from the element with
        # class "facebook" — confirm this is the intended source.
        site_web = (
            response.css("div.lieu div.facebook::text").get(default="").strip()
        )

        yield {
            "structure_id": slug,
            "structure_name": structure_name,
            "code_postal": code_postal,
            "adresse": adresse,
            "adresse_entière": address_lines,
            "site_web": site_web,
            "telephone": telephone,
        }
            
    
# Run the spider in-process and export the scraped items to structures.csv
# as CSV, overwriting the file if it already exists.
process = CrawlerProcess(
    settings={"FEEDS": {"structures.csv": {"format": "csv", "overwrite": True}}}
)
process.crawl(AlphaSpider)
# Per the Scrapy documentation, start() blocks until the crawl has finished.
process.start()