# In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from pathlib import Path
import re
import dateparser
import trafilatura

# When True, the spider below reads course pages from local HTML snapshots
# (see formation_base_path) and disables the download delay.
# NOTE: this flag does not switch URL itself; URL must be edited by hand.
TESTING_WITH_LOCAL_FILES = True

# Entry page of the crawl (list of training courses).
# Live search URL, kept for reference:
# URL = "https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D="
URL = 'file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/structure-list.html'

# Live HTML (don't use too much!)
# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'

# Local HTML
structure_base_url = "file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/structures/"
formation_base_path = "file:///home/colin/git/data-inclusion/analyse/notebooks/reseau-alpha/services/"

# Example pages on the live site:
# Structure with branches and training courses: https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries
# Structure with no branch and no training course: https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi
# Training course: https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle

def html_to_markdown(s: "str | list | None"):
    """Extract readable text from an HTML fragment or a list of fragments.

    Args:
        s: An HTML fragment, a list/tuple of fragments, ``None`` or ``""``.
            Fragments of a sequence are joined with ``<br/>``; ``None``
            entries (e.g. a selector that matched nothing) are skipped.

    Returns:
        ``s`` unchanged when it is ``None`` or ``""``; otherwise the result
        of ``trafilatura.extract`` applied to the fragment wrapped in an
        ``<html>`` element.
    """
    if s is None or s == "":
        return s
    if isinstance(s, (list, tuple)):
        # A None entry would make str.join raise TypeError.
        s = "<br/>".join(fragment for fragment in s if fragment is not None)
    return trafilatura.extract(trafilatura.load_html("<html>" + s + "</html>"))

def clean_adresse(adresse_parts) -> dict:
    """Split the text lines of an address block into structured fields.

    Each line is stripped; empty lines are ignored. A line starting with
    five digits is read as ``<postal code> - <city>`` (the separator may be
    missing); any other line starting with a digit is taken as the street
    address. When several lines match the same rule, the last one wins.

    Args:
        adresse_parts: Iterable of raw text lines of one address block.

    Returns:
        A dict with the keys ``adresse_entiere`` (all non-empty lines joined
        with ``", "``), ``adresse``, ``code_postal`` and ``commune``. Fields
        that could not be found are empty strings.
    """
    cleaned = {
        "adresse_entiere": "",
        "adresse": "",
        "code_postal": "",
        "commune": "",
    }
    kept_parts = []
    for raw_part in adresse_parts:
        part = raw_part.strip()
        if not part:
            continue
        kept_parts.append(part)

        postal_line = re.match(r"^(\d{5})[\s-]*(.*)", part, re.DOTALL)
        if postal_line:
            cleaned["code_postal"] = postal_line.group(1)
            # Keep only the first " - "-separated segment after the code.
            cleaned["commune"] = postal_line.group(2).split(" - ")[0].strip()
        elif re.match(r"^\d", part):
            cleaned["adresse"] = part

    # str.join avoids the dangling ", " a running concatenation leaves behind.
    cleaned["adresse_entiere"] = ", ".join(kept_parts)
    return cleaned

def strip(maybe_string):
    """Strip surrounding whitespace from a value that may not be a string.

    Args:
        maybe_string: Any value, typically the result of a selector lookup.

    Returns:
        ``""`` for ``None``, the stripped text for a ``str`` (including
        subclasses), and the value itself for anything else.
    """
    if maybe_string is None:
        return ""
    if isinstance(maybe_string, str):
        return maybe_string.strip()
    return maybe_string


class AlphaSpider(scrapy.Spider):
    """Scrape training-course ("formation") pages listed on reseau-alpha.org.

    The crawl starts from the listing page ``URL``, schedules one request per
    course link found there, and yields one ``service`` dict per course page.
    When ``TESTING_WITH_LOCAL_FILES`` is true, course pages are loaded from
    ``formation_base_path`` instead of following the links of the listing.
    """

    name = "alpha"
    custom_settings = {
        # No throttling for local files; wait between requests otherwise.
        "DOWNLOAD_DELAY": 0 if TESTING_WITH_LOCAL_FILES else 0.5
    }

    def start_requests(self):
        """Yield the initial request for the course listing page."""
        yield scrapy.Request(url=URL, callback=self.parse)

    def parse(self, response):
        """Schedule one ``parse_formation`` request per course link."""
        formations_links = response.css(
            'div#div-accordion-formation > div.contact-content a.readon'
        )

        if TESTING_WITH_LOCAL_FILES:
            for href in formations_links.xpath('@href').getall():
                # The local copy is looked up under the last path segment
                # of the link.
                next_page = formation_base_path + href.split("/")[-1]
                yield scrapy.Request(next_page, callback=self.parse_formation)
        else:
            for link in formations_links:
                yield response.follow(link, callback=self.parse_formation)

    # TODO: crawl structure pages too (structure_base_url + slug) with a
    # parse_structure callback. An earlier draft read the name from
    # 'div#structure > strong::text', the address lines from
    # 'div.lieu div.adresse::text', the phone from
    # 'div.lieu div.telephone > a::attr(href)' (dropping the first four
    # characters) and the website from 'div.lieu div.facebook::text'.

    def parse_formation(self, response):
        """Build and yield the ``service`` record of one course page.

        Values that cannot be found on the page are emitted as empty strings
        instead of aborting the item. The key insertion order is kept stable
        because the CSV feed derives its columns from it.

        To snapshot pages for local testing, write ``response.body`` to
        ``services/<last URL segment>``.
        """
        formation_entete = response.css('div.entete')
        formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')
        formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')
        formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')
        formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')
        formation_lieux_horaires = response.css('div#lieux-formation')

        # SERVICE
        service = {}

        # Id
        service["id"] = response.url.split("/")[-1]

        # Name
        service_nom_1 = strip(response.css("div.titre-element > strong::text").get())
        service_nom_2 = strip(response.css("a.underline.red-alpha + div::text").get())
        service["nom"] = f"{service_nom_1} ({service_nom_2})"

        # Last update date: keep what follows the last ":" of the text node.
        date_maj_raw = strip(
            response.css("a.underline.red-alpha + div + div::text").get()
        )
        date_maj_fr = date_maj_raw.split(":")[-1].strip()
        # dateparser.parse returns None for text it cannot interpret.
        date_maj = dateparser.parse(date_maj_fr) if date_maj_fr else None
        service["date_maj"] = date_maj.isoformat() if date_maj is not None else ""

        # Description
        contenu_objectif_public = formation_contenu_col1.css(".row").getall()
        informations_pratiques_html = formation_informations_pratiques.get()
        if informations_pratiques_html is not None:
            # append, not +=: extending a list with a str adds one item per
            # character.
            contenu_objectif_public.append(informations_pratiques_html)
        # Descriptions are very long and make the other fields hard to check,
        # so the field is disabled for now.
        # service["presentation_detail"] = html_to_markdown(contenu_objectif_public)

        # Link to the source page
        service["lien_source"] = response.url

        # Email: drop the scheme of the "mailto:" link.
        courriel_href = strip(
            formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()
        )
        service["courriel"] = courriel_href.split(":")[-1]

        # Address
        clean_adresse_parts = clean_adresse(
            formation_lieux_horaires.css("div.adresse::text").getall()
        )
        service["adresse"] = clean_adresse_parts["adresse"]
        service["code_postal"] = clean_adresse_parts["code_postal"]
        service["commune"] = clean_adresse_parts["commune"]

        # Phone (not extracted yet)
        service["telephone"] = ""

        # Contact first and last name (not extracted yet)
        service["contact_nom_prenom"] = ""

        # Themes
        service["thematiques"] = ["apprendre-francais--suivre-formation"]
        if service_nom_2 == "Français à visée professionnelle":
            service["thematiques"].append("apprendre-francais--accompagnement-insertion-pro")
        if service_nom_2 == "Français à visée sociale et communicative":
            service["thematiques"].append("apprendre-francais--communiquer-vie-tous-les-jours")

        # Hard coded fields
        service["zone_diffusion_type"] = "departement"
        service["zone_diffusion_code"] = "91"
        service["zone_diffusion_nom"] = "Essonne"
        service["types"] = ["formation"]
        service["cumulable"] = True
        service["contact_public"] = True

        # STRUCTURE
        # Only the id (last segment of the structure link) is exported. The
        # structure name is available from
        # "div.titre-element ~ a.underline.red-alpha::text" if needed later.
        structure_href = strip(
            formation_entete.css("div.titre-element ~ a.underline.red-alpha::attr(href)").get()
        )
        service["structure_id"] = structure_href.split("/")[-1]

        yield service
        
    
# Run the spider in-process and export the items to alpha.json and alpha.csv.
process = CrawlerProcess(settings={
    "FEEDS": {
        "alpha.json": {
            "format": "json",
            "overwrite": True,
            "encoding": "utf8",
            "store_empty": False,
            # ensure_ascii is an exporter argument, not a feed option: it has
            # to go through item_export_kwargs to be taken into account.
            "item_export_kwargs": {"ensure_ascii": False},
        },
        "alpha.csv": {
            "format": "csv",
            "overwrite": True,
            "encoding": "utf8",
        },
    },
})
process.crawl(AlphaSpider)
process.start()