In [1]:
import sys
# Show the path of the Python interpreter this notebook is running on.
print(sys.executable)

/home/maxime/dev/python/sndx/.venv/bin/python3.12


In [106]:
import os
import time
import asyncio
import sndx
from sndx import Scrapper
from pathlib import Path
from datetime import timedelta
from pyppeteer import launch
from queue import Queue
import yaml
import re

# Path of the Chromium binary passed to pyppeteer's launch(); None when the
# CHROMIUM_PATH environment variable is not set.
CHROMIUM_PATH = os.environ.get("CHROMIUM_PATH")
# Base URL of the catalogue site.
ROOT_URL = "https://vod.catalogue-crc.org"
# Categories landing page, derived from ROOT_URL so the host is spelled only once.
CATEGORIES_URL = f"{ROOT_URL}/categorie.html"

def parse_duration(s: str) -> timedelta:
    """Parse a ``H:MM:SS`` or ``MM:SS`` string into a ``timedelta``.

    Args:
        s: Duration text made of two or three colon-separated integer fields.

    Returns:
        The duration as a ``datetime.timedelta`` (not a number of seconds;
        call ``total_seconds()`` on the result to get one).

    Raises:
        ValueError: If ``s`` does not have exactly two or three fields, or if
            a field cannot be converted with ``int``.
    """
    parts = [int(field) for field in s.split(":")]
    if len(parts) == 3:
        hours, minutes, seconds = parts
    elif len(parts) == 2:
        # No hour field: the string is minutes and seconds only.
        hours = 0
        minutes, seconds = parts
    else:
        raise ValueError(f"Invalid duration format: {s}")

    return timedelta(hours=hours, minutes=minutes, seconds=seconds)



def safe_filename(s):
    """Turn ``s`` into a name limited to ASCII letters, digits, ``.``, ``_`` and ``-``.

    Surrounding whitespace is stripped first; every run of other characters is
    then collapsed into a single underscore. An empty result falls back to
    ``"file"``.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", s.strip())
    if cleaned:
        return cleaned
    return "file"

In [4]:
# Profile identifier handed to the recording scrapper below.
profile = "sndx-profile-1"

# Smoke test: open a sink, then a non-headless RecordingScrapper bound to it, and let
# both context managers exit after a short pause.
with sndx.Sink() as sink:
    async with sndx.RecordingScrapper(profile, sink, headless=False) as scrapper:
        print("Doing stuff")
        # Wait three seconds between the two messages.
        await asyncio.sleep(3)
        print("Stuff finished")

Opening sink sndx-FBFXmg
[RecordingScrapper-ZbkGiQ] Starting with profile [sndx-profile-1] and sink [sndx-FBFXmg]...
Doing stuff
Stuff finished
[RecordingScrapper-ZbkGiQ] Terminating...
Closing sink sndx-FBFXmg


In [134]:
def is_category_url(url):
    """Return True when ``url`` starts with the site's ``/categorie/`` prefix."""
    category_prefix = "https://vod.catalogue-crc.org/categorie/"
    return url.startswith(category_prefix)

def is_serie_url(url):
    """Return True when ``url`` starts with the site's ``/serie/`` prefix."""
    serie_prefix = "https://vod.catalogue-crc.org/serie/"
    return url.startswith(serie_prefix)

def is_recording_url(url):
    """Return True when ``url`` starts with the site's ``/enregistrement/`` prefix."""
    recording_prefix = "https://vod.catalogue-crc.org/enregistrement/"
    return url.startswith(recording_prefix)


class Link:
    """A hyperlink: a target URL together with its visible text."""

    def __init__(self, url, text):
        # Both values are stored exactly as given.
        self.url, self.text = url, text

    def __repr__(self):
        # Rendered in Markdown link syntax: [text](url).
        return f"[{self.text}]({self.url})"


class CategoryMetadata:
    """Plain record holding a category's URL, title and list of categories."""

    def __init__(self, url, title, categories):
        # All three values are stored exactly as given.
        self.url, self.title, self.categories = url, title, categories


class MetadataScrapper(Scrapper):
    """Scrapper that extracts category and recording metadata into YAML files.

    Files are written under ``<outdir>/categories`` and ``<outdir>/recordings``.
    The attributes ``id``, ``profile_id``, ``profile_path`` and ``headless``, as
    well as the ``get_text`` / ``get_first`` helpers, are expected to come from
    the ``Scrapper`` base class (not shown here).

    Every method works on ``self.page``; nothing depends on notebook-level
    variables such as ``page`` or ``scrapper``.
    """

    def __init__(self, profile_id, outdir, headless=True):
        """Store the output layout; no browser is started until ``open()``.

        Args:
            profile_id: Profile identifier forwarded to ``Scrapper.__init__``.
            outdir: Root directory for the generated YAML files.
            headless: Forwarded to ``Scrapper.__init__``.
        """
        super().__init__(profile_id, headless)
        self.outdir = Path(outdir).resolve()
        self.categories_dir = self.outdir / "categories"
        self.recordings_dir = self.outdir / "recordings"
        # Crawl state, created by initialize().
        self.queue = None
        self.visited = None
        # URLs already pushed onto the queue, so that a link seen on several
        # pages is only queued once.
        self._queued = None

    async def open(self):
        """Launch the browser and keep a handle on its first page."""
        self.log(f"Starting with profile [{self.profile_id}]...")
        self.browser = await launch(
            headless=self.headless,
            executablePath=CHROMIUM_PATH,
            userDataDir=self.profile_path)
        pages = await self.browser.pages()
        self.page = pages[0]

    async def close(self):
        """Close the browser started by ``open()``."""
        self.log("Terminating...")
        await self.browser.close()

    async def crawl(self):
        """Seed the crawl state from the categories page.

        TODO: the processing loop below is still disabled, so this currently
        only runs ``initialize()``.
        """
        await self.initialize()

        #while not self.queue.empty():
        #    url = self.queue.get()
        #    await self.process_resource_url(url)
        #    self.visited.add(url)
        #    self.queue.task_done()

    async def initialize(self):
        """Reset the crawl state, create the output directories and queue the
        resource URLs found on the categories page."""
        self.log("Initializing...")
        self.queue = Queue()
        self.visited = set()
        self._queued = set()

        self.categories_dir.mkdir(parents=True, exist_ok=True)
        self.recordings_dir.mkdir(parents=True, exist_ok=True)

        await self.page.goto(CATEGORIES_URL)
        await self.queue_resource_urls()

    async def queue_resource_urls(self):
        """Queue each resource URL of the current page that has been neither
        visited nor queued before, and log how many were added."""
        new = 0
        for url in await self.extract_resource_urls():
            if url in self.visited or url in self._queued:
                continue
            self._queued.add(url)
            self.queue.put(url)
            new += 1
        self.log(f"Found {new} new resource URLs.")

    async def process_resource_url(self, url):
        """Navigate to ``url``, queue the links it exposes and extract the
        metadata matching the kind of page (category/series or recording)."""
        self.log(f"Scrapping [{url}]...")
        await self.page.goto(url)

        await self.queue_resource_urls()

        if is_category_url(url) or is_serie_url(url):
            await self.extract_category_metadata()
        elif is_recording_url(url):
            await self.extract_recording_metadata()

    async def extract_category_metadata(self):
        """Write the current page's category metadata to ``categories_dir``.

        The title is taken from the last breadcrumb item.
        """
        url = await self.page.evaluate('() => window.location.href')
        breadcrumbs = await self._breadcrumb_items()
        title = await self.get_text(breadcrumbs[-1])
        categories = await self.extract_parent_categories()

        data = {
            "url": url,
            "title": title,
            "categories": [{ "url": l.url, "title": l.text } for l in categories],
        }

        filepath = self.categories_dir / safe_filename(f"{title}.yaml")
        self._write_yaml(filepath, data)

        self.log(f"Extracted category metadata in file [{filepath}].")

    async def extract_parent_categories(self):
        """Return the breadcrumb links of the current page as ``Link`` objects.

        The first two breadcrumb items and the last one are skipped
        (presumably the site root entries and the current page - confirm
        against the site's markup).
        """
        breadcrumbs = await self._breadcrumb_items()
        categories = []
        for elem in breadcrumbs[2:-1]:
            a = (await elem.xpath(".//a"))[0]
            url = await (await a.getProperty("href")).jsonValue()
            text = await (await a.getProperty("textContent")).jsonValue()
            categories.append(Link(url, text))

        return categories

    async def extract_recording_metadata(self):
        """Write the current page's recording metadata to ``recordings_dir``.

        The detail fields are read by position from the active ``#details``
        list: the first ``dd`` is the code and the last four are date, place,
        authors and duration.
        """
        url = await self.page.evaluate('() => window.location.href')
        categories = await self.extract_parent_categories()
        title = await self.get_first("//h1[preceding-sibling::*[2][self::h3]]")
        subtitle = await self.get_first("//h2[preceding-sibling::*[3][self::h3]]")

        details = await self.page.xpath("//ul[@id='details']//li[@class='uk-active']//dd")
        code = (await self.get_text(details[0])).strip()
        date = (await self.get_text(details[-4])).strip()
        place = (await self.get_text(details[-3])).strip()
        # NOTE(review): this only splits if get_text() returns markup that still
        # contains "<br/>"; confirm against Scrapper.get_text.
        authors = [s.strip() for s in (await self.get_text(details[-2])).split("<br/>")]
        duration_str = (await self.get_text(details[-1])).strip()

        summary = await self.get_first("//li[@class='PromptuairePublication']//p")

        data = {
            "url": url,
            "code": code,
            "title": title,
            "subtitle": subtitle,
            "categories": [{ "url": l.url, "title": l.text } for l in categories],
            "date": date,
            "place": place,
            "authors": authors,
            "duration": duration_str,
            # total_seconds() rather than .seconds, which drops whole days.
            "duration-seconds": int(parse_duration(duration_str).total_seconds()),
            "summary": summary,
        }

        filepath = self.recordings_dir / safe_filename(f"{code}-{title}.yaml")
        self._write_yaml(filepath, data)

        self.log(f"Extracted recording metadata in file [{filepath}].")

    async def extract_resource_urls(self):
        """Return the ``href`` of every anchor on the current page that is a
        category, series or recording URL (duplicates are kept)."""
        urls = []
        for elem in await self.page.xpath("//a"):
            urls.append(await (await elem.getProperty("href")).jsonValue())

        return [u for u in urls if is_category_url(u) or is_serie_url(u) or is_recording_url(u)]

    def log(self, msg):
        """Print ``msg`` prefixed with this scrapper's identifier."""
        print(f"[MetadataScrapper-{self.id}] {msg}")

    async def _breadcrumb_items(self):
        """Return the ``li`` element handles of the page's breadcrumb list."""
        return await self.page.xpath("//ul[@class='uk-breadcrumb']//li")

    def _write_yaml(self, filepath, data):
        """Dump ``data`` to ``filepath`` as YAML, keeping the key order."""
        # `open` here is the builtin: method bodies do not see the class-level
        # `open` coroutine defined above.
        with open(filepath, "w", encoding="utf-8") as f:
            yaml.dump(data, f, sort_keys=False)

In [131]:
profile = "sndx-profile-1"
outdir = Path("/home/maxime/.local/share/sndx")

scrapper = MetadataScrapper(profile, outdir, headless=False)
await scrapper.open()
browser = scrapper.browser
page = scrapper.page

[MetadataScrapper-Tap3KQ] Starting with profile [sndx-profile-1]...


In [130]:
# Close the browser held by the current `scrapper` instance.
await scrapper.close()

[MetadataScrapper-LejD1g] Terminating...


In [132]:
# Run the crawl entry point. With the processing loop commented out in crawl(), this
# only initializes the state and queues the URLs found on the categories page.
await scrapper.crawl()

[MetadataScrapper-Tap3KQ] Initializing...
[MetadataScrapper-Tap3KQ] Found 48 new resource URLs.


In [133]:
# Extract metadata from whichever page the browser is currently showing (the method
# reads window.location.href); presumably a recording page opened by hand beforehand.
await scrapper.extract_recording_metadata()

[MetadataScrapper-Tap3KQ] Extracted recording metadata in file [/home/maxime/.local/share/sndx/recordings/PI_4_28.1-Face_aux_plaies_du_Christ..yaml].


In [104]:
# Report the crawl state: number of URLs waiting in the queue and number already visited.
print(f"Queued: {scrapper.queue.qsize()}")
print(f"Visited: {len(scrapper.visited)}")

Queued: 48
Visited: 0


In [128]:
# Try out the XPath used for the recording summary on the current page and display the result.
summary = await scrapper.get_first("//li[@class='PromptuairePublication']//p")
summary

'C’est un véritable roman d’aventure que la vie du frère Flavien Laplante, missionnaire au Bangladesh\xa0! Dans cette première partie, frère Pierre raconte la jeunesse, l’affermissement de la vocation et les débuts missionnaires de ce religieux canadien, bien selon le cœur du pape François. Mais quel est le secret de ce travailleur acharné, ami des humbles, bâtisseur et directeur d’école... terreur des pirates\xa0? L’amour de la Sainte Vierge.'

In [49]:
# Close the browser held by the current `scrapper` instance.
await scrapper.close()

[MetadataScrapper-PPuNfg] Terminating...


In [126]:
# Inspect the raw element handles matched by the summary XPath, using the notebook-level `page`.
await page.xpath("//li[@class='PromptuairePublication']//p")


[<pyppeteer.element_handle.ElementHandle at 0x7fffd90f7fe0>]