In [1]:
# Sanity check: print the path of the Python interpreter running this notebook.
import sys
print(sys.executable)

/home/maxime/dev/python/sndx/.venv/bin/python3.12


In [47]:
import os
import time
import asyncio
import sndx
from sndx import Scrapper
from pathlib import Path
from datetime import timedelta
from pyppeteer import launch
from queue import Queue

# Base address of the site being scraped.
ROOT_URL = "https://vod.catalogue-crc.org"
# Page the crawl is seeded from.
CATEGORIES_URL = f"{ROOT_URL}/categorie.html"
# Chromium executable handed to pyppeteer's launch(); None when the variable is unset.
CHROMIUM_PATH = os.environ.get("CHROMIUM_PATH")

In [4]:
# Smoke test: open a sink and a recording scrapper, idle for three seconds,
# then let both context managers shut down on exit.
profile = "sndx-profile-1"

with sndx.Sink() as sink:
    # headless=False is passed so the scrapper runs with a visible browser window.
    async with sndx.RecordingScrapper(profile, sink, headless=False) as scrapper:
        print("Doing stuff")
        await asyncio.sleep(3)
        print("Stuff finished")

Opening sink sndx-FBFXmg
[RecordingScrapper-ZbkGiQ] Starting with profile [sndx-profile-1] and sink [sndx-FBFXmg]...
Doing stuff
Stuff finished
[RecordingScrapper-ZbkGiQ] Terminating...
Closing sink sndx-FBFXmg


In [78]:
def is_category_url(url):
    """Tell whether `url` points below the site's `/categorie/` section."""
    prefix = "https://vod.catalogue-crc.org/categorie/"
    return url.startswith(prefix)

def is_serie_url(url):
    """Tell whether `url` points below the site's `/serie/` section."""
    prefix = "https://vod.catalogue-crc.org/serie/"
    return url.startswith(prefix)

def is_recording_url(url):
    """Tell whether `url` points below the site's `/enregistrement/` section."""
    prefix = "https://vod.catalogue-crc.org/enregistrement/"
    return url.startswith(prefix)


class Link:
    """A hyperlink: a target URL together with its visible text."""

    def __init__(self, url, text):
        # Both values are stored as given.
        self.url, self.text = url, text

    def __repr__(self):
        # Rendered in Markdown link form.
        return f"[{self.text}]({self.url})"


class CategoryMetadata:
    """Plain record holding a category's URL, title and list of categories."""

    def __init__(self, url, title, categories):
        # Stored verbatim: no validation and no copy of `categories`.
        self.url, self.title, self.categories = url, title, categories


class MetadataScrapper(Scrapper):
    """Scrapper that collects metadata from the catalogue site.

    The crawl is seeded from CATEGORIES_URL; links to category, serie and
    recording pages found on each loaded page are pushed onto `self.queue`.
    Three sub-directories of `outdir` ("categories", "series", "recordings")
    are created when the crawl is initialized.
    """

    def __init__(self, profile_id, outdir, headless=True):
        """Store the output locations; no browser is started here.

        Args:
            profile_id: forwarded to the base `Scrapper`.
            outdir: root output directory; a leading "~" is expanded.
            headless: forwarded to the base `Scrapper`.
        """
        super().__init__(profile_id, headless)
        # expanduser() must come first: resolve() alone treats "~" as a
        # literal directory name relative to the working directory.
        self.outdir = Path(outdir).expanduser().resolve()
        self.categories_dir = self.outdir / "categories"
        self.series_dir = self.outdir / "series"
        self.recordings_dir = self.outdir / "recordings"
        # Crawl state; (re)created by initialize().
        self.queue = None
        self.visited = None
        self._queued = None

    async def open(self):
        """Launch the browser with this scrapper's profile and keep its first page."""
        self.log(f"Starting with profile [{self.profile_id}]...")
        self.browser = await launch(
            headless=self.headless,
            executablePath=CHROMIUM_PATH,
            userDataDir=self.profile_path)
        pages = await self.browser.pages()
        self.page = pages[0]

    async def close(self):
        """Close the browser started by open()."""
        self.log("Terminating...")
        await self.browser.close()

    async def crawl(self):
        """Prepare the crawl state and queue the links of the categories page."""
        await self.initialize()

        # TODO: the processing loop below is still disabled, so crawl()
        # currently only seeds the queue.
        #while not self.queue.empty():
        #    url = self.queue.get()
        #    await self.process_resource_url(url)
        #    self.queue.task_done()

    async def initialize(self):
        """Reset crawl state, create output directories and seed the queue."""
        self.log("Initializing...")
        self.queue = Queue()
        self.visited = set()
        self._queued = set()

        for directory in (self.categories_dir, self.series_dir, self.recordings_dir):
            directory.mkdir(parents=True, exist_ok=True)

        await self.page.goto(CATEGORIES_URL)
        await self.queue_resource_urls()

    async def queue_resource_urls(self):
        """Queue resource URLs of the current page that were not seen before.

        A URL is skipped when it was already processed (`self.visited`) or is
        already waiting in the queue (`self._queued`), so each URL is
        enqueued at most once per crawl.
        """
        new = 0
        for url in await self.extract_resource_urls():
            if url in self.visited or url in self._queued:
                continue
            self._queued.add(url)
            self.queue.put(url)
            new += 1
        self.log(f"Found {new} new resource URLs.")

    async def process_resource_url(self, url):
        """Load `url`, queue the links it contains and run the matching extractor."""
        self.log(f"Scrapping [{url}]...")
        await self.page.goto(url)
        # Record the page as processed so later link scans do not re-queue it.
        self.visited.add(url)

        await self.queue_resource_urls()

        # TODO: the extractors' results are discarded here; nothing is written
        # to the output directories yet.
        if is_category_url(url):
            await self.extract_category_metadata()
        elif is_serie_url(url):
            # NOTE(review): extract_serie_metadata is not defined in this class —
            # presumably inherited or still to be written; confirm.
            await self.extract_serie_metadata()
        elif is_recording_url(url):
            await self.extract_recording_metadata()

    async def extract_category_metadata(self):
        """Read the breadcrumb of the current page into a CategoryMetadata.

        The last breadcrumb item supplies the title; the items between the
        first two and the last one supply the `categories` links.

        Returns:
            CategoryMetadata for the page currently loaded in `self.page`.
        """
        page_url = await self.page.evaluate('() => window.location.href')
        breadcrumbs = await self.page.xpath("//ul[@class='uk-breadcrumb']//li")
        title = await self.get_text(breadcrumbs[-1])

        categories = []
        for elem in breadcrumbs[2:-1]:
            a = (await elem.xpath(".//a"))[0]
            # Distinct name: the page URL read above must not be overwritten.
            href = await (await a.getProperty("href")).jsonValue()
            text = await (await a.getProperty("textContent")).jsonValue()
            categories.append(Link(href, text))

        return CategoryMetadata(page_url, title, categories)

    async def extract_recording_metadata(self):
        """Build a RecordingMetadata from the recording page currently loaded.

        NOTE(review): `RecordingMetadata` and `parse_duration` are not defined
        in the visible part of this notebook — confirm they are in scope.
        """
        url = await self.page.evaluate('() => window.location.href')
        # The headings are located by their position relative to each other
        # (an h3 followed two siblings later by the h1, and so on).
        category = await self.get_first("//h3[following-sibling::*[2][self::h1]]")
        title = await self.get_first("//h1[preceding-sibling::*[2][self::h3]]")
        subtitle = await self.get_first("//h2[preceding-sibling::*[3][self::h3]]")

        # The <dd> entries are read by position: code, date, place, authors, duration.
        details = await self.page.xpath("//ul[@id='details']//dd")
        code = (await self.get_text(details[0])).strip()
        date = (await self.get_text(details[1])).strip()
        place = (await self.get_text(details[2])).strip()
        # NOTE(review): splitting on a literal "<br/>" only separates authors if
        # get_text returns markup rather than plain text — confirm.
        authors = [s.strip() for s in (await self.get_text(details[3])).split("<br/>")]
        duration_str = (await self.get_text(details[4])).strip()

        return RecordingMetadata(
            url=url,
            category=category,
            title=title,
            subtitle=subtitle,
            code=code,
            date=date,
            place=place,
            authors=authors,
            duration=parse_duration(duration_str))

    async def extract_resource_urls(self):
        """Return the hrefs of all anchors on the current page that point to a
        category, serie or recording page (in document order, duplicates kept)."""
        urls = []
        for elem in await self.page.xpath("//a"):
            urls.append(await (await elem.getProperty("href")).jsonValue())

        return [u for u in urls if is_category_url(u) or is_serie_url(u) or is_recording_url(u)]

    def log(self, msg):
        """Print `msg` prefixed with this scrapper's identifier."""
        print(f"[MetadataScrapper-{self.id}] {msg}")

In [80]:
profile = "sndx-profile-1"
outdir = Path("~/.local/share/sndx")

scrapper = MetadataScrapper(profile, outdir, headless=False)
await scrapper.open()
browser = scrapper.browser
page = scrapper.page

[MetadataScrapper-Z5yM2g] Starting with profile [sndx-profile-1]...


In [79]:
# NOTE(review): in file order this closes the browser opened in the previous cell
# before the crawl cell below runs; the execution counts suggest it was run earlier,
# against a previous instance — confirm before relying on Restart & Run All.
await scrapper.close()

[MetadataScrapper-CqyiNQ] Terminating...


In [81]:
# Seed the crawl: creates the output directories, loads the categories page
# and queues the resource links found on it.
await scrapper.crawl()

[MetadataScrapper-Z5yM2g] Initializing...
[MetadataScrapper-Z5yM2g] Found 48 new resource URLs.


In [83]:
# Run the category extractor against whatever page the browser currently shows
# (the method itself does not navigate).
await scrapper.extract_category_metadata()

<__main__.CategoryMetadata at 0x7fffd9171130>

In [59]:
# Inspect crawl state: number of URLs waiting in the queue and number recorded as visited.
print(f"Queued: {scrapper.queue.qsize()}")
print(f"Visited: {len(scrapper.visited)}")

Queued: 48
Visited: 0


In [76]:
# Scratch version of the breadcrumb parsing, run directly against the
# notebook-level `page` and `scrapper` handles.
breadcrumbs = await page.xpath("//ul[@class='uk-breadcrumb']//li")
# The last item supplies the title; items between the first two and the last are kept as parents.
current = breadcrumbs[-1]
parents = breadcrumbs[2:-1]
title = await scrapper.get_text(current)

# Turn each parent breadcrumb's anchor into a Link(href, text).
links = []
for elem in parents:
    a = (await elem.xpath(".//a"))[0]
    url = await (await a.getProperty("href")).jsonValue()
    text = await (await a.getProperty("textContent")).jsonValue()
    links.append(Link(url, text))

# Display the collected links as the cell output.
links

[[I - La Contre-Réforme dans l’actualité](https://vod.catalogue-crc.org/categorie/i-la-contre-reforme-dans-actualite.html)]

In [49]:
# Shut down the browser once the session is finished.
await scrapper.close()

[MetadataScrapper-PPuNfg] Terminating...
