In [1]:
import sys
print(sys.executable)

/home/maxime/dev/python/sndx/.venv/bin/python3.12


In [None]:
import os
import subprocess
import time
import asyncio
import secrets
import re
from pathlib import Path
from datetime import timedelta
from pyppeteer import launch

chromium_path = os.environ.get("CHROMIUM_PATH")

login_url = "https://vod.catalogue-crc.org/connexion.html"
email = "genefrawoune-3470@yopmail.com"
pwd = (Path(os.getcwd()) / '..' / 'pwd.secret').read_text(encoding="utf-8").strip()

wait_seconds = 2  # Wait duration between navigation operations

output_dir = Path(os.getcwd()) / '..' / 'output'
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def parse_duration(s: str) -> int:
    parts = list(map(int, s.split(":")))
    if len(parts) == 3:
        h, m, sec = parts
    elif len(parts) == 2:
        h = 0
        m, sec = parts
    else:
        raise ValueError(f"Invalid duration format: {s}")

    seconds = h * 3600 + m * 60 + sec
    return timedelta(seconds=seconds)


def safe_filename(s):
    s = s.strip()
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    return s or "file"

In [4]:
class Sink:
    def __init__(self):
        self.id = secrets.token_urlsafe(4)[:6]
        self.name = f"sndx-{self.id}"
        self.module_id = None

    def __repr__(self):
        return f"Sink(id={self.id!r}, module_id={self.module_id!r})"

    def __enter__(self):
        self.module_id = subprocess.check_output(
            ["pactl", "load-module", "module-null-sink", self.name],
            text=True).strip()

    def __exit__(self, exc_type, exc_value, traceback):
        subprocess.Popen(["pactl", "unload-module", self.module_id])

In [6]:
class RecordingMetadata:
    def __init__(self, url, category, title, subtitle, code, date, place, authors, duration):
        self.url = url
        self.category = category
        self.title = title
        self.subtitle = subtitle
        self.code = code
        self.date = date
        self.place = place
        self.authors = authors
        self.duration = duration


In [7]:
class AudioRecording:
    def __init__(self, sink, filename):
        self.sink = sink
        self.filename = filename
        self.subprocess = None

    def start(self):
        self.subprocess = subprocess.Popen([
            "ffmpeg",
            "-y",
            "-f", "pulse",
            "-i", f"{self.sink.name}.monitor",
            "-ac", "2",
            "-vn",
            "-b:a", "192k",
            self.filename])
        print(f"Started recording from {self.sink.name} in {self.filename}.")

    def stop(self, exc_type, exc_value, traceback):
        self.subprocess.terminate()
        print(f"Stopped recording from {self.sink.name} in {self.filename}.")

In [13]:
class Agent:
    def __init__(self, sink, profile_id):
        self.sink = sink
        self.profile_id = profile_id
        self.profile_path = f"/path/{profile_id}"
        self.browser = None
        self.page = None

    
    async def __aenter__(self):
        self.browser = await launch(
            headless=False,
            executablePath=chromium_path,
            userDataDir=self.profile_path,
            args= [
                "--autoplay-policy=no-user-gesture-required",
                f"--audio-output-sink={self.sink.name}",
            ])
        pages = await self.browser.pages()
        self.page = pages[0]

    
    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.browser.close()


    async def scrap_recording(self, url):
        await self.goto_logged_in(url)
        await self.wait_a_bit()
        
        metadata = await self.extract_metadata()

        audio_file = output_dir / safe_filename(f"{metadata.title or "no-title"}.mp3")
        recording = AudioRecording(self.sink, audio_file)
        recording.start()

        await self.play_recording()

        await asyncio.sleep(metadata.duration.seconds)
        recording.stop()

    
    async def get_first(self, xpath):
        elements = await self.page.xpath(xpath)
        if elements:
            return await self.get_text(elements[0])
        else:
            return None
    
    
    async def get_text(self, element):
        return await self.page.evaluate('(el) => el.textContent', element)
        
    
    async def extract_metadata(self):
        url = await self.page.evaluate('() => window.location.href')
        category = await self.get_first("//h3[following-sibling::*[2][self::h1]]")
        title = await self.get_first("//h1[preceding-sibling::*[2][self::h3]]")
        subtitle = await self.get_first("//h2[preceding-sibling::*[3][self::h3]]")
    
        details = await self.page.xpath("//ul[@id='details']//dd")
        code = (await self.get_text(details[0])).strip()
        date = (await self.get_text(details[1])).strip()
        place = (await self.get_text(details[2])).strip()
        authors = [s.strip() for s in (await self.get_text(details[3])).split("<br/>")]
        duration_str = (await self.get_text(details[4])).strip()
        
        return RecordingMetadata(
            url=url,
            category=category,
            title=title,
            subtitle=subtitle,
            code=code,
            date=date,
            place=place,
            authors=authors,
            duration=parse_duration(duration_str))

    
    async def wait_a_bit(self):
        await asyncio.sleep(wait_seconds)
    
    
    async def is_logged_in(self):
        elements = await self.page.xpath("//button[contains(text(), 'Se connecter')]")
        if len(elements) == 1:
            return False
        else:
            return True
    
    
    async def login(self):
        await self.page.goto(login_url)
        await self.wait_a_bit()
        
        await self.page.type("input[name='email']", email)
        await self.page.type("input[name='password']", pwd)
        elements = await self.page.xpath("//button[text()='Connexion']")
        await elements[0].click()
    
    
    async def goto_logged_in(self, url):
        await self.page.goto(url)
        await self.wait_a_bit()
        
        if not await self.is_logged_in():
            await self.login()
            await self.wait_a_bit()
            await self.page.goto(url)
    
    
    async def play_recording(self):
        elements = await self.page.xpath("//a[contains(text(), 'Audio bas débit')]")
        await elements[0].click()


In [9]:
sink = Sink()
sink.__enter__()
print(sink)

Sink(id='uhrPkA', module_id='536870916')


In [17]:
profile1 = "01"
agent = Agent(sink, profile1)
await agent.__aenter__()
print(agent)

<__main__.Agent object at 0x7fffd9987200>


In [None]:
video_url = "https://vod.catalogue-crc.org/enregistrement/la-nouvelle-droite.html"
await agent.scrap_recording(video_url)

Started recording from sndx-uhrPkA in /home/maxime/dev/python/sndx/notebooks/../output/LA_NOUVELLE_DROITE.mp3.


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 14.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.101 / 61. 19.101
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
[aist#0:0/pcm_s16le @ 0x4c3180] Guessed Channel Layout: stereo
Input #0, pulse, from 'sndx-uhrPkA.monitor':
  Duration: N/A, start: 1763316421.728367, bitrate: 1536 kb/s
  Stream #0:0: Audio: pcm_s16le, 48000 Hz, stereo, s16, 1536 kb/s
[out#0/mp3 @ 0x4c3300] Error opening output /home/maxime/dev/python/sndx/notebooks/../output/LA_NOUVELLE_DROITE.mp3: No such file or directory
Error opening output file /home/maxime/dev/python/sndx/notebooks/../output/LA_NOUVELLE_DROITE.mp3.
Error opening output files: No such file or directory


In [12]:
await agent.__aexit__(None, None, None)

## Helpers

In [129]:
metadata = await extract_metadata(page)

print(f"URL: {metadata.url}")
print(f"Category: {metadata.category}")
print(f"Title: {metadata.title}")
print(f"Subtitle: {metadata.subtitle}")
print(f"Code: {metadata.code}")
print(f"Date: {metadata.date}")
print(f"Place: {metadata.place}")
print(f"Authors: {metadata.authors}")
print(f"Duration: {metadata.duration}")

URL: https://vod.catalogue-crc.org/enregistrement/la-nouvelle-droite.html
Category: Les grands affrontements du siècle
Title: LA NOUVELLE DROITE
Subtitle: Alain de Benoist, surhomme indo-européen.
Code: AS 1
Date: 25 octobre 1979
Place: Paris
Authors: ['Abbé Georges de Nantes']
Duration: 1:01:43


In [120]:
elements = await page.xpath("//ul[@id='details']//dd")
(await get_text(elements[4])).strip()
[s.strip() for s in (await get_text(elements[3])).split("<br/>")]

['Abbé Georges de Nantes']

In [99]:
await text(first(page.xpath("//h2[preceding-sibling::*[3][self::h3]]")))


'Alain de Benoist, surhomme indo-européen.'

## Record
Cliquer sur "Audio bas débit" lance la lecture audio.

In [78]:
video_url = "https://vod.catalogue-crc.org/enregistrement/la-nouvelle-droite.html"
await goto_logged_in(page, video_url)

In [43]:
elements = await page.xpath("//span[contains(@class, 'vjs-duration-display')]")
text = await page.evaluate('(el) => el.textContent', elements[0])
duration = parse_duration(text.strip())
duration

datetime.timedelta(seconds=3703)

In [62]:
sink.__exit__(None, None, None)