In [1]:
import sys
# Print the path of the Python interpreter this kernel is running on.
print(sys.executable)

/home/maxime/dev/python/sndx/.venv/bin/python3.12


In [14]:
import os
import subprocess
import time
import asyncio
import secrets
from pathlib import Path
from datetime import timedelta
from pyppeteer import launch

# --- Configuration -----------------------------------------------------------
# Chromium binary to launch; None when CHROMIUM_PATH is not set.
chromium_path = os.environ.get("CHROMIUM_PATH")
login_url = "https://vod.catalogue-crc.org/connexion.html"
# The account e-mail can be overridden with ACCOUNT_EMAIL; the default is the
# address this notebook has always used.
email = os.environ.get("ACCOUNT_EMAIL", "genefrawoune-3470@yopmail.com")
wait_seconds = 2  # Wait duration between navigation operations

# Password lookup: the secret file next to the working directory wins when it
# exists (same result as before); otherwise fall back to ACCOUNT_PWD. The
# environment value used to be read and then unconditionally overwritten, so a
# missing file crashed even when the variable was set.
_pwd_file = Path(os.getcwd()) / '..' / 'pwd.secret'
if _pwd_file.is_file():
    pwd = _pwd_file.read_text(encoding="utf-8").strip()
else:
    pwd = os.environ.get("ACCOUNT_PWD")
if not pwd:
    # Fail here rather than later, in the middle of the login form.
    raise RuntimeError(
        f"No account password found: create {_pwd_file} or set ACCOUNT_PWD")

In [49]:
def parse_duration(s: str) -> timedelta:
    """Convert a ``H:MM:SS`` or ``MM:SS`` string into a ``timedelta``.

    Args:
        s: Duration text made of two or three integer fields separated by
            colons, e.g. ``"1:01:43"`` or ``"59:12"``.

    Returns:
        The duration as a ``datetime.timedelta``.

    Raises:
        ValueError: If ``s`` does not have two or three fields, or a field is
            not an integer.
    """
    try:
        parts = [int(field) for field in s.split(":")]
    except ValueError:
        # Same exception type as before, but with the message used for the
        # wrong-field-count case so callers see one consistent error.
        raise ValueError(f"Invalid duration format: {s}") from None

    if len(parts) == 3:
        h, m, sec = parts
    elif len(parts) == 2:
        h = 0
        m, sec = parts
    else:
        raise ValueError(f"Invalid duration format: {s}")

    return timedelta(hours=h, minutes=m, seconds=sec)

In [27]:
class Sink:
    """Context manager around a PulseAudio null sink created with ``pactl``.

    Attributes:
        id: Short random token that makes the sink name unique.
        name: Sink name, ``sndx-<id>``.
        module_id: Text printed by ``pactl load-module`` once the sink is
            loaded; ``None`` while no module is loaded.
    """

    def __init__(self):
        self.id = secrets.token_urlsafe(4)[:6]
        self.name = f"sndx-{self.id}"
        self.module_id = None

    def __repr__(self):
        return f"Sink(id={self.id!r}, module_id={self.module_id!r})"

    def __enter__(self):
        """Load the null-sink module and return this object.

        Raises:
            subprocess.CalledProcessError: If ``pactl`` exits with an error.
        """
        # module-null-sink takes its name as a ``sink_name=`` argument; a bare
        # name is not a key=value pair, so the sink would not be called
        # ``self.name``.
        self.module_id = subprocess.check_output(
            ["pactl", "load-module", "module-null-sink", f"sink_name={self.name}"],
            text=True).strip()
        # Returning self makes ``with Sink() as sink:`` bind the sink.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Unload the module loaded by ``__enter__``; no-op if none is loaded."""
        if self.module_id is None:
            return
        # Wait for pactl to finish (best effort: a failure to unload is not
        # raised) instead of leaving an unreaped child process behind.
        subprocess.run(["pactl", "unload-module", self.module_id], check=False)
        self.module_id = None

In [None]:
async def wait_a_bit():
    """Sleep asynchronously for the notebook-wide ``wait_seconds`` delay."""
    pause = wait_seconds
    await asyncio.sleep(pause)


async def is_logged_in(page):
    """Report whether the current page looks like a logged-in session.

    The page counts as logged out only when it contains exactly one button
    whose text includes 'Se connecter'; any other count is treated as
    logged in.
    """
    login_buttons = await page.xpath("//button[contains(text(), 'Se connecter')]")
    return len(login_buttons) != 1


async def login(page):
    """Open the login page, fill in the credentials and submit the form.

    Reads ``login_url``, ``email`` and ``pwd`` from the notebook's
    configuration. Raises ``IndexError`` when no 'Connexion' button is found.
    """
    await page.goto(login_url)
    await wait_a_bit()

    # (CSS selector, value) pairs, typed in this order.
    form_fields = (
        ("input[name='email']", email),
        ("input[name='password']", pwd),
    )
    for selector, value in form_fields:
        await page.type(selector, value)

    submit_buttons = await page.xpath("//button[text()='Connexion']")
    submit = submit_buttons[0]
    await submit.click()


async def goto_logged_in(page, url):
    """Navigate to ``url``, logging in and navigating again if needed.

    After the first navigation the session is checked with ``is_logged_in``;
    when it is not logged in, ``login`` is run and ``url`` is loaded a second
    time.
    """
    await page.goto(url)
    await wait_a_bit()

    if await is_logged_in(page):
        return

    # Not authenticated: log in, give the site a moment, then retry the URL.
    await login(page)
    await wait_a_bit()
    await page.goto(url)


async def start_recording(page):
    """Click the first link whose text contains 'Audio bas débit'.

    Raises ``IndexError`` when the page has no such link.
    """
    audio_links = await page.xpath("//a[contains(text(), 'Audio bas débit')]")
    low_bitrate_link = audio_links[0]
    await low_bitrate_link.click()

In [None]:
class RecordingMetadata:
    """Plain record holding the fields scraped from one recording page.

    Every constructor argument is stored unchanged on the attribute of the
    same name: ``url``, ``category``, ``title``, ``subtitle``, ``code``,
    ``date``, ``place``, ``authors`` and ``duration``.
    """

    def __init__(self, url, category, title, subtitle, code, date, place, authors, duration):
        # Where the recording lives and how it is headed.
        self.url, self.category = url, category
        self.title, self.subtitle = title, subtitle
        # Entries taken from the page's details list.
        self.code, self.date, self.place = code, date, place
        self.authors, self.duration = authors, duration


async def first(promise):
    """Await ``promise`` and return its first item, or ``None`` if it is empty."""
    found = await promise
    return found[0] if found else None


async def get_text(element):
    """Return the ``textContent`` of a pyppeteer element handle.

    The JavaScript is evaluated in the element's own execution context. The
    previous version went through a notebook-global ``page``, so it raised
    ``NameError`` until some cell defined ``page`` and ignored the page that
    callers were actually working with.

    Args:
        element: Element handle, e.g. an item returned by ``page.xpath``.

    Returns:
        The element's text content as returned by the browser.
    """
    return await element.executionContext.evaluate('(el) => el.textContent', element)


async def text(promise):
    """Await ``promise`` and return the text of the resulting element.

    Returns ``None`` without calling ``get_text`` when the awaited value is
    falsy (for example when ``first`` found nothing).
    """
    handle = await promise
    if not handle:
        return None
    return await get_text(handle)
    

async def extract_metadata(page):
    """Scrape the metadata of the recording currently shown in ``page``.

    The category, title and subtitle come from the h3/h1/h2 headings; the
    code, date, place, authors and duration come, in that order, from the
    first five ``<dd>`` elements under ``ul#details``.

    Args:
        page: pyppeteer page already showing a recording.

    Returns:
        A ``RecordingMetadata``; ``authors`` is a list of names and
        ``duration`` is the result of ``parse_duration``.

    Raises:
        IndexError: If the details list has fewer than five entries.
        ValueError: If the duration text cannot be parsed.
    """
    url = await page.evaluate('() => window.location.href')
    category = await text(first(page.xpath("//h3[following-sibling::*[2][self::h1]]")))
    title = await text(first(page.xpath("//h1[preceding-sibling::*[2][self::h3]]")))
    subtitle = await text(first(page.xpath("//h2[preceding-sibling::*[3][self::h3]]")))

    details = await page.xpath("//ul[@id='details']//dd")
    if len(details) < 5:
        # Same exception type as plain indexing would raise, with a message
        # that says what was expected.
        raise IndexError(
            f"expected at least 5 detail fields on {url}, found {len(details)}")

    code = (await get_text(details[0])).strip()
    date = (await get_text(details[1])).strip()
    place = (await get_text(details[2])).strip()

    # textContent never contains markup, so splitting it on "<br/>" could never
    # separate two authors. Work on a detached copy of the element in which
    # every <br> is replaced by the literal text "<br/>", then split on that.
    authors_text = await page.evaluate(
        """(el) => {
            const clone = el.cloneNode(true);
            clone.querySelectorAll('br').forEach((br) => br.replaceWith('<br/>'));
            return clone.textContent;
        }""",
        details[3])
    authors = [name.strip() for name in authors_text.split("<br/>") if name.strip()]

    duration_str = (await get_text(details[4])).strip()

    return RecordingMetadata(
        url=url,
        category=category,
        title=title,
        subtitle=subtitle,
        code=code,
        date=date,
        place=place,
        authors=authors,
        duration=parse_duration(duration_str))

In [41]:
class AudioRecording:
    """Context manager that records a sink's monitor source with ffmpeg.

    Args:
        sink: Object whose ``name`` attribute is the sink to record; ffmpeg
            reads from ``<name>.monitor``.
        output_path: File ffmpeg writes to (overwritten because of ``-y``).
            Defaults to ``"audio.mp3"``, the path that used to be hard-coded.

    Attributes:
        subprocess: The ffmpeg ``Popen`` object, ``None`` before ``__enter__``.
    """

    def __init__(self, sink, output_path="audio.mp3"):
        self.sink = sink
        self.output_path = output_path
        self.subprocess = None

    def __enter__(self):
        """Start ffmpeg and return this object."""
        self.subprocess = subprocess.Popen([
            # Use the sink given to the constructor, not a global named `sink`.
            "ffmpeg", "-y", "-f", "pulse", "-i", f"{self.sink.name}.monitor",
            "-ac", "2", "-vn",
            "-b:a", "192k", str(self.output_path)
        ])
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop ffmpeg and wait for it to exit; no-op if never started."""
        proc = self.subprocess
        if proc is None:
            return
        proc.terminate()
        try:
            # Let ffmpeg finish writing the file before control returns.
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

In [40]:
class Agent:
    """Async context manager owning one Chromium instance for one profile.

    Args:
        sink: Object whose ``name`` is passed to Chromium as the audio output
            sink.
        profile_id: Identifier used to build the browser's user-data
            directory.

    Attributes:
        profile_path: User-data directory handed to Chromium.
        browser: The pyppeteer browser, ``None`` while not launched.
        page: First page of the launched browser, ``None`` while not launched.
    """

    def __init__(self, sink, profile_id):
        self.sink = sink
        self.profile_id = profile_id
        # NOTE(review): "/path" looks like a placeholder root - confirm the
        # intended profile directory.
        self.profile_path = f"/path/{profile_id}"
        self.browser = None
        self.page = None

    async def __aenter__(self):
        """Launch Chromium, keep its first page and return this agent."""
        self.browser = await launch(
            headless=False,
            executablePath=chromium_path,
            userDataDir=self.profile_path,
            args= [
                "--autoplay-policy=no-user-gesture-required",
                # NOTE(review): confirm this Chromium build honours this flag.
                f"--audio-output-sink={self.sink.name}",
            ])
        pages = await self.browser.pages()
        self.page = pages[0]
        # Returning self makes ``async with Agent(...) as agent:`` usable.
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the browser if one was launched."""
        if self.browser is None:
            return
        # Browser.close() is a coroutine: without ``await`` it never runs and
        # the browser stays open.
        await self.browser.close()
        self.browser = None
        self.page = None

    async def scrap_recording(self, url):
        """Open ``url`` (logging in if needed) and return its metadata.

        Returns:
            The ``RecordingMetadata`` produced by ``extract_metadata``; it was
            previously computed and then dropped.
        """
        await goto_logged_in(self.page, url)
        await wait_a_bit()
        metadata = await extract_metadata(self.page)
        return metadata

In [31]:
# Enter the Sink context by hand rather than with a `with` block; the matching
# `sink.__exit__(...)` call is in the last cell of the notebook.
sink = Sink()
sink.__enter__()
print(sink)

Sink(id='-IMr5g', module_id='536870916')


In [39]:
# Launch the browser for profile "01" by entering the Agent context by hand.
# NOTE(review): no visible cell calls `agent.__aexit__`, so nothing here closes
# the browser again - confirm that is intended.
profile1 = "01"
agent = Agent(sink, profile1)
await agent.__aenter__()
print(agent)

<__main__.Agent object at 0x7fffd9994ce0>


In [55]:
# `browser` was only ever present as leftover kernel state; bind it to the
# browser launched by the agent so this cell works on a fresh kernel.
browser = agent.browser
browser

<pyppeteer.browser.Browser at 0x7fffd986c590>

In [58]:
page = await browser.newPage()

## Helpers

In [129]:
metadata = await extract_metadata(page)

print(f"URL: {metadata.url}")
print(f"Category: {metadata.category}")
print(f"Title: {metadata.title}")
print(f"Subtitle: {metadata.subtitle}")
print(f"Code: {metadata.code}")
print(f"Date: {metadata.date}")
print(f"Place: {metadata.place}")
print(f"Authors: {metadata.authors}")
print(f"Duration: {metadata.duration}")

URL: https://vod.catalogue-crc.org/enregistrement/la-nouvelle-droite.html
Category: Les grands affrontements du siècle
Title: LA NOUVELLE DROITE
Subtitle: Alain de Benoist, surhomme indo-européen.
Code: AS 1
Date: 25 octobre 1979
Place: Paris
Authors: ['Abbé Georges de Nantes']
Duration: 1:01:43


In [120]:
# Scratch check of the <dd> values read by extract_metadata, which uses
# index 4 as the duration and index 3 as the authors.
elements = await page.xpath("//ul[@id='details']//dd")
# Not the last expression of the cell, so this value is computed but not shown.
(await get_text(elements[4])).strip()
[s.strip() for s in (await get_text(elements[3])).split("<br/>")]

['Abbé Georges de Nantes']

In [99]:
# Scratch check of the XPath that extract_metadata uses for the subtitle.
await text(first(page.xpath("//h2[preceding-sibling::*[3][self::h3]]")))


'Alain de Benoist, surhomme indo-européen.'

## Record
Cliquer sur "Audio bas débit" lance la lecture audio.

In [78]:
# Open the recording page; goto_logged_in logs in first when the session is
# not authenticated.
video_url = "https://vod.catalogue-crc.org/enregistrement/la-nouvelle-droite.html"
await goto_logged_in(page, video_url)

In [43]:
elements = await page.xpath("//span[contains(@class, 'vjs-duration-display')]")
text = await page.evaluate('(el) => el.textContent', elements[0])
duration = parse_duration(text.strip())
duration

datetime.timedelta(seconds=3703)

In [30]:
# Unload the null-sink module that `sink.__enter__()` loaded earlier in the
# notebook (no exception information to pass, hence the three Nones).
sink.__exit__(None, None, None)