In [None]:
from dataclasses import dataclass
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup


@dataclass(frozen=True, kw_only=True)
class EventItemData:
    taxonomy: str
    title: str
    href: str
    datetimes: list[pd.Timestamp]


@dataclass(frozen=True, kw_only=True)
class EventDetailsData:
    duration: pd.Timedelta
    location: str


def get_soup(url):
    response = requests.get(url)
    assert response.status_code == 200
    soup = BeautifulSoup(response.content, "html.parser")
    return soup


def parse_event_items(soup: BeautifulSoup) -> list[EventItemData]:
    event_items = soup.find_all("div", class_="event__item")
    event_item_data_list = []
    for event_item in event_items:
        times = event_item.find_all("time")
        datetimes = [pd.Timestamp(t.attrs["datetime"]) for t in times]
        href = event_item.find("a", class_="event__item__absolutelink").attrs["href"]
        data = EventItemData(
            taxonomy=list(
                event_item.find("ul", class_="event__taxonomy").findChildren("li")
            )[0].get_text(strip=True),
            datetimes=datetimes,
            href=href,
            title=event_item.find("h3", class_="event__item__title").get_text(
                strip=True
            ),
        )
        event_item_data_list.append(data)
    return event_item_data_list


def parse_event_page(soup: BeautifulSoup) -> EventDetailsData:
    numbers = list(
        map(
            int,
            re.findall(
                r"\d+",
                soup.find(
                    "span", class_="practical-info__item__data", itemprop="duration"
                ).get_text(strip=True),
            ),
        )
    )
    duration = (
        pd.Timedelta(hours=numbers[0], minutes=numbers[1])
        if len(numbers) == 2
        else pd.Timedelta(hours=numbers[0])
    )
    data = {
        "duration": duration,
        "location": soup.find(
            "span", class_="practical-info__item__data", itemprop="location"
        ).get_text(strip=True),
    }
    return EventDetailsData(**data)


def get_events_details(event_items: list[EventItemData]) -> list[EventDetailsData]:
    event_details_data_list = []
    for event_item_data in event_items:
        soup = get_soup(event_item_data.href)
        event_details_data_list.append(parse_event_page(soup))
    return event_details_data_list


# URL de la page des événements
url = "https://onct.toulouse.fr/agenda/"

soup = get_soup(url)
event_items = parse_event_items(soup)
events_details = get_events_details(event_items)

EventItemData(taxonomy='Grand concert symphonique', title='Dima Slobodeniouk / Mikhaïl Pletnev', href='https://onct.toulouse.fr/agenda/dima-slobodeniouk-mikhail-pletnev/', datetimes=[Timestamp('2024-11-08 20:00:00+0100', tz='UTC+01:00'), Timestamp('2024-11-07 20:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Concert en famille', title='Élodie Fondacci raconte… Le Violon magique', href='https://onct.toulouse.fr/agenda/elodie-fondacci-le-violon-magique-2/', datetimes=[Timestamp('2024-11-17 16:00:00+0100', tz='UTC+01:00'), Timestamp('2024-11-17 11:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Grand concert symphonique', title='Tarmo Peltokoski', href='https://onct.toulouse.fr/agenda/tarmo-peltokoski-241123/', datetimes=[Timestamp('2024-11-23 20:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Ciné-concert', title='Le Dictateur', href='https://onct.toulouse.fr/agenda/le-dictateur-6464578-2/', datetimes=[Timestamp('2024-12-01 16:00:00+0100', tz='UTC+01:00'), Timestamp

In [5]:
[print(i) for i in event_items]

EventItemData(taxonomy='Grand concert symphonique', title='Dima Slobodeniouk / Mikhaïl Pletnev', href='https://onct.toulouse.fr/agenda/dima-slobodeniouk-mikhail-pletnev/', datetimes=[Timestamp('2024-11-08 20:00:00+0100', tz='UTC+01:00'), Timestamp('2024-11-07 20:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Concert en famille', title='Élodie Fondacci raconte… Le Violon magique', href='https://onct.toulouse.fr/agenda/elodie-fondacci-le-violon-magique-2/', datetimes=[Timestamp('2024-11-17 16:00:00+0100', tz='UTC+01:00'), Timestamp('2024-11-17 11:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Grand concert symphonique', title='Tarmo Peltokoski', href='https://onct.toulouse.fr/agenda/tarmo-peltokoski-241123/', datetimes=[Timestamp('2024-11-23 20:00:00+0100', tz='UTC+01:00')])
EventItemData(taxonomy='Ciné-concert', title='Le Dictateur', href='https://onct.toulouse.fr/agenda/le-dictateur-6464578-2/', datetimes=[Timestamp('2024-12-01 16:00:00+0100', tz='UTC+01:00'), Timestamp

[None, None, None, None, None, None, None, None, None, None, None, None]

In [6]:
[print(i) for i in events_details]

EventDetailsData(duration=Timedelta('0 days 01:30:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:15:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:45:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 02:05:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:45:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 02:00:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:30:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:40:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:10:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:10:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:00:00'), location='Halle aux Grains')
EventDetailsData(duration=Timedelta('0 days 01:30:00'), location=

[None, None, None, None, None, None, None, None, None, None, None, None]