In [None]:
# Scraping the official USC website for the 2025 season

In [None]:
# Import libs
import sys
import os
import git
import pandas as pd

# Put the repository's "api" directory at the front of sys.path
# (presumably so the `script.*` imports below resolve — TODO confirm).
git_root = git.Repo(search_parent_directories=True).working_tree_dir
api_dir = os.path.abspath(os.path.join(git_root, 'api'))
sys.path.insert(0, api_dir)

import script.libs.utils as utils
import script.libs.scraping_utils as scraping_utils
import script.libs.HttpRequests as HttpRequests
from script.configuration import config, oa
from script.libs.getOaLocation import get_or_create_oa_location

from slugify import slugify
import requests
# import requests_cache
from bs4 import BeautifulSoup
import  dateparser, pytz
from urllib.parse import urlparse
from pprint import pprint

In [None]:
# Constants for scraping
# Root URL of the ticketing site (note the trailing slash).
baseURL = "https://billetterie-usconcarneau.tickandlive.com/"
# Catalogue page listing every event. baseURL already ends with "/", so it is
# stripped before joining to avoid a double slash ("...com//catalogue").
allEventsPage = f"{baseURL.rstrip('/')}/catalogue"
# Kick-off time appended to the date scraped from each card (HH:MM:SS).
DEFAULT_TIME_START = "19:30:00"
# Alias of the timeout exposed by scraping_utils; it is passed as the
# `timeout=` argument of requests.get when the catalogue page is downloaded.
# NOTE(review): despite the "get" prefix it is used as a value, not called — confirm its type.
getTimeout= scraping_utils.getTimeout

In [None]:
# Download the catalogue page and parse it with BeautifulSoup.
# A browser-like User-Agent is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0",
}
catalogue_response = requests.get(url=allEventsPage, headers=headers, timeout=getTimeout)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page, which
# would silently produce zero match cards further down.
catalogue_response.raise_for_status()
html_doc = catalogue_response.content
## In case of weird character encoding:
# html_doc = html_doc.decode('ISO-8859-1',errors='replace')
parsed_html = BeautifulSoup(html_doc, 'html.parser')


In [None]:
# Resolve the OA location uid of the stadium (Stade Guy Piriou).
# An access token is requested first; it is reused by the cells that
# create or delete events.
access_token = oa.getToken()

locations_endpoint = f"{config.OA_API_URL}/locations"
locationUid = get_or_create_oa_location(
    "Stade Guy Piriou, Concarneau",
    access_token,
    oa.public_key,
    locations_endpoint,
)
print(locationUid)

In [None]:
# Build one OA event payload per match card found on the catalogue page.
# Each card is expected to expose a date (<span>), a match name (<h4>) and a
# booking link (<a href>). Cards missing any of these, or failing the filters
# below, are reported and skipped.
cardSelector = "div.uk-card.uk-card-default.uk-box-shadow-medium"

dateSelector = "div.uk-padding-small.uk-padding-remove-top > span"
# (sic) misspelled name kept unchanged in case other cells reference it.
opponenetSelector = "div.uk-padding-small.uk-padding-remove-top > h4"
urlSelector = "a"

timezone_paris = pytz.timezone('Europe/Paris')

extractedCards = parsed_html.select(cardSelector)
allMatchEvents = []
for card in extractedCards:
    dateTag = card.select_one(dateSelector)
    nameTag = card.select_one(opponenetSelector)
    linkTag = card.select_one(urlSelector)
    # select_one returns None when nothing matches: skip such cards instead of
    # crashing on `.text` / `['href']`.
    if dateTag is None or nameTag is None or linkTag is None or not linkTag.get("href"):
        print("Card without date, match name or link -> Not a match card, skip")
        continue

    date: str = dateTag.text.strip()
    matchName: str = nameTag.text.strip()
    url: str = linkTag["href"].strip()

    # Filter only match cards
    if not date.startswith(("Vendredi", "Mardi")):
        print(f"date: {date} -> Not a match card, skip")
        continue
    if not matchName.startswith("USC"):
        print(f"matchName: {matchName}  -> Not a match card, skip")
        continue
    if not url.startswith("https://billetterie-usconcarneau.tickandlive.com/reserver/"):
        print(f"url: {url}  -> Not a match card, skip")
        continue

    # Convert the scraped date to a datetime. Matches run 19h30-21h15.
    start_date_str = f"{date} {DEFAULT_TIME_START}"
    parsed_start = dateparser.parse(start_date_str, languages=['fr'])
    # Check for a failed parse BEFORE touching the result.
    if parsed_start is None:
        print(f"Error parsing date: {date}")
        continue
    if parsed_start.tzinfo is None:
        # The scraped time is Paris wall-clock time: attach the zone with
        # localize(). astimezone() on a naive datetime would interpret it in
        # the host's local timezone and shift the kick-off on non-Paris hosts.
        start_date_obj = timezone_paris.localize(parsed_start)
    else:
        start_date_obj = parsed_start.astimezone(timezone_paris)
    end_date_obj = start_date_obj.replace(hour=21, minute=15)

    # Create event object
    title = f"Match {matchName}"
    desc = f"Match: {matchName} le {date} à 19h30"
    longDesc = f"Match de l'Union Sportive Concarnoise: {matchName} le {date} à 19h30"
    event = {
        # External id derived from the date+time string of the match.
        "uid-externe": f"usc-{slugify(start_date_str)}",
        "title": {"fr": title},
        "description": {"fr": desc},
        # NOTE(review): title/description are wrapped as {"fr": ...} while
        # longDescription is a bare string — confirm the API accepts this.
        "longDescription": longDesc,
        "timings": [
            {
                "begin": start_date_obj.isoformat(),
                "end": end_date_obj.isoformat()
            }
        ],
        "locationUid": locationUid,
        "onlineAccessLink": url,
        "keywords": {"fr": ["USC", "Foot", "Thoniers"]},
        "attendanceMode": 3  # 1=offline, 2=online, 3=hybrid
    }
    # Add to list
    allMatchEvents.append(event)

print(f"Total match events extracted: {len(allMatchEvents)}")
# Guard the preview: indexing [0] on an empty list would raise IndexError.
if allMatchEvents:
    print("First event example:")
    pprint(allMatchEvents[0])

In [None]:
# POST the events to the OA API.
# NOTE(review): the slice starts at index 1, so the first extracted event is
# never posted by this cell — confirm this is intended (e.g. it was already
# created separately) before changing it.
postResponses = []  # raw return values of create_event, kept for inspection
for event in allMatchEvents[1:]:
    response = HttpRequests.create_event(access_token, event)
    postResponses.append(response)
    # One line per event so the cell output shows what was sent.
    print(f"Posted: {event['title']['fr']}")
print(f"Events posted: {len(postResponses)}")

    

In [None]:
# CORRECTIONS, ONLY IF NEEDED (disabled by default: correction = False).
# When enabled, the first extracted match is searched on OA by its title and
# the first event returned by the search is DELETED.
# NOTE(review): the original header described this cell as "updating" events
# (online access link, attendance mode), but the only write it performs is
# delete_event — confirm that deletion is the intended correction.
correction = False
if not correction:
    # Checked before touching allMatchEvents so the cell is safe to run even
    # when no event was extracted.
    print("Corrections not executed")
else:
    for event in allMatchEvents[:1]:
        # NOTE(review): this is the {"fr": ...} title dict built during
        # extraction — confirm search_events expects it rather than a plain string.
        searchedTitle = event.get("title")
        try:
            response = HttpRequests.search_events(oa.public_key, searchedTitle)
            foundEvents = response.get("events")
            if not foundEvents:
                raise ValueError(f"No events return for {searchedTitle} found in OA {response}")
            # Kept under a separate name so the local payload in `event` is
            # not overwritten by the remote event.
            remoteEvent = foundEvents[0]
            if not remoteEvent.get("uid"):
                raise ValueError(f"Event returned for {searchedTitle} is empty {foundEvents}")
        except Exception as e:
            # Best effort: report the failed lookup and move on.
            print(f"Error in event search: {e}")
            continue

        eventUID = remoteEvent.get("uid")
        eventTitle = remoteEvent.get("title")
        print(eventUID, eventTitle)

        # EXAMPLE CORRECTION (kept from the original, not executed)
        # event["onlineAccessLink"] = event.get("lien")
        # event["attendanceMode"] = 3  # 1=offline, 2=online, 3=hybrid
        # event["keywords"] = { "fr" : event.get("keywords")}

        HttpRequests.delete_event(
            access_token,
            eventUID
        )
        print("--------")