# Race Program

In [46]:
import pandas as pd
import json

from bs4 import BeautifulSoup
import re
import copy 

from pathlib import Path

# Base directory for all data paths, resolved relative to the current working directory.
ROOT = Path('../')

# NOTE(review): this value is overwritten in the next cell before it is read anywhere in the visible code.
filename = 'TESSERE scp 2025.pdf'

In [23]:
filename = 'FIC - Programma Serie.html'

# Parse the exported race-program page.
# NOTE(review): this file is still decoded with the platform default encoding;
# confirm the charset of the export before pinning one here.
with open(ROOT / 'data/external' / filename, "r") as f:
    soup = BeautifulSoup(f, 'html.parser')

# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the locale default (athlete names contain accented characters).
with open(ROOT / 'data/athletes/athletes.json', encoding='utf-8') as f:
    athletes_data = json.load(f)

In [24]:
# Names of every athlete in the club file; the dictionary keys are not needed here.
club_athletes = [record['name'] for record in athletes_data.values()]
club_athletes

['Amato Federico',
 'Amato Maurizio',
 'Arnone Ciro',
 'Benvenuti Chiara',
 'Bosco Sisto',
 'Bruno Andrea',
 'Capritti Maurizio',
 'Cassina Giorgia',
 'Castiglione Andrea',
 'Cattarinich Irma',
 'Ciaravella Domenico',
 'Cincimino Giuseppe',
 'Civiletto Angelo Maria',
 "D'Amico Angelo",
 'De Caro Davide',
 'Fedele Monica',
 'Francaviglia Luca',
 'Franchina Bruno',
 'Giambanco Benedetto',
 'Li Greci Riccardo',
 'Macaluso Antonino',
 "Magazzu' Stefan",
 'Maniscalco Davide',
 'Marino Alberto',
 'Marino Salvatore',
 'Martucci Francesco Jan',
 'Mascari Giuseppe',
 'Miraglia Luca',
 'Munteanu Federico Andrei',
 'Oliva Salvatore',
 'Palumbo Lorenzo',
 'Puglia Pietro',
 'Rappa Sergio',
 'Sanseverino Giorgio',
 'Scalici Francesco Paolo',
 'Scarpello Martina',
 'Sciabarrà Flavio',
 'Terranova Chiaraluce',
 'Vento Marcello',
 'Zappulla Bianca',
 'Zerilli Laura',
 'Zerilli Mario Pietro']

In [62]:
def extract_race_info(table):
    """Read the header fields of one race from its header table.

    The ``font`` tags inside the first ``td`` of *table* are read by
    position: index 1 holds the race number, index 2 the race day (inside a
    ``b`` tag) and index 3 two ``b`` tags holding the start time and the race
    title. The title is split on non-breaking spaces; its first part is the
    category and its second part the description.

    Returns a dict with the keys ``race_number``, ``race_day``,
    ``race_time``, ``race_category`` and ``race_desc``; all values are
    strings.
    """
    font_tags = table.find('td').find_all('font')
    bold_tags = font_tags[3].find_all('b')
    title_parts = bold_tags[1].text.split("\xa0")
    return {
        'race_number': font_tags[1].text.strip(),
        'race_day': font_tags[2].find('b').text.strip(" \xa0"),
        # Kept as the raw "HH:MM"-style text found in the page.
        'race_time': bold_tags[0].text.strip(" \xa0"),
        'race_category': title_parts[0].strip(),
        'race_desc': title_parts[1].strip(),
    }


def extract_race_criterion(table, race_type):
    """Return the criterion text attached to a race.

    The stripped text of the first ``td`` with class ``t0`` in *table* is
    returned when such a cell exists. Otherwise a fixed sentence is returned
    for the race types "Finale A" and "Finale B", and ``None`` for any other
    race type.
    """
    cell = table.find('td', class_='t0')
    if cell is not None:
        return cell.text.strip()

    # No criterion cell on the page: only the two finals have a fallback text.
    if race_type == "Finale A":
        return "Primo Medaglia d'Oro, Secondo Medaglia d'Argento"
    if race_type == "Finale B":
        return "Primo Medaglia di Bronzo, Secondo Quarto Classificato"
    return None


def extract_race_crews(table):
    """Extract crew number, club and athlete names for every crew of a race.

    Each ``td`` with class ``t1`` in *table* is treated as one crew entry.
    The crew number is the text of the first ``b`` tag inside the entry's
    first ``font`` tag. The ``font`` tags nested in it hold the club name
    (first) and the athlete names (second).

    Returns a list of ``(crew_number, club, athletes)`` tuples, where
    ``athletes`` is a list of names. An entry without nested ``font`` tags is
    reported as ``(crew_number, '-', '-')``.
    """
    # A parenthesised number after the club name, e.g. "(2)" or "(12)", is
    # dropped. NOTE(review): suffixes that also contain text, such as
    # "(1 Misto)", do not match and are left in the club name.
    club_index = re.compile(r'\(\d+\)')
    digits = re.compile(r'\d')

    crews_list = []
    for cell in table.find_all('td', class_='t1'):
        entry = cell.find('font')
        crew_number = entry.find('b').text.strip()
        nested_fonts = entry.find_all('font')
        if not nested_fonts:
            # No club/athlete markup for this entry: keep a placeholder row.
            crews_list.append((crew_number, '-', '-'))
            continue

        club = club_index.sub('', nested_fonts[0].text)
        # Strings starting with '(' or '.' are skipped; digits are removed
        # from the remaining strings before they are stored as names.
        athletes = [
            digits.sub('', name).strip()
            for name in nested_fonts[1].stripped_strings
            if not name.startswith(('(', '.'))
        ]
        crews_list.append((crew_number, club, athletes))
    return crews_list
    

# Try the extractors on the first div only: its last two tables are read as
# the race header and the crew list, respectively.
div = soup.find_all('div')[0]
tables = div.find_all('table')[-2:]
race_infos = extract_race_info(tables[0])
races = tables[1]
# The race description is also what selects the fallback criterion text.
race_type = race_infos['race_desc']
race_infos["criterion"] = extract_race_criterion(races, race_type)
extract_race_crews(races)

[('1', 'PALERMO SC', ['Marino Salvatore', 'Puglia Pietro']),
 ('2', 'PELOROROW', ['Sorrenti Domenico', 'Bardetta Gianluca']),
 ('3', 'ICHNUSA CA', ['Demuro Francesco', 'Virdis Yaroslav']),
 ('4', 'TEVERE REMO', ['Carfagni Filippo', "Massai Niccolo'"]),
 ('5', 'TELIMAR', ['Armetta Francesco', 'Sardo Mattia']),
 ('6', 'PALERMO SC', ['Zerilli Mario Pietro', 'Francaviglia Luca'])]

In [72]:
# Inspect the text of the second 'label10' cell: after replacing ' - ' with a
# space, the last two whitespace-separated tokens are the two regatta dates.
soup.find_all('td', class_='label10')[1].find('b').text.replace(' - ', ' ').split()

['Regata', 'del', '28/03/2025', '30/03/2025']

In [78]:
race_program = []
# The last two whitespace-separated tokens of the header are the first and
# last day of the regatta, written as dd/mm/yyyy.
race_dates = soup.find_all('td', class_='label10')[1].find('b').text.replace(' - ', ' ')
first_day, last_day = race_dates.split()[-2:]
# Parse with an explicit day-first format: left to inference, a date such as
# 05/04/2025 would be read month-first and give a wrong (or empty) range.
race_dates = pd.date_range(
    pd.to_datetime(first_day, format='%d/%m/%Y'),
    pd.to_datetime(last_day, format='%d/%m/%Y'),
)
# Keyed by day-of-month as a string, the same form as the 'race_day' field.
race_days = {str(date.day): date for date in race_dates}
print(race_days)
for div in soup.find_all('div'):
    tables = div.find_all('table')[-2:]
    # Both a header table and a crew table are required; a div with fewer
    # than two tables cannot describe a race and is skipped.
    if len(tables) < 2:
        continue
    race_infos = extract_race_info(tables[0])
    races = tables[1]
    race_type = race_infos['race_desc']
    race_infos["criterion"] = extract_race_criterion(races, race_type)
    race_infos['crews'] = extract_race_crews(races)

    race_program.append(race_infos)

# Spot-check one entry, counted from the end of the program.
race_program[-100]

{'28': Timestamp('2025-03-28 00:00:00'), '29': Timestamp('2025-03-29 00:00:00'), '30': Timestamp('2025-03-30 00:00:00')}


{'race_number': '100',
 'race_day': '30',
 'race_time': '08:15',
 'race_category': 'DUE COASTAL ROWING Senior Beach 17-99 Mix',
 'race_desc': 'Qualificazione 4',
 'criterion': 'Il Primo in semifinale',
 'crews': [('1', 'PELOROROW', ['Fugazzotto Pietro', 'Lo Giudice Irene']),
  ('2', 'MONDELLO', ['Conti Alberto', 'Morello Costanza'])]}

In [64]:
# One entry per crew that contains at least one athlete of the club.
club_races = []
# A set makes the membership test constant-time inside the nested loops.
club_athlete_names = set(club_athletes)
for race in race_program:
    for lane, club, crew_members in race['crews']:
        if not any(member in club_athlete_names for member in crew_members):
            continue

        # Copy the race fields but drop the full crew list: only the club's
        # own crew is kept on the entry.
        club_race = {key: value for key, value in race.items() if key != 'crews'}
        # The crew is stored in the order it appears in the program.
        club_race['crew'] = crew_members
        club_race['crew_lane'] = lane
        club_race['club'] = club
        club_races.append(club_race)

In [65]:
# Inspect the last entry collected for the club.
club_races[-1]

{'race_number': '162',
 'race_day': '30',
 'race_time': '11:55',
 'race_category': 'QUATTRO COASTAL C4x+ Senior Beach 17-99 Mix',
 'race_desc': 'Finale B',
 'criterion': 'Primo Medaglia di Bronzo, Secondo Quarto Classificato',
 'crew': ['Amato Federico',
  'Annella Cristina',
  'Conti Alberto',
  'Morello Costanza',
  'Leo Riccardo -Tim.'],
 'crew_lane': '1',
 'club': 'MONDELLO(1 Misto)'}

In [66]:
# Inspect the first race of the parsed program.
race_program[0]

{'race_number': '1',
 'race_day': '28',
 'race_time': '13:00',
 'race_category': 'DUE COASTAL ROWING Universitari M',
 'race_desc': "Time Trial R1 - Start every 1' minute",
 'criterion': 'Primi quattro tempi alle qualificazioni',
 'crews': [('1', 'PALERMO SC', ['Marino Salvatore', 'Puglia Pietro']),
  ('2', 'PELOROROW', ['Sorrenti Domenico', 'Bardetta Gianluca']),
  ('3', 'ICHNUSA CA', ['Demuro Francesco', 'Virdis Yaroslav']),
  ('4', 'TEVERE REMO', ['Carfagni Filippo', "Massai Niccolo'"]),
  ('5', 'TELIMAR', ['Armetta Francesco', 'Sardo Mattia']),
  ('6', 'PALERMO SC', ['Zerilli Mario Pietro', 'Francaviglia Luca'])]}