In [9]:
import fitz
import pandas as pd
from datetime import datetime
import logging


logging.basicConfig(level=logging.DEBUG)
filename = 'data/fwil_dhi_me_results_semi.pdf'
# filename = 'data/fwil_dhi_me_results_qr.pdf'

In [20]:
doc = fitz.open(filename)
page = doc[0]
text = page.get_text("text")

# Identify where the table starts 25 for the semis and 24 for the qualifiers
text.split('\n')[25:44]


['1. P',
 '8 BROSNAN Troy',
 'BROSNAN Troy',
 'BROSNAN Troy',
 'BROSNAN Troy',
 'CANYON CLLCTV FACTORY TEAM',
 '10007307417',
 'AUS',
 '1993',
 '45.455 (5)',
 '0:48.167 (6)',
 '2:43.249 (3)',
 '3:19.215 (2)',
 '3:50.532 (1)',
 '4:14.909',
 '4:14.909',
 '4:14.909',
 '4:14.909',
 '+0.000']

In [29]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime

filename = 'data/fwil_dhi_me_results_semi.pdf'
table_start_line = 25

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return "N/A", "N/A"

def calculate_sector_times_and_ranks(split_times: List[str]) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])  # Skip "0:" part in "0:XX.XXX" string
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    return sector_times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[13],
                    'gap': rider_info[17] if len(rider_info) > 17 else 'N/A',
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[14],
                    'gap': rider_info[18] if len(rider_info) > 18 else 'N/A',
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times_and_ranks(rider_data['split_times'])
            rider_data['sector_times'] = sector_times
            rider_data['sector_time_ranks'] = rider_data['split_time_ranks']
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}_rank'] = df['sector_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times', 'sector_time_ranks'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv', index=False)


Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,...,sector_2,sector_2_rank,split_3,split_3_rank,sector_3,sector_3_rank,split_4,split_4_rank,sector_4,sector_4_rank
0,1,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,45.455,5,...,01:55.082000,3,3:19.215,2,00:35.966000,2,3:50.532,1,00:31.317000,1
1,2,P,4,ILES Finn,SPECIALIZED GRAVITY,10090907774,CAN,1999,45.512,3,...,01:55.565000,2,3:19.966,4,00:37.222000,4,3:50.904,2,00:30.938000,2
2,3,P,1,BRUNI Loic,SPECIALIZED GRAVITY,10007544358,FRA,1994,43.636,46,...,01:54.413000,1,3:19.197,1,00:36.541000,1,3:51.072,3,00:31.875000,3
3,4,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,44.253,27,...,01:56.120000,5,3:19.843,3,00:36.056000,3,3:51.396,4,00:31.553000,4
4,5,,57,MEIER-SMITH Luke *,GIANT FACTORY OFF-ROAD TEAM - DH,10049212326,AUS,2002,45.000,10,...,01:55.476000,4,3:21.382,8,00:37.821000,8,3:52.056,7,00:30.674000,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,57,,24,PENE Tuhoto-Ariki,MS INTENSE RACING,10022183274,NZL,2001,33.473,59,...,02:58.301000,58,4:33.089,58,00:44.605000,58,5:21.609,58,00:48.520000,58
57,58,,78,MENOYO BUSQUETS Pau *,COMMENCAL / SCHWALBE,10079233725,ESP,2003,28.191,62,...,02:06.219000,56,3:55.120,56,00:59.715000,56,5:05.861,57,01:10.741000,57
58,59,,33,SUAREZ ALONSO Angel,FRAMEWORKS RACING,10008831529,ESP,1995,41.594,56,...,03:16.171000,61,5:59.405,60,00:39.751000,60,6:32.812,59,00:33.407000,59
59,60,,115,VERNON Taylor,ZERODE RACING,10008728667,GBR,1996,32.491,60,...,03:40.136000,60,5:35.049,59,01:05.313000,59,7:12.856,60,01:37.807000,60


In [49]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime, timedelta

filename = 'data/fwil_dhi_me_results_qr.pdf'
table_start_line = 24

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return data_string, "N/A"

def calculate_sector_times(split_times: List[str], final_time: str) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    try:
        if final_time not in ['DNF', 'DNS', "N/A", "-"]:
            delta = datetime.strptime(final_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])
        else:
            sector_times.append("N/A")
    except ValueError:
        sector_times.append("N/A")

    return sector_times

def convert_to_timedelta(time_str: str) -> timedelta:
    """ Convert a string in the format 'MM:SS.sss' to a timedelta object. """
    try:
        dt = datetime.strptime(time_str, "%M:%S.%f")
        return timedelta(minutes=dt.minute, seconds=dt.second, microseconds=dt.microsecond)
    except ValueError:
        return timedelta(days=9999)  # Assign a very large timedelta value for invalid or "N/A" times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                final_time = rider_info[13]
                gap = rider_info[14]
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': final_time,
                    'gap': gap,
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                final_time = rider_info[14]
                gap = rider_info[15]
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': final_time,
                    'gap': gap,
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times(rider_data['split_times'], rider_data['final_time'])
            rider_data['sector_times'] = sector_times
            rider_data['sector_time_ranks'] = rider_data['split_time_ranks'] + ["N/A"]
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)

# Create the necessary columns
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}_rank'] = df['sector_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')

# Final sector
df[f'sector_5'] = df.apply(
    lambda row: str(datetime.strptime(row["final_time"], "%M:%S.%f") - datetime.strptime(row["split_4"], "%M:%S.%f"))[2:]
    if row["final_time"] not in ['DNF', 'DNS', 'N/A', '-'] and row["split_4"] not in ['DNF', 'DNS', 'N/A', '-']
    else "N/A", axis=1)

df[f'sector_5_rank'] = df[f'sector_5'].apply(convert_to_timedelta).rank(method='min').astype(int)

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times', 'sector_time_ranks'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv', index=False)


Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,final_time,gap,points,split_1,split_1_rank,sector_1,sector_1_rank,split_2,split_2_rank,sector_2,sector_2_rank,split_3,split_3_rank,sector_3,sector_3_rank,split_4,split_4_rank,sector_4,sector_4_rank,sector_5,sector_5_rank
0,1,P,1,BRUNI Loic,SPECIALIZED GRAVITY,10007544358,FRA,1994,44.944,18,4:08.331,4:08.331,50,0:47.636,1,00:47.636000,1,2:38.146,1,01:50.510000,1,3:13.355,1,00:35.209000,1,3:44.241,1,00:30.886000,1,00:24.090000,15
1,2,P,4,ILES Finn,SPECIALIZED GRAVITY,10090907774,CAN,1999,45.056,16,4:11.624,4:11.624,40,0:47.848,3,00:47.848000,3,2:40.348,2,01:52.500000,2,3:16.657,3,00:36.309000,3,3:47.685,2,00:31.028000,2,00:23.939000,5
2,3,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,44.527,27,4:12.089,4:12.089,30,0:47.829,2,00:47.829000,2,2:41.145,5,01:53.316000,5,3:17.216,5,00:36.071000,5,3:48.081,5,00:30.865000,5,00:24.008000,7
3,4,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,46.124,2,4:12.240,4:12.240,25,0:47.926,4,00:47.926000,4,2:41.618,6,01:53.692000,6,3:17.437,6,00:35.819000,6,3:48.003,3,00:30.566000,3,00:24.237000,25
4,5,p,17,O CALLAGHAN Oisin *,YT MOB,10017486353,IRL,2003,45.802,4,4:12.317,4:12.317,22,0:48.652,22,00:48.652000,22,2:40.938,4,01:52.286000,4,3:16.538,2,00:35.600000,2,3:48.155,6,00:31.617000,6,00:24.162000,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,126,,140,LANCETT-EDWARDS Josh *,,10124404706,GBR,2005,26.354,134,8:23.027,8:23.027,-,0:52.897,132,00:52.897000,132,3:59.878,129,03:06.981000,129,5:45.941,129,01:46.063000,129,7:25.822,127,01:39.881000,127,00:57.205000,127
126,127,,106,WILLIAMS Preston *,THE ALLIANCE,10088217743,GBR,2003,41.618,110,9:30.980,9:30.980,-,0:49.575,60,00:49.575000,60,7:52.445,133,07:02.870000,133,8:30.599,133,00:38.154000,133,9:04.396,130,00:33.797000,130,00:26.584000,110
127,128,,95,LAMM Nico,,10074168507,GER,1999,28.016,132,9:48.150,9:48.150,-,0:50.720,102,00:50.720000,102,5:24.086,132,04:33.366000,132,7:03.754,131,01:39.668000,131,8:16.303,129,01:12.549000,129,01:31.847000,128
128,129,,143,MURRAY Charles,,10084688054,NZL,1996,38.055,121,13:40.882,13:40.882,-,0:50.505,94,00:50.505000,94,11:24.175,134,10:33.670000,134,12:10.986,134,00:46.811000,134,12:57.837,131,00:46.851000,131,00:43.045000,123
