In [9]:
import fitz
import pandas as pd
from datetime import datetime
import logging


logging.basicConfig(level=logging.DEBUG)
filename = 'data/fwil_dhi_me_results_semi.pdf'
# filename = 'data/fwil_dhi_me_results_qr.pdf'

In [20]:
doc = fitz.open(filename)
page = doc[0]
text = page.get_text("text")

# Identify where the table starts 25 for the semis and 24 for the qualifiers
text.split('\n')[25:44]


['1. P',
 '8 BROSNAN Troy',
 'BROSNAN Troy',
 'BROSNAN Troy',
 'BROSNAN Troy',
 'CANYON CLLCTV FACTORY TEAM',
 '10007307417',
 'AUS',
 '1993',
 '45.455 (5)',
 '0:48.167 (6)',
 '2:43.249 (3)',
 '3:19.215 (2)',
 '3:50.532 (1)',
 '4:14.909',
 '4:14.909',
 '4:14.909',
 '4:14.909',
 '+0.000']

In [3]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime, timedelta

filename = 'data/fwil_dhi_me_results_semi.pdf'
table_start_line = 25

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return "N/A", "N/A"

def calculate_sector_times(split_times: List[str]) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])  # Skip "0:" part in "0:XX.XXX" string
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    return sector_times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[13],
                    'gap': rider_info[17] if len(rider_info) > 17 else 'N/A',
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[14],
                    'gap': rider_info[18] if len(rider_info) > 18 else 'N/A',
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times(rider_data['split_times'])
            rider_data['sector_times'] = sector_times
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')

# Rank the sector times correctly
for i in range(4):
    df[f'sector_{i+1}_rank'] = df[f'sector_{i+1}'].apply(
        lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
        if x not in ["N/A", "-"]
        else timedelta.max
    ).rank(method="min").astype(int)

# Handle the final sector (sector_5)
df[f'sector_5'] = df.apply(lambda row: 
    str(datetime.strptime(row["final_time"], "%M:%S.%f") - datetime.strptime(row["split_4"], "%M:%S.%f"))[2:]
    if row["final_time"] not in ["DNF", "DNS", "N/A", "-"] and row["split_4"] not in ["DNF", "DNS", "N/A", "-"]
    else "N/A", axis=1)

df[f'sector_5_rank'] = df[f'sector_5'].apply(
    lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
    if x not in ["N/A", "-"]
    else timedelta.max
).rank(method="min").astype(int)

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv", index=False')




Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,...,sector_3,split_4,split_4_rank,sector_4,sector_1_rank,sector_2_rank,sector_3_rank,sector_4_rank,sector_5,sector_5_rank
0,1,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,45.455,5,...,00:35.966000,3:50.532,1,00:31.317000,6,2,4,14,00:24.377000,21
1,2,P,4,ILES Finn,SPECIALIZED GRAVITY,10090907774,CAN,1999,45.512,3,...,00:37.222000,3:50.904,2,00:30.938000,1,5,28,5,00:24.018000,12
2,3,P,1,BRUNI Loic,SPECIALIZED GRAVITY,10007544358,FRA,1994,43.636,46,...,00:36.541000,3:51.072,3,00:31.875000,7,1,18,31,00:23.908000,6
3,4,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,44.253,27,...,00:36.056000,3:51.396,4,00:31.553000,3,7,6,21,00:24.014000,11
4,5,,57,MEIER-SMITH Luke *,GIANT FACTORY OFF-ROAD TEAM - DH,10049212326,AUS,2002,45.000,10,...,00:37.821000,3:52.056,7,00:30.674000,4,3,40,3,00:23.811000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,57,,24,PENE Tuhoto-Ariki,MS INTENSE RACING,10022183274,NZL,2001,33.473,59,...,00:44.605000,5:21.609,58,00:48.520000,51,58,57,58,00:37.548000,58
57,58,,78,MENOYO BUSQUETS Pau *,COMMENCAL / SCHWALBE,10079233725,ESP,2003,28.191,62,...,00:59.715000,5:05.861,57,01:10.741000,30,56,59,60,01:22.579000,61
58,59,,33,SUAREZ ALONSO Angel,FRAMEWORKS RACING,10008831529,ESP,1995,41.594,56,...,00:39.751000,6:32.812,59,00:33.407000,61,59,55,56,00:26,56
59,60,,115,VERNON Taylor,ZERODE RACING,10008728667,GBR,1996,32.491,60,...,01:05.313000,7:12.856,60,01:37.807000,45,61,60,61,00:44.338000,59


In [5]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime, timedelta

filename = 'data/fwil_dhi_me_results_qr.pdf'
table_start_line = 24

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return "N/A", "N/A"

def calculate_sector_times(split_times: List[str]) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])  # Skip "0:" part in "0:XX.XXX" string
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    return sector_times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[13],
                    'gap': rider_info[17] if len(rider_info) > 17 else 'N/A',
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[14],
                    'gap': rider_info[18] if len(rider_info) > 18 else 'N/A',
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times(rider_data['split_times'])
            rider_data['sector_times'] = sector_times
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')

# Rank the sector times correctly
for i in range(4):
    df[f'sector_{i+1}_rank'] = df[f'sector_{i+1}'].apply(
        lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
        if x not in ["N/A", "-"]
        else timedelta.max
    ).rank(method="min").astype(int)

# Handle the final sector (sector_5)
df[f'sector_5'] = df.apply(lambda row: 
    str(datetime.strptime(row["final_time"], "%M:%S.%f") - datetime.strptime(row["split_4"], "%M:%S.%f"))[2:]
    if row["final_time"] not in ["DNF", "DNS", "N/A", "-"] and row["split_4"] not in ["DNF", "DNS", "N/A", "-"]
    else "N/A", axis=1)

df[f'sector_5_rank'] = df[f'sector_5'].apply(
    lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
    if x not in ["N/A", "-"]
    else timedelta.max
).rank(method="min").astype(int)

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv", index=False')


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,...,sector_3,split_4,split_4_rank,sector_4,sector_1_rank,sector_2_rank,sector_3_rank,sector_4_rank,sector_5,sector_5_rank
0,1,P,1,BRUNI Loic,SPECIALIZED GRAVITY,10007544358,FRA,1994,44.944,18,...,00:35.209000,3:44.241,1,00:30.886000,1,1,3,5,00:24.090000,15
1,2,P,4,ILES Finn,SPECIALIZED GRAVITY,10090907774,CAN,1999,45.056,16,...,00:36.309000,3:47.685,2,00:31.028000,3,4,17,11,00:23.939000,5
2,3,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,44.527,27,...,00:36.071000,3:48.081,5,00:30.865000,2,6,9,3,00:24.008000,7
3,4,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,46.124,2,...,00:35.819000,3:48.003,3,00:30.566000,4,9,7,1,00:24.237000,25
4,5,p,17,O CALLAGHAN Oisin *,YT MOB,10017486353,IRL,2003,45.802,4,...,00:35.600000,3:48.155,6,00:31.617000,22,3,4,31,00:24.162000,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,126,,140,LANCETT-EDWARDS Josh *,,10124404706,GBR,2005,26.354,134,...,01:46.063000,7:25.822,127,01:39.881000,125,126,130,0,00:57.205000,127
126,127,,106,WILLIAMS Preston *,THE ALLIANCE,10088217743,GBR,2003,41.618,110,...,00:38.154000,9:04.396,130,00:33.797000,57,129,71,103,00:26.584000,110
127,128,,95,LAMM Nico,,10074168507,GER,1999,28.016,132,...,01:39.668000,8:16.303,129,01:12.549000,98,128,129,126,01:31.847000,128
128,129,,143,MURRAY Charles,,10084688054,NZL,1996,38.055,121,...,00:46.811000,12:57.837,131,00:46.851000,91,130,123,120,00:43.045000,123
