In [5]:
import fitz
import pandas as pd
from datetime import datetime
import logging


logging.basicConfig(level=logging.DEBUG)
# filename = 'data/fwil_dhi_me_results_semi.pdf'
# filename = 'data/fwil_dhi_me_results_qr.pdf'
# filename = 'data/fwil_dhi_me_results_f.pdf'
# filename = 'data/fwil_dhi_me_results_tt.pdf'
# filename = 'data/leog_dhi_me_results_qr.pdf'
filename = 'data/leog_dhi_me_results_semi.pdf'



In [3]:
doc = fitz.open(filename)
page = doc[0]
text = page.get_text("text")

# Identify where the table starts 25 for the semis and 24 for the qualifiers 24 for the finals and 34 for the time trials
text.split('\n')[24:64]


['1. p',
 '28 O CALLAGHAN Oisin *',
 'O CALLAGHAN Oisin *',
 'O CALLAGHAN Oisin *',
 'O CALLAGHAN Oisin *',
 'YT MOB',
 '10017486353',
 'IRL',
 '2003',
 '54.490 (85)',
 '0:33.636 (14)',
 '1:18.014 (8)',
 '1:57.530 (10)',
 '2:36.230 (2)',
 '3:09.336',
 '3:09.336',
 '3:09.336',
 '3:09.336',
 '+0.000',
 '50',
 '2. P',
 '10 SHAW Luca',
 'SHAW Luca',
 'SHAW Luca',
 'SHAW Luca',
 'CANYON CLLCTV FACTORY TEAM',
 '10008813442',
 'USA',
 '1996',
 '58.632 (1)',
 '0:33.248 (5)',
 '1:17.662 (5)',
 '1:55.046 (1)',
 '2:35.613 (1)',
 '3:09.396',
 '3:09.396',
 '3:09.396',
 '3:09.396',
 '+0.060',
 '40']

In [6]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime, timedelta

filename = filename
table_start_line = 25

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return "N/A", "N/A"

def calculate_sector_times(split_times: List[str]) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])  # Skip "0:" part in "0:XX.XXX" string
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    return sector_times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[13],
                    'gap': rider_info[17] if len(rider_info) > 17 else 'N/A',
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[14],
                    'gap': rider_info[18] if len(rider_info) > 18 else 'N/A',
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times(rider_data['split_times'])
            rider_data['sector_times'] = sector_times
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')

# Rank the sector times correctly
for i in range(4):
    df[f'sector_{i+1}_rank'] = df[f'sector_{i+1}'].apply(
        lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
        if x not in ["N/A", "-"]
        else timedelta.max
    ).rank(method="min").astype(int)

# Handle the final sector (sector_5)
df[f'sector_5'] = df.apply(lambda row: 
    str(datetime.strptime(row["final_time"], "%M:%S.%f") - datetime.strptime(row["split_4"], "%M:%S.%f"))[2:]
    if row["final_time"] not in ["DNF", "DNS", "N/A", "-"] and row["split_4"] not in ["DNF", "DNS", "N/A", "-"]
    else "N/A", axis=1)

df[f'sector_5_rank'] = df[f'sector_5'].apply(
    lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
    if x not in ["N/A", "-"]
    else timedelta.max
).rank(method="min").astype(int)

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv', index=False)


Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,...,sector_3,split_4,split_4_rank,sector_4,sector_1_rank,sector_2_rank,sector_3_rank,sector_4_rank,sector_5,sector_5_rank
0,1,P,1,BRUNI Loic,SPECIALIZED GRAVITY,10007544358,FRA,1994,55.881,26,...,00:37.990000,2:32.866,1,00:38.461000,4,1,5,2,00:32.280000,2
1,2,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,59.232,1,...,00:36.981000,2:32.886,2,00:39.471000,2,4,1,7,00:33.019000,12
2,3,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,56.446,13,...,00:38.460000,2:34.147,4,00:39.075000,3,3,16,5,00:31.985000,1
3,4,,28,O CALLAGHAN Oisin *,YT MOB,10017486353,IRL,2003,54.711,46,...,00:38.799000,2:33.859,3,00:38.520000,1,10,28,3,00:32.638000,5
4,5,P,19,PIERRON Amaury,COMMENCAL/MUC-OFF BY RIDING,10008827283,FRA,1996,56.446,13,...,00:38.438000,2:35.342,5,00:39.109000,16,7,15,6,00:33.085000,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,58,,65,ZWAR Oliver,CANYON CLLCTV FMD,10008106857,SWE,1995,55.575,30,...,00:38.317000,2:38.023,22,00:40.320000,47,26,12,24,00:46.814000,60
58,59,,110,MEEK Toby *,MEEKBOYZ RACING,10064077170,NZL,2004,54.381,51,...,00:39.462000,2:38.120,23,00:40.601000,34,5,50,32,00:50.795000,62
59,60,P,7,COULANGES Benoit,DORVAL AM COMMENCAL,10008194359,FRA,1994,56.446,13,...,00:38.688000,3:00.375,61,00:39.473000,62,36,23,8,00:33.880000,38
60,61,,173,EDMONDSON Jamie,BNC RACING,10023865620,GBR,2001,55.158,39,...,00:39.154000,2:51.491,60,00:53.110000,32,30,42,61,00:48.107000,61


In [4]:
import fitz
from typing import List, Dict, Union
import pandas as pd
from datetime import datetime, timedelta

filename = 'data/leog_dhi_me_results_qr.pdf'
table_start_line = 24

def extract_time_and_rank(data_string: str) -> (str, str):
    if "(" in data_string:
        time, rank = data_string.split()[0], data_string.split()[-1].strip("()")
        return time, rank
    else:
        return "N/A", "N/A"

def calculate_sector_times(split_times: List[str]) -> List[str]:
    sector_times = []
    previous_time = "0:00.000"

    for split_time in split_times:
        try:
            delta = datetime.strptime(split_time, "%M:%S.%f") - datetime.strptime(previous_time, "%M:%S.%f")
            sector_times.append(str(delta)[2:])  # Skip "0:" part in "0:XX.XXX" string
            previous_time = split_time
        except ValueError:
            sector_times.append("N/A")

    return sector_times

def extract_rider_info_all_pages(filename: str, table_start_line: int = 25) -> List[Dict[str, Union[str, List[str]]]]:
    doc = fitz.open(filename)
    riders_info = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        lines = text.split('\n')
        line_start = table_start_line

        while line_start < len(lines):
            rider_info = lines[line_start:line_start + 20]
            if len(rider_info) < 19:
                break

            # No team case
            if rider_info[5].isdigit():
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[8])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[9:13]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': 'N/A',
                    'uci_id': rider_info[5],
                    'country': rider_info[6],
                    'birth_year': rider_info[7],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[13],
                    'gap': rider_info[17] if len(rider_info) > 17 else 'N/A',
                    'points': rider_info[18] if len(rider_info) > 18 else 'N/A'
                }
                next_offset = 19
            # With team case
            else:
                speed_trap, speed_trap_rank = extract_time_and_rank(rider_info[9])
                split_times, split_time_ranks = zip(*(extract_time_and_rank(s) for s in rider_info[10:14]))
                rider_data = {
                    'rank': rider_info[0].split()[0].replace('.', ''),
                    'protected': rider_info[0].split()[1] if len(rider_info[0].split()) > 1 else '',
                    'rider_number': rider_info[1].split()[0],
                    'name': ' '.join(rider_info[1].split()[1:]),
                    'team': rider_info[5],
                    'uci_id': rider_info[6],
                    'country': rider_info[7],
                    'birth_year': rider_info[8],
                    'speed_trap': speed_trap,
                    'speed_trap_rank': speed_trap_rank,
                    'split_times': list(split_times),
                    'split_time_ranks': list(split_time_ranks),
                    'final_time': rider_info[14],
                    'gap': rider_info[18] if len(rider_info) > 18 else 'N/A',
                    'points': rider_info[19] if len(rider_info) > 19 else 'N/A'
                }
                next_offset = 20

            if rider_data['final_time'] in ['DNF', 'DNS']:
                break

            sector_times = calculate_sector_times(rider_data['split_times'])
            rider_data['sector_times'] = sector_times
            riders_info.append(rider_data)
            line_start += next_offset

    return riders_info

# Generate DataFrame
riders_info = extract_rider_info_all_pages(filename, table_start_line)

df = pd.DataFrame(riders_info)
for i in range(4):
    df[f'split_{i+1}'] = df['split_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'split_{i+1}_rank'] = df['split_time_ranks'].apply(lambda x: x[i] if len(x) > i else 'N/A')
    df[f'sector_{i+1}'] = df['sector_times'].apply(lambda x: x[i] if len(x) > i else 'N/A')

# Rank the sector times correctly
for i in range(4):
    df[f'sector_{i+1}_rank'] = df[f'sector_{i+1}'].apply(
        lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
        if x not in ["N/A", "-"]
        else timedelta.max
    ).rank(method="min").astype(int)

# Handle the final sector (sector_5)
df[f'sector_5'] = df.apply(lambda row: 
    str(datetime.strptime(row["final_time"], "%M:%S.%f") - datetime.strptime(row["split_4"], "%M:%S.%f"))[2:]
    if row["final_time"] not in ["DNF", "DNS", "N/A", "-"] and row["split_4"] not in ["DNF", "DNS", "N/A", "-"]
    else "N/A", axis=1)

df[f'sector_5_rank'] = df[f'sector_5'].apply(
    lambda x: timedelta(minutes=int(x.split(":")[0]), seconds=float(x.split(":")[1]))
    if x not in ["N/A", "-"]
    else timedelta.max
).rank(method="min").astype(int)

df.drop(columns=['split_times', 'split_time_ranks', 'sector_times'], inplace=True)

# Display and Save
display(df)
file_prefix = filename.split('/')[-1].split('.')[0]
df.to_csv(f'data/{file_prefix}.csv", index=False')


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


OverflowError: days=-1000000000; must have magnitude <= 999999999

Exception ignored in: 'pandas._libs.algos.rank_sorted_1d'
Traceback (most recent call last):
  File "algos.pyx", line 77, in pandas._libs.algos.are_diff
OverflowError: days=-1000000000; must have magnitude <= 999999999


Unnamed: 0,rank,protected,rider_number,name,team,uci_id,country,birth_year,speed_trap,speed_trap_rank,...,sector_3,split_4,split_4_rank,sector_4,sector_1_rank,sector_2_rank,sector_3_rank,sector_4_rank,sector_5,sector_5_rank
0,1,p,28,O CALLAGHAN Oisin *,YT MOB,10017486353,IRL,2003,54.490,85,...,00:39.516000,2:36.230,2,00:38.700000,14,3,41,1,00:33.106000,2
1,2,P,10,SHAW Luca,CANYON CLLCTV FACTORY TEAM,10008813442,USA,1996,58.632,1,...,00:37.384000,2:35.613,1,00:40.567000,5,5,1,7,00:33.783000,9
2,3,P,8,BROSNAN Troy,CANYON CLLCTV FACTORY TEAM,10007307417,AUS,1993,55.939,19,...,00:39.406000,2:36.923,3,00:39.980000,4,7,36,3,00:33.414000,5
3,4,p,6,KERR Bernard,PIVOT FACTORY RACING,10006413094,GBR,1991,55.824,25,...,00:38.922000,2:37.815,7,00:40.618000,9,13,13,8,00:33.004000,1
4,5,,110,MEEK Toby *,MEEKBOYZ RACING,10064077170,NZL,2004,55.441,45,...,00:39.634000,2:37.711,6,00:40.398000,6,2,50,4,00:33.474000,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,166,,32,KERR Henry,CANYON CLLCTV PIRELLI,10023914524,IRL,2000,54.527,84,...,00:40.158000,4:47.258,168,02:45.134000,85,52,85,0,00:37.910000,143
166,167,,185,GENTLE Jobe *,TEAM HIGH COUNTRY,10114036416,AUS,2004,53.589,116,...,00:40.203000,2:52.398,128,00:47.423000,120,120,88,152,02:40.769000,0
167,168,,81,MARINI Hugo *,SCOTT DOWNHILL FACTORY,10072798379,FRA,2005,54.290,97,...,00:39.931000,4:26.087,167,02:23.942000,53,96,69,168,01:11.299000,168
168,169,,104,PERRAUDIN Marius *,,10088322524,SUI,2002,52.241,142,...,00:42.549000,6:05.469,169,01:18.760000,118,118,160,164,00:42.295000,163
