In [34]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [10]:
# Module-level Safari WebDriver session. get_rider_urls() and get_races() in
# this cell read this global `driver` rather than taking it as a parameter.
driver = webdriver.Safari()

def get_rider_urls(race, year):
    """Collect the first link of every row in a race's results table.

    Args:
        race: Two-item sequence ``[slug, kind]`` as produced by ``get_races``.
            ``kind == 'result'`` selects the ``/result`` page; any other value
            makes the function try ``/stage-1`` and then ``/prologue``.
        year: Edition year inserted into the URL.

    Returns:
        list[str]: ``href`` values from the first candidate page that yields a
        results table (header row excluded). ``[]`` when every candidate page
        is missing or fails.

    Uses the module-level ``driver``.
    """
    base_url = f"https://www.procyclingstats.com/race/{race[0]}/{year}"
    if race[1] == 'result':
        urls_to_try = [f"{base_url}/result"]
    else:
        urls_to_try = [f"{base_url}/stage-1", f"{base_url}/prologue"]

    for url in urls_to_try:
        driver.get(url)

        try:
            h1_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'h1'))
            )

            if h1_element.text == 'Page not found':
                print(f"Page not found for URL: {url}")
                continue

            # The table carries three classes; a compound value is not a valid
            # single class name, so match it with an explicit CSS selector.
            race_table_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.results.basic.moblist10'))
            )

            rider_links = []
            # tr_table[0] is skipped as the header row.
            for row in race_table_tag.find_elements(By.TAG_NAME, 'tr')[1:]:
                anchors = row.find_elements(By.TAG_NAME, 'a')
                # find_elements returns [] instead of raising, so one row
                # without a link no longer discards the whole page.
                href = anchors[0].get_attribute('href') if anchors else None
                if href:
                    rider_links.append(href)

            return rider_links
        except Exception as e:
            print(f"Error fetching data for URL: {url} - {e}")
            continue

    return []
    
def get_races(year):
    """List the races linked from the races listing page of one year.

    Args:
        year: Season year inserted into the listing URL.

    Returns:
        list[list[str]]: One ``[slug, last_segment]`` pair per race link, where
        ``slug`` is the path segment after ``/race/`` and ``last_segment`` is
        the final path segment of the link (e.g. ``'result'`` or ``'gc'``).
        ``[]`` when the page cannot be read.

    Uses the module-level ``driver``.
    """
    year_url = f"https://www.procyclingstats.com/races.php?year={year}&circuit=1&class=&filter=Filter"
    driver.get(year_url)

    try:
        race_table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'basic'))
        )

        races = []
        for row in race_table.find_elements(By.TAG_NAME, 'tr'):
            for td in row.find_elements(By.TAG_NAME, 'td'):
                anchors = td.find_elements(By.TAG_NAME, 'a')
                href = anchors[0].get_attribute('href') if anchors else None
                # Require the '/race/' path segment: a bare 'race' substring
                # also matches other links whose slug merely contains "race".
                # A missing href is skipped instead of raising and losing the
                # whole year.
                if not href or '/race/' not in href:
                    continue
                parts = href.split('/')
                if len(parts) < 5:
                    continue
                # https://host/race/<slug>/... -> parts[4] is the slug.
                races.append([parts[4], parts[-1]])
        return races

    except Exception as e:
        print(f"Error fetching data for URL: {year_url} - {e}")
        return []

# Crawl every season from 2010 through 2024 and gather the rider links of each race.
flat_url = []
try:
    for year in range(2010, 2025):
        races = get_races(year)
        print('year:', year, 'races:', races)
        for race in races:
            flat_url.extend(get_rider_urls(race, year))
finally:
    # Release the browser even when the crawl is interrupted part-way.
    driver.quit()

# De-duplicate, drop missing hrefs, and sort so the Series order is the same
# on every run (plain set iteration order is not reproducible).
unique_URLs = pd.Series(sorted({url for url in flat_url if url}))

print(len(unique_URLs))

year: 2010 races: [['milano-sanremo', 'result'], ['gent-wevelgem', 'result'], ['itzulia-basque-country', 'gc'], ['tour-de-romandie', 'gc'], ['dauphine', 'gc'], ['tour-de-suisse', 'gc'], ['san-sebastian', 'result'], ['tour-de-pologne', 'gc'], ['vuelta-a-espana', 'gc'], ['gp-montreal', 'result']]
Page not found for URL: https://www.procyclingstats.com/race/dauphine/2010/stage-1
year: 2011 races: [['tour-down-under', 'gc'], ['paris-nice', 'gc'], ['tirreno-adriatico', 'gc'], ['milano-sanremo', 'result'], ['volta-a-catalunya', 'gc'], ['gent-wevelgem', 'result'], ['ronde-van-vlaanderen', 'result'], ['itzulia-basque-country', 'gc'], ['paris-roubaix', 'result'], ['amstel-gold-race', 'result'], ['la-fleche-wallone', 'result'], ['liege-bastogne-liege', 'result'], ['tour-de-romandie', 'gc'], ['giro-d-italia', 'gc'], ['dauphine', 'gc'], ['tour-de-suisse', 'gc'], ['tour-de-france', 'gc'], ['san-sebastian', 'result'], ['tour-de-pologne', 'gc'], ['renewi-tour', 'gc'], ['vuelta-a-espana', 'gc'], ['bre

In [None]:
def _text_after_label(driver, label):
    """Return the text directly following ``<b>label</b>``, or None if absent/empty."""
    # The XPath selects a text node. find_element can only return elements,
    # so the expression is evaluated inside the page and its string value is
    # returned instead.
    value = driver.execute_script(
        "return document.evaluate(arguments[0], document, null, "
        "XPathResult.STRING_TYPE, null).stringValue;",
        f"string(//b[text()='{label}']/following-sibling::text()[1])",
    )
    value = (value or '').strip()
    return value or None


def _career_points(driver, href_fragment):
    """Return the text of the 'pnt' div preceding the title link whose href contains ``href_fragment``."""
    xpath = (
        "//div[@class='title']/a[contains(@href, '" + href_fragment + "')]"
        "/../preceding-sibling::div[@class='pnt']"
    )
    return driver.find_element(By.XPATH, xpath).text


# Visit every rider URL and record profile fields; riders whose page cannot be
# parsed are reported and skipped.
driver = webdriver.Safari()
rider_data_list = []
try:
    for i, rider in enumerate(unique_URLs):
        driver.get(rider)

        try:
            name = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'h1'))
            ).text

            nationality = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'black'))
            ).text

            # Weight and height are optional: None when the label is missing.
            weight = _text_after_label(driver, 'Weight:')
            height = _text_after_label(driver, 'Height:')

            rider_data = {
                'url': rider,
                'name': name,
                'nationality': nationality,
                'weight': weight,
                'height': height,
                'one_day': _career_points(driver, 'career-points-one-day-races'),
                'gc': _career_points(driver, 'career-points-gc'),
                'tt': _career_points(driver, 'career-points-time-trial'),
                'sprint': _career_points(driver, 'career-points-sprint'),
                'climber': _career_points(driver, 'career-points-climbers'),
                'hills': _career_points(driver, 'hills'),
            }

            rider_data_list.append(rider_data)
            print(f"Rider {i+1} of {len(unique_URLs)} fetched")

        except Exception as e:
            print(f"Error fetching data for URL: {rider} - {e}")
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

rider_data_df = pd.DataFrame(rider_data_list)

# save df to csv
rider_data_df.to_csv('rider_data.csv', index=False)

Rider 1 of 2779 fetched
Rider 2 of 2779 fetched
Rider 3 of 2779 fetched
Rider 4 of 2779 fetched
Rider 5 of 2779 fetched
Rider 6 of 2779 fetched
Rider 7 of 2779 fetched
Rider 8 of 2779 fetched
Rider 9 of 2779 fetched
Rider 10 of 2779 fetched
Rider 11 of 2779 fetched
Rider 12 of 2779 fetched
Rider 13 of 2779 fetched
Rider 14 of 2779 fetched
Rider 15 of 2779 fetched
Rider 16 of 2779 fetched
Rider 17 of 2779 fetched
Rider 18 of 2779 fetched
Rider 19 of 2779 fetched
Rider 20 of 2779 fetched
Rider 21 of 2779 fetched
Rider 22 of 2779 fetched
Rider 23 of 2779 fetched
Rider 24 of 2779 fetched
Rider 25 of 2779 fetched
Rider 26 of 2779 fetched
Rider 27 of 2779 fetched
Rider 28 of 2779 fetched
Rider 29 of 2779 fetched
Rider 30 of 2779 fetched
Rider 31 of 2779 fetched
Rider 32 of 2779 fetched
Rider 33 of 2779 fetched
Rider 34 of 2779 fetched
Rider 35 of 2779 fetched
Rider 36 of 2779 fetched
Rider 37 of 2779 fetched
Rider 38 of 2779 fetched
Rider 39 of 2779 fetched
Rider 40 of 2779 fetched
Rider 41 

In [13]:
# season stats
# Reads the rider profiles saved in 'rider_data.csv', opens each rider's
# season-statistics page, and writes one row per (rider, season) to
# 'rider_season_data.csv', with the rider's profile columns repeated on each row.
riders_profiles = pd.read_csv('rider_data.csv')
urls = list(riders_profiles['url'])
rider_season_urls = [url + '/statistics/season-statistics' for url in urls]

# Output keys for table cells 1..6, in table order.
season_stat_keys = ['points', 'racedays', 'kms', 'wins', 'top_3s', 'top_10s']
# Profile columns copied onto every season row.
profile_columns = ['name', 'nationality', 'weight', 'height', 'one_day',
                   'gc', 'tt', 'sprint', 'climber', 'hills']

# add season stats to rider_data_df
driver = webdriver.Safari()
rider_season_data_list = []
try:
    for i, rider in enumerate(rider_season_urls):
        driver.get(rider)

        try:
            season_data = []

            season_table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'basic'))
            )
            # The profile values are identical for all rows of one rider, so
            # look them up once. `i` is the positional index from enumerate and
            # matches the default RangeIndex produced by read_csv.
            profile = {col: riders_profiles.loc[i, col] for col in profile_columns}

            for row in season_table.find_elements(By.TAG_NAME, 'tr')[1:]:
                cols = row.find_elements(By.TAG_NAME, 'td')
                if len(cols) != 7:
                    continue
                # Each .text access is a WebDriver round trip; read once per cell.
                texts = [col.text for col in cols]
                # An empty season cell is labelled 'Total'.
                record = {'season': texts[0] if texts[0] != '' else 'Total'}
                for key, text in zip(season_stat_keys, texts[1:]):
                    # '-' cells are stored as 0.
                    record[key] = text if text != '-' else 0
                record.update(profile)
                record['url'] = urls[i]
                season_data.append(record)

            rider_season_data_list.append(season_data)
            print(f"Processed data for rider {i+1} of {len(rider_season_urls)}")
        except Exception as e:
            print(f"Error fetching data for URL: {rider} - {e}")
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

# change this season_data to df and save to csv
rider_season_data_df = pd.DataFrame([item for sublist in rider_season_data_list for item in sublist])
rider_season_data_df.to_csv('rider_season_data.csv', index=False)

Processed data for rider 1 of 2778
Processed data for rider 2 of 2778
Processed data for rider 3 of 2778
Processed data for rider 4 of 2778
Processed data for rider 5 of 2778
Processed data for rider 6 of 2778
Processed data for rider 7 of 2778
Processed data for rider 8 of 2778
Processed data for rider 9 of 2778
Processed data for rider 10 of 2778
Processed data for rider 11 of 2778
Processed data for rider 12 of 2778
Processed data for rider 13 of 2778
Processed data for rider 14 of 2778
Processed data for rider 15 of 2778
Processed data for rider 16 of 2778
Processed data for rider 17 of 2778
Processed data for rider 18 of 2778
Processed data for rider 19 of 2778
Processed data for rider 20 of 2778
Processed data for rider 21 of 2778
Processed data for rider 22 of 2778
Processed data for rider 23 of 2778
Processed data for rider 24 of 2778
Processed data for rider 25 of 2778
Processed data for rider 26 of 2778
Processed data for rider 27 of 2778
Processed data for rider 28 of 2778
P

In [34]:
# add current team to the riders
# Adds a 'current_team_2025' column to 'rider_season_data.csv' (the file is
# rewritten in place). Values: the team listed for season '2025', 'Not active'
# when no such entry exists, or empty when the rider page could not be read.
riders_profiles = pd.read_csv('rider_season_data.csv')

# Cache keyed by rider name so each rider page is fetched once, not once per season row.
current_team_dict = {}
# One entry per row of riders_profiles, in row order.
current_teams = []

driver = webdriver.Safari()
try:
    for i, rider in riders_profiles.iterrows():
        cached_team = current_team_dict.get(rider['name'])
        if cached_team:
            current_teams.append(cached_team)
            continue

        print(rider['url'])
        driver.get(rider['url'])

        # Track the result in a local variable. Reading a None back out of the
        # DataFrame is unreliable because pandas may store it as NaN, which is
        # truthy and would defeat the "no 2025 team" check below.
        team = None
        try:
            team_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, "//ul[contains(@class, 'list') and contains(@class, 'rdr-teams') and contains(@class, 'moblist')]/li"))
            )

            for team_element in team_elements:
                year = team_element.find_element(By.CLASS_NAME, 'season').text
                team_name = team_element.find_element(By.CLASS_NAME, 'name').text
                if year == '2025':
                    team = team_name
                    break
            if not team:
                team = 'Not active'
            current_team_dict[rider['name']] = team
            print(f"Processed data for rider {i+1} of {len(riders_profiles)}")
        except Exception as e:
            # Report the URL, not the whole row Series.
            print(f"Error fetching data for URL: {rider['url']} - {e}")
        current_teams.append(team)
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

riders_profiles['current_team_2025'] = current_teams
riders_profiles.to_csv('rider_season_data.csv', index=False)

https://www.procyclingstats.com/rider/camilo-andres-suarez
Processed data for rider 1 of 37009
https://www.procyclingstats.com/rider/kim-kirchen
Processed data for rider 10 of 37009
https://www.procyclingstats.com/rider/robert-hunter
Processed data for rider 25 of 37009
https://www.procyclingstats.com/rider/jorge-azanza
Processed data for rider 43 of 37009
https://www.procyclingstats.com/rider/josu-etxeberria-azpilikueta
Processed data for rider 55 of 37009
https://www.procyclingstats.com/rider/yonathan-monsalve
Processed data for rider 63 of 37009
https://www.procyclingstats.com/rider/dylan-groenewegen
Processed data for rider 79 of 37009
https://www.procyclingstats.com/rider/rainer-kepplinger
Processed data for rider 95 of 37009
https://www.procyclingstats.com/rider/simon-gerrans
Processed data for rider 100 of 37009
https://www.procyclingstats.com/rider/rune-herregodts
Processed data for rider 121 of 37009
https://www.procyclingstats.com/rider/francis-mourey
Processed data for rider

In [6]:
# get race stats
# Builds `all_races_urls`: a flat list of race links from the yearly listing
# pages for 2010 through 2024, excluding rows marked with the 'striked' class.
all_races_urls = []
driver = webdriver.Safari()
try:
    for year in range(2010, 2025):
        year_url = f"https://www.procyclingstats.com/races.php?year={year}&circuit=1&class=&filter=Filter"
        driver.get(year_url)

        try:
            race_table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'basic'))
            )

            tr_table = race_table.find_elements(By.TAG_NAME, 'tr')
            striked_rows = race_table.find_elements(By.CLASS_NAME, 'striked')

            races = []
            for row in tr_table:
                if row in striked_rows:
                    continue
                for td in row.find_elements(By.TAG_NAME, 'td'):
                    anchors = td.find_elements(By.TAG_NAME, 'a')
                    href = anchors[0].get_attribute('href') if anchors else None
                    # Require the '/race/' path segment: a bare 'race'
                    # substring also matches other links whose slug merely
                    # contains "race". A missing href is skipped instead of
                    # raising and losing the whole year.
                    if href and '/race/' in href:
                        races.append(href)

        except Exception as e:
            print(f"Error fetching data for URL: {year_url} - {e}")
            continue

        all_races_urls.extend(races)
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

print(len(all_races_urls))

437


In [39]:
# get leaderboard data
# For every race in `all_races_urls`, scrape the results table of the one-day
# result page, or of the prologue and stage-1..stage-21 pages for other races,
# and write one row per classified rider to 'leaderboard_data.csv'.
driver = webdriver.Safari()
race_results_list = []
try:
    for counter, race in enumerate(all_races_urls):
        if race.split('/')[-1] == 'result':
            urls_to_try = [race]
        else:
            base_url = '/'.join(race.split('/')[:-1])
            # Stage pages that do not exist are detected by 'Page not found' below.
            urls_to_try = [base_url + '/prologue']
            urls_to_try += [base_url + '/stage-' + str(n) for n in range(1, 22)]

        for url in urls_to_try:
            driver.get(url)

            try:
                h1_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'h1'))
                )

                if h1_element.text == 'Page not found':
                    print(f"Page not found for URL: {url}")
                    continue

                # The table carries three classes; a compound value is not a
                # valid single class name, so use an explicit CSS selector.
                race_table_tag = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.results.basic.moblist10'))
                )

                tr_table = race_table_tag.find_elements(By.TAG_NAME, 'tr')

                # The year label is the same for every row of a page: fetch it
                # lazily on the first kept row and reuse it afterwards.
                year_text = None
                result = []
                for row in tr_table[1:]:
                    cols = row.find_elements(By.TAG_NAME, 'td')

                    # Rows containing an <s> element are skipped.
                    if row.find_elements(By.TAG_NAME, 's'):
                        continue

                    n_cols = len(cols)
                    if n_cols in (9, 10):
                        # NOTE(review): 9/10-cell rows are always labelled
                        # 'one_day', even when `url` is a stage page. Confirm
                        # this is intended.
                        time_idx = n_cols - 1
                        spec_idx, rider_idx, age_idx, team_idx = 3, 4, 5, 6
                        stage = 'one_day'
                    else:
                        # Any other layout is read with the wide-table column
                        # positions. A row with fewer than 13 cells raises
                        # IndexError, which the outer handler reports.
                        time_idx = 12
                        spec_idx, rider_idx, age_idx, team_idx = 5, 6, 7, 8
                        stage = url.split('/')[-1]

                    time_cell = cols[time_idx]
                    try:
                        # Prefer the element with class 'hide' when the cell has one.
                        time_text = time_cell.find_element(By.CLASS_NAME, 'hide').text
                    except NoSuchElementException:
                        time_text = time_cell.text

                    if year_text is None:
                        year_text = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.XPATH, "//div[@class='main']/span[@class='hideIfMobile'][1]"))
                        ).text

                    result.append({
                        'year': year_text,
                        'name': race.split('/')[4],
                        'stage': stage,
                        'rank': cols[0].text,
                        'rider': cols[rider_idx].find_element(By.TAG_NAME, 'a').text,
                        'age': cols[age_idx].text,
                        'speciality': cols[spec_idx].text,
                        'team': cols[team_idx].text,
                        'time': time_text,
                        'url': url
                    })

                race_results_list.append(result)

            except Exception as e:
                print(f"Error fetching data for URL: {url} - {e}")
                continue
        print(f"Processed race {counter+1} of {len(all_races_urls)}")
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

race_results_list = [item for sublist in race_results_list for item in sublist]
race_results_df = pd.DataFrame(race_results_list)
race_results_df.to_csv('leaderboard_data.csv', index=False)

Processed race 1 of 437
Processed race 2 of 437
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/prologue
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-7
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-8
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-9
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-10
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-11
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-12
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-13
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-14
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basq

In [32]:
# get race course data
# For every race in `all_races_urls`, read the 'infolist' block of the one-day
# result page, or of the prologue and stage-1..stage-21 pages for other races,
# and write one row per page to 'race_course_data.csv'.

# (substring searched in the <li> text, output key, raw value stored as None)
# A marker of None means the raw value is always kept.
course_info_fields = [
    ('Date', 'date', None),
    ('Distance', 'distance', None),
    ('speed', 'speed', None),
    ('ProfileScore', 'score', ' '),
    ('Vertical meters', 'vertical_meters', ''),
    ('ranking', 'ranking', 'n/a'),
    ('quality', 'quality', None),
    ('Won', 'won', None),
]

driver = webdriver.Safari()
race_course_data_list = []
try:
    for counter, race in enumerate(all_races_urls):
        if race.split('/')[-1] == 'result':
            urls_to_try = [race]
        else:
            base_url = '/'.join(race.split('/')[:-1])
            # Stage pages that do not exist are detected by 'Page not found' below.
            urls_to_try = [base_url + '/prologue']
            urls_to_try += [base_url + '/stage-' + str(n) for n in range(1, 22)]

        for url in urls_to_try:
            driver.get(url)

            try:
                h1_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'h1'))
                )

                if h1_element.text == 'Page not found':
                    print(f"Page not found for URL: {url}")
                    continue

                infolist = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'infolist'))
                )
                items = infolist.find_elements(By.TAG_NAME, 'li')

                # Reset every field for each page. Otherwise a page that lacks
                # an entry would silently inherit the previous page's value
                # (or raise NameError on the very first page).
                info = {key: None for _, key, _ in course_info_fields}
                for item in items:
                    # Each .text access is a WebDriver round trip; read it once.
                    text = item.text
                    value = text.split(': ')[-1]
                    # Independent checks: one item may satisfy several needles.
                    for needle, key, empty_marker in course_info_fields:
                        if needle in text:
                            info[key] = None if value == empty_marker else value

                race_course_data = {
                    'year': WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, "//div[@class='main']/span[@class='hideIfMobile'][1]"))
                    ).text,
                    'name': race.split('/')[4],
                    'stage': url.split('/')[-1] if url.split('/')[-1] != 'result' else 'one_day',
                    **info,
                    'url': url
                }

                race_course_data_list.append(race_course_data)

            except Exception as e:
                print(f"Error fetching data for URL: {url} - {e}")
                continue
        print(f"Processed race {counter+1} of {len(all_races_urls)}")
finally:
    # Always close the browser, including on KeyboardInterrupt.
    driver.quit()

race_course_data_df = pd.DataFrame(race_course_data_list)
race_course_data_df.to_csv('race_course_data.csv', index=False)

Processed race 1 of 437
Processed race 2 of 437
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/prologue
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-7
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-8
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-9
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-10
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-11
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-12
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-13
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basque-country/2010/stage-14
Page not found for URL: https://www.procyclingstats.com/race/itzulia-basq