In [69]:
import os
import pickle
import re
import time

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from urllib.parse import urlencode

In [70]:
def extract_ci_string(text):
    text = text.decode('utf-8')
    # Regex to capture any part of the string containing '_CI_'
    pattern = r'\b\w+_CI_\w+\b'
    matches = re.findall(pattern, text)
    
    if len(matches) == 1:
        return matches[0]  # Return the match if there's exactly one
    elif len(matches) > 1:
        raise ValueError("Error: More than one '_CI_' string found.")
    else:
        raise ValueError("Error: No '_CI_' string found.")
    
# CI_result = extract_ci_string(speed_response.content)
# print("Extracted String:", CI_result)

In [71]:
def get_data(url): 
    # In this notebook, pass this as the `headers` argument when using requests.get()
    # headers = {'User-Agent': 'Mozilla/5.0'}
    # snapshot_response = requests.get(url, headers=headers) # SOLUTION
    # status_code = snapshot_response.status_code  # SOLUTION
    # print(f"Response statuse code: {status_code}")

    proxy_params = {
        'api_key': '2b680f1f-2808-48e3-9059-c1615c97f189',
        'url': url, 
    }

    response = requests.get(
    url='https://proxy.scrapeops.io/v1/',
    params=urlencode(proxy_params),
    timeout=120,
    )

    status_code = response.status_code  # SOLUTION

    if status_code != 200:
        print(f"Status code is not ok. Expected 200. Response status code: {status_code}")

    return response.content


In [72]:
def parse_jumpoff_page(text, filename="jumpoff_data.csv"):
    jumpoff_soup = BeautifulSoup(text, "html.parser")

    #print(jumpoff_soup.prettify())

    competition_id = extract_ci_string(text)
    
    print("comp id:", competition_id)
    table = jumpoff_soup.find('table', {'id': 'PlaceHolderMain_gvcIResults'})

    # Extract the table rows
    rows = table.find_all('tr')

    # Process table headers
    headers = [header.text.strip() for header in rows[2].find_all('th')]

    rows = table.find_all('tr', class_=['row', 'altrow'])

    data = []
    for index, row in enumerate(rows): 
        if index == 16:
            break
        cols = row.find_all('td')

        
        if len(cols) < 9:
            continue
        
        # Extract the required data
        competition_data = {
            'competition_id': competition_id,
            'position': cols[0].get_text(strip=True),
            'athlete_fei_id': cols[1].find('a').get_text(strip=True),
            'athlete': cols[2].find('a').get_text(strip=True),
            'horse_fei_id': cols[3].find('a').get_text(strip=True),
            'horse': cols[4].find('a').get_text(strip=True),
            'phase1_faults': cols[8].find_all('td')[1].get_text(strip=True),
            'phase1_time': cols[8].find_all('td')[3].get_text(strip=True),
            'phase2_faults': cols[13].find_all('td')[1].get_text(strip=True) if len(cols[13].find_all('td')) > 1 else 'NA',
            'phase2_time': cols[13].find_all('td')[3].get_text(strip=True) if len(cols[13].find_all('td')) > 3 else 'NA',
            
        }
        
        data.append(competition_data)

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

    return df

In [73]:
# Test Functions

speed_url = 'https://data.fei.org/Result/ResultList.aspx?p=9A80682A95C06DB56193A83C95F324D39683772B3B2F77A7BC2A1E3E667ABB0A85CA703CF95D562D86A621768D9E7146'
jumpoff_url = "https://data.fei.org/Result/ResultList.aspx?p=108112E70B42A2FDFB8334732BF83AB688E135126F8F834640167486F9EB6487DB2FFBB6C376748916CE9311174822F"
speed_html_content = get_data(jumpoff_url)
parse_jumpoff_page(speed_html_content)
#print(speed_data)

comp id: 2022_CI_1092_S_S_01_02


Unnamed: 0,competition_id,position,athlete_fei_id,athlete,horse_fei_id,horse,phase1_faults,phase1_time,phase2_faults,phase2_time
0,2022_CI_1092_S_S_01_02,1,10002656,Roberto TERAN TAFUR (COL),104IT51,DEZ' OOKTOFF,0,80.98,0.0,36.48
1,2022_CI_1092_S_S_01_02,2,10226828,Eliza KIMBALL (USA),104NB58,ECLIPS,0,77.83,0.0,36.78
2,2022_CI_1092_S_S_01_02,3,10015481,Filip DE WANDEL (BEL),106FA73,MANHATTAN VAN'T LEEUWERIK...,0,79.52,0.0,37.03
3,2022_CI_1092_S_S_01_02,4,10105441,Nick HANESS (USA),105DC67,GERKO III VAN DE ZEILHOEK,0,80.98,0.0,38.26
4,2022_CI_1092_S_S_01_02,5,10062291,Wim JANSSEN (NED),104KY26,DURANGO VDL,0,82.88,0.0,46.69
5,2022_CI_1092_S_S_01_02,6,10001018,Conor SWAIL (IRL),106EH53,THEO 160,0,80.81,4.0,36.26
6,2022_CI_1092_S_S_01_02,7,10002656,Roberto TERAN TAFUR (COL),106SN41,SANTANA,0,79.83,4.0,36.86
7,2022_CI_1092_S_S_01_02,8,10092750,Kelli CRUCIOTTI VANDERVEE... (USA),106SG18,ISABELLA VAN DE ZUUTHOEVE,0,79.3,4.0,38.54
8,2022_CI_1092_S_S_01_02,9,10135487,Alexandra PIELET (USA),106CK05,HYPERBOLICS,0,82.04,8.0,39.81
9,2022_CI_1092_S_S_01_02,10,10100285,Sean JOBIN (CAN),104CW31,DARIUS,1,83.32,,


In [74]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

speed_url = 'https://data.fei.org/Result/ResultList.aspx?p=9A80682A95C06DB56193A83C95F324D39683772B3B2F77A7BC2A1E3E667ABB0A85CA703CF95D562D86A621768D9E7146'

# Setup Chrome WebDriver with the correct service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Open the webpage
    driver.get(speed_url)

    # Find and click the next page button to go to the second page
    next_page_button = driver.find_element(By.LINK_TEXT, '2')
    next_page_button.click()

    # Allow some time for the page to load
    time.sleep(2)

    # Extract the titles and prices of books on the second page
    books = driver.find_elements(By.CSS_SELECTOR, '.product_pod')
    
    for book in books:
        title = book.find_element(By.TAG_NAME, 'h3').text
        price = book.find_element(By.CSS_SELECTOR, '.price_color').text
        print(f'Title: {title}, Price: {price}')

finally:
    # Close the WebDriver
    driver.quit()


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":"2"}
  (Session info: chrome=127.0.6533.120); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001032c9024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x00000001032c1700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x0000000102ed082c cxxbridge1$string$len + 88524
3   chromedriver                        0x0000000102f14834 cxxbridge1$string$len + 367060
4   chromedriver                        0x0000000102f4c48c cxxbridge1$string$len + 595500
5   chromedriver                        0x0000000102f09474 cxxbridge1$string$len + 321044
6   chromedriver                        0x0000000102f0a0e4 cxxbridge1$string$len + 324228
7   chromedriver                        0x0000000103290a08 cxxbridge1$str$ptr + 1656336
8   chromedriver                        0x0000000103295464 cxxbridge1$str$ptr + 1675372
9   chromedriver                        0x00000001032768ec cxxbridge1$str$ptr + 1549556
10  chromedriver                        0x0000000103295c14 cxxbridge1$str$ptr + 1677340
11  chromedriver                        0x00000001032685fc cxxbridge1$str$ptr + 1491460
12  chromedriver                        0x00000001032b2a5c cxxbridge1$str$ptr + 1795684
13  chromedriver                        0x00000001032b2bd8 cxxbridge1$str$ptr + 1796064
14  chromedriver                        0x00000001032c1334 cxxbridge1$str$ptr + 1855292
15  libsystem_pthread.dylib             0x0000000186d4ffa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x0000000186d4ada0 thread_start + 8


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# URL of the FEI competition results page
speed_url = 'https://data.fei.org/Result/ResultList.aspx?p=9A80682A95C06DB56193A83C95F324D39683772B3B2F77A7BC2A1E3E667ABB0A85CA703CF95D562D86A621768D9E7146'

# Setup Chrome WebDriver with the correct service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Open the webpage
    driver.get(speed_url)

    # Wait for the element to be clickable and click it
    next_page_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, 'Page$2')]"))
    )
    next_page_button.click()

    # Allow some time for the second page to load
    time.sleep(2)

    # Get the HTML content of the second page
    second_page_html = driver.page_source

    # Print the HTML content of the second page
    print(second_page_html)

finally:
    # Close the WebDriver
    driver.quit()

TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000105071024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000105069700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x0000000104c7882c cxxbridge1$string$len + 88524
3   chromedriver                        0x0000000104cbc834 cxxbridge1$string$len + 367060
4   chromedriver                        0x0000000104cf448c cxxbridge1$string$len + 595500
5   chromedriver                        0x0000000104cb1474 cxxbridge1$string$len + 321044
6   chromedriver                        0x0000000104cb20e4 cxxbridge1$string$len + 324228
7   chromedriver                        0x0000000105038a08 cxxbridge1$str$ptr + 1656336
8   chromedriver                        0x000000010503d464 cxxbridge1$str$ptr + 1675372
9   chromedriver                        0x000000010501e8ec cxxbridge1$str$ptr + 1549556
10  chromedriver                        0x000000010503dc14 cxxbridge1$str$ptr + 1677340
11  chromedriver                        0x00000001050105fc cxxbridge1$str$ptr + 1491460
12  chromedriver                        0x000000010505aa5c cxxbridge1$str$ptr + 1795684
13  chromedriver                        0x000000010505abd8 cxxbridge1$str$ptr + 1796064
14  chromedriver                        0x0000000105069334 cxxbridge1$str$ptr + 1855292
15  libsystem_pthread.dylib             0x0000000186d4ffa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x0000000186d4ada0 thread_start + 8


In [79]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the browser driver
driver = webdriver.Chrome()

# Navigate to the page
driver.get("https://data.fei.org/Calendar/Search.aspx")

# Set up a wait
wait = WebDriverWait(driver, 60)  # waits up to 10 seconds

# Input values into the form using the updated methods and wait for elements
date_from = wait.until(EC.presence_of_element_located((By.NAME, "ctl00$PlaceHolderMain$dtCritDateFrom$txtDate")))
date_from.send_keys("01/01/2020")

date_to = wait.until(EC.presence_of_element_located((By.NAME, "ctl00$PlaceHolderMain$dtCritDateTo$txtDate")))
date_to.send_keys("31/12/2024")

# Find and click the search button
# Make sure to use the correct method to locate the search button, like using the button's text or other attributes
search_button = wait.until(EC.element_to_be_clickable((By.ID, "searchButton")))  # Update if the ID is incorrect
search_button.click()

# Get the resulting HTML
html_content = driver.page_source

# Close the browser
driver.quit()

print(html_content)

TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000102a71024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000102a69700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x000000010267882c cxxbridge1$string$len + 88524
3   chromedriver                        0x00000001026bc834 cxxbridge1$string$len + 367060
4   chromedriver                        0x00000001026f448c cxxbridge1$string$len + 595500
5   chromedriver                        0x00000001026b1474 cxxbridge1$string$len + 321044
6   chromedriver                        0x00000001026b20e4 cxxbridge1$string$len + 324228
7   chromedriver                        0x0000000102a38a08 cxxbridge1$str$ptr + 1656336
8   chromedriver                        0x0000000102a3d464 cxxbridge1$str$ptr + 1675372
9   chromedriver                        0x0000000102a1e8ec cxxbridge1$str$ptr + 1549556
10  chromedriver                        0x0000000102a3dc14 cxxbridge1$str$ptr + 1677340
11  chromedriver                        0x0000000102a105fc cxxbridge1$str$ptr + 1491460
12  chromedriver                        0x0000000102a5aa5c cxxbridge1$str$ptr + 1795684
13  chromedriver                        0x0000000102a5abd8 cxxbridge1$str$ptr + 1796064
14  chromedriver                        0x0000000102a69334 cxxbridge1$str$ptr + 1855292
15  libsystem_pthread.dylib             0x0000000186d4ffa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x0000000186d4ada0 thread_start + 8
