Going to run through the same process of getting athletes for performances before Olympics and filter performances by date.

Walking through the whole process again:
1. Get all athlete performances >1100 points for 800m -> Marathon
2. Get all outdoor performances and filter out indoor performances, leading to DF1
3. Get list of female athletes who competed in the Olympics through Selenium, leading to DF2
4. Process names and combine DF1 and DF2, ensuring that names are the same.
5. Compute predicted form based on fastest outdoor times ran prior to olympics, with exponential loss decay (and if there are any other models?)
6. Grab Olympic results (finals)
7. Grab IAAF rankings (pre Oly)
8. Compute squared error between predicted rankings (both IAAF rankings pre oly and exponential loss decay)
9. Compute mean squared error and whether volatility was high for the same events (800m and marathon)

Importing statements (to run regardless of progress)


In [3]:
import pandas as pd
!pip install bs4
from bs4 import BeautifulSoup
import requests
import numpy as np
import math


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [4]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m26.

## Step 1 - Get all athlete performances > 1100 points

In [1]:
!pip install bs4
from bs4 import BeautifulSoup


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [2]:
import requests
import pandas as pd


In [3]:
def collect_urls(base_url, start_page, end_page):
  urls = []
  for page in range(start_page, end_page + 1):
        url = base_url.replace("&page=1", f"&page={page}")
        urls.append(url)
  return urls

In [4]:
def get_full_event_results(base_url, start_page, end_page, discipline):
    urls = collect_urls(base_url, start_page, end_page)
    results = []
    for url in urls:
        response = requests.get(url)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        table = soup.find('table')
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) > 1:
                mark = cols[1].text.strip()
                competitor = cols[3].text.strip()
                nationality = cols[5].text.strip()
                location = cols[-3].text.strip()
                date = cols[-2].text.strip()
                result_score = cols[-1].text.strip()

                results.append({
                    'Mark': mark,
                    'Competitor': competitor,
                    'Nat': nationality,
                    'Location': location,
                    'Date': date, #used for form tracking later
                    'Results Score': result_score
                })
    df = pd.DataFrame(results)
    df.insert(4, 'Discipline', discipline)
    return df


In [5]:
base_url_800m = "https://worldathletics.org/records/toplists/middlelong/800-metres/all/women/senior/2024?regionType=world&timing=electronic&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229512&ageCategory=senior"

In [6]:
df_800m_2024 = get_full_event_results(base_url_800m, 1, 14, '800m')
df_800m_2024

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:54.61,Keely HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1261
1,1:55.61,Jemma REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1242
2,1:55.78,Keely HODGKINSON,GBR,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1239
3,1:56.28,Georgia BELL,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1230
4,1:56.71,Mary MORAA,KEN,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1222
...,...,...,...,...,...,...,...
1395,2:03.51,Judy Jepkoech KOSGEI,KEN,"Percy Beard Track, Gainesville, FL (USA)",800m,13 APR 2024,1100
1396,2:03.52,Naum BOPAPE,RSA,"McArthur Stadium, Potchefstroom (RSA)",800m,11 MAY 2024,1100
1397,2:03.52,Tigist GIRMA,ETH,"Štadión SNP, Banská Bystrica (SVK)",800m,24 MAY 2024,1100
1398,2:03.52,Joanna ARCHER,GUY,"John McDonnell Field, Fayetteville, AR (USA)",800m,25 MAY 2024,1100


In [7]:
base_url_1500m = "https://worldathletics.org/records/toplists/middlelong/1500-metres/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229513&ageCategory=senior"

In [8]:
df_1500m_2024_w = get_full_event_results(base_url_1500m, 1, 12, '1500m')
df_1500m_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,3:49.04,Faith KIPYEGON,KEN,"Stade Charléty, Paris (FRA)",1500m,07 JUL 2024,1295
1,3:50.30,Gudaf TSEGAY,ETH,"Egret Stadium, Xiamen (CHN)",1500m,20 APR 2024,1285
2,3:50.83,Jessica HULL,AUS,"Stade Charléty, Paris (FRA)",1500m,07 JUL 2024,1280
3,3:51.29,Faith KIPYEGON,KEN,"Stade de France, Paris (FRA)",1500m,10 AUG 2024,1277
4,3:52.56,Jessica HULL,AUS,"Stade de France, Paris (FRA)",1500m,10 AUG 2024,1266
...,...,...,...,...,...,...,...
1195,4:13.8h,Birtukan DEGU,ETH,"Stadium, Hawassa (ETH)",1500m,06 JUL 2024,1098
1196,4:13.73,Giulia APRILE,ITA,"Bungertstadion, Rehlingen (GER)",1500m,19 MAY 2024,1098
1197,4:13.75,Molly HUDSON,GBR,"Un. of Kentucky Outdoor Track Facility, Lexing...",1500m,23 MAY 2024,1097
1198,4:13.76,Rachel SMITH,USA,"Convention Center, Albuquerque, NM (USA) (i)",1500m,17 FEB 2024,1118


In [9]:
base_url_3000m_sc = "https://worldathletics.org/records/toplists/middlelong/3000-metres-steeplechase/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229524&ageCategory=senior"

In [10]:
df_3000m_sc_2024_w = get_full_event_results(base_url_3000m_sc, 1, 6, '3000m Steeplechase')
df_3000m_sc_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,8:52.76,Winfred YAVI,BRN,"Stade de France, Paris (FRA)",3000m Steeplechase,06 AUG 2024,1263
1,8:53.34,Peruth CHEMUTAI,UGA,"Stade de France, Paris (FRA)",3000m Steeplechase,06 AUG 2024,1261
2,8:55.09,Peruth CHEMUTAI,UGA,"Hayward Field, Eugene, OR (USA)",3000m Steeplechase,25 MAY 2024,1257
3,8:55.15,Faith CHEROTICH,KEN,"Stade de France, Paris (FRA)",3000m Steeplechase,06 AUG 2024,1257
4,8:55.40,Beatrice CHEPKOECH,KEN,"Egret Stadium, Xiamen (CHN)",3000m Steeplechase,20 APR 2024,1256
...,...,...,...,...,...,...,...
595,10:01.21,Patrycja KAPAŁA,POL,"Stadion Śląski, Chorzów (POL)",3000m Steeplechase,18 MAY 2024,1092
596,10:01.23,Maria Mihaela BLAGA,ROU,"Atatürk Stadyumu, Izmir (TUR)",3000m Steeplechase,25 MAY 2024,1092
597,10:01.27,Jenna MELANSON,CAN,"Mt. Hood Community College, Gresham, OR (USA)",3000m Steeplechase,08 JUN 2024,1092
598,10:01.29,Makenna KREBS,USA,"Franklin Field, Philadelphia, PA (USA)",3000m Steeplechase,25 APR 2024,1092


In [11]:
base_url_5000m = "https://worldathletics.org/records/toplists/middlelong/5000-metres/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229514&ageCategory=senior"

In [12]:
df_5000m_2024_w = get_full_event_results(base_url_5000m, 1, 6, '5000m')
df_5000m_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,14:18.76,Tsigie GEBRESELAMA,ETH,"Hayward Field, Eugene, OR (USA)",5000m,25 MAY 2024,1244
1,14:18.92,Ejgayehu TAYE,ETH,"Hayward Field, Eugene, OR (USA)",5000m,25 MAY 2024,1244
2,14:20.61,Freweyni HAILU,ETH,"Hayward Field, Eugene, OR (USA)",5000m,25 MAY 2024,1241
3,14:22.76,Aynadis MEBRATU,ETH,"Hayward Field, Eugene, OR (USA)",5000m,25 MAY 2024,1236
4,14:23.71,Birke HAYLOM,ETH,"Hayward Field, Eugene, OR (USA)",5000m,25 MAY 2024,1234
...,...,...,...,...,...,...,...
595,15:36.59,Rika KASEDA,JPN,"Denka Big Swan Stadium, Niigata (JPN)",5000m,29 JUN 2024,1093
596,15:36.63,Mirriam CHEROP,KEN,"Aoba Park Athletic Field, Chitose (JPN)",5000m,20 JUL 2024,1093
597,15:36.81,Carla GALLARDO,ESP,"Carl-Kaufmann-Stadion, Karlsruhe (GER)",5000m,11 MAY 2024,1093
598,15:36.95,Margot APPLETON,USA,"Geroge C. Griffin Track, Atlanta, GA (USA)",5000m,11 MAY 2024,1092


In [13]:
base_url_10k = "https://worldathletics.org/records/toplists/middlelong/10000-metres/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229521&ageCategory=senior"

In [14]:
df_10k_2024_w = get_full_event_results(base_url_10k, 1, 3, '10000m')
df_10k_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,28:54.14,Beatrice CHEBET,KEN,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1309
1,29:05.92,Gudaf TSEGAY,ETH,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1298
2,29:26.89,Lilian Kasait RENGERUK,KEN,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1278
3,29:27.59,Margaret Chelimo KIPKEMBOI,KEN,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1278
4,29:47.71,Fotyen TESFAY,ETH,"Estadio Enrique Lopez Cuenca, Nerja (ESP)",10000m,14 JUN 2024,1259
...,...,...,...,...,...,...,...
295,33:06.73,Jennifer SANDOVAL-GUZMAN,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
296,33:06.80,Katrina SPRATFORD-STERLING,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
297,33:06.99,Mariya NODA,JPN,"Athletic Track, Abashiri (JPN)",10000m,10 JUL 2024,1081
298,33:07.01,Katrina SPRATFORD-STERLING,USA,"JSerra Catholic HS, San Juan Capistrano, CA (USA)",10000m,16 MAR 2024,1081


In [15]:
base_url_hm = "https://worldathletics.org/records/toplists/road-running/half-marathon/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229541&ageCategory=senior"

In [16]:
df_hm_2024_w = get_full_event_results(base_url_hm, 1, 3, 'Half Marathon')
df_hm_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:04:29,Joyciline JEPKOSGEI,KEN,Barcelona (ESP),Half Marathon,11 FEB 2024,1246
1,1:04:37,Sutume Asefa KEBEDE,ETH,"Houston, TX (USA)",Half Marathon,14 JAN 2024,1243
2,1:04:40,Senbere TEFERI,ETH,Barcelona (ESP),Half Marathon,11 FEB 2024,1242
3,1:05:14,Tsigie GEBRESELAMA,ETH,Ras Al Khaimah (UAE),Half Marathon,24 FEB 2024,1230
4,1:05:44,Ababel YESHANEH,ETH,Ras Al Khaimah (UAE),Half Marathon,24 FEB 2024,1219
...,...,...,...,...,...,...,...
295,1:11:56,Mio KURODA,JPN,"Ishin Me-Life Stadium, Yamaguchi (JPN)",Half Marathon,11 FEB 2024,1090
296,1:11:56,Jacelyn GRUPPEN,NED,Venlo (NED),Half Marathon,24 MAR 2024,1090
297,1:11:57,Malindi ELMORE,CAN,Vancouver (CAN),Half Marathon,11 FEB 2024,1090
298,1:11:59,Maria Sagnes WÅGAN,NOR,Barcelona (ESP),Half Marathon,11 FEB 2024,1089


In [17]:
base_url_m = "https://worldathletics.org/records/toplists/road-running/marathon/all/women/senior/2024?regionType=world&page=1&bestResultsOnly=false&maxResultsByCountry=all&eventId=10229534&ageCategory=senior"


In [18]:
df_m_2024_w = get_full_event_results(base_url_m, 1, 5, 'Marathon')
df_m_2024_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,2:15:55,Sutume Asefa KEBEDE,ETH,Tokyo (JPN),Marathon,03 MAR 2024,1276
1,2:16:07,Tigist KETEMA,ETH,Dubai (UAE),Marathon,07 JAN 2024,1274
2,2:16:14,Rosemary WANJIRU,KEN,Tokyo (JPN),Marathon,03 MAR 2024,1272
3,2:16:16,Peres JEPCHIRCHIR,KEN,London (GBR),Marathon,21 APR 2024,1272
4,2:16:23,Tigst ASSEFA,ETH,London (GBR),Marathon,21 APR 2024,1271
...,...,...,...,...,...,...,...
495,2:34:58,Alemitu HAROYE,ETH,Zheng-Kai (CHN),Marathon,31 MAR 2024,1084
496,2:34:59,Genet ROBI,ETH,Shijiazhuang (CHN),Marathon,31 MAR 2024,1084
497,2:35:01,Kasumi NISHIHARA,JPN,Tokyo (JPN),Marathon,03 MAR 2024,1084
498,2:35:03,Urge DIRO,ETH,Chongqing (CHN),Marathon,24 MAR 2024,1083


In [19]:
combined_df_w_road = pd.concat([df_m_2024_w, df_hm_2024_w], ignore_index=True)
combined_df_w_road

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,2:15:55,Sutume Asefa KEBEDE,ETH,Tokyo (JPN),Marathon,03 MAR 2024,1276
1,2:16:07,Tigist KETEMA,ETH,Dubai (UAE),Marathon,07 JAN 2024,1274
2,2:16:14,Rosemary WANJIRU,KEN,Tokyo (JPN),Marathon,03 MAR 2024,1272
3,2:16:16,Peres JEPCHIRCHIR,KEN,London (GBR),Marathon,21 APR 2024,1272
4,2:16:23,Tigst ASSEFA,ETH,London (GBR),Marathon,21 APR 2024,1271
...,...,...,...,...,...,...,...
795,1:11:56,Mio KURODA,JPN,"Ishin Me-Life Stadium, Yamaguchi (JPN)",Half Marathon,11 FEB 2024,1090
796,1:11:56,Jacelyn GRUPPEN,NED,Venlo (NED),Half Marathon,24 MAR 2024,1090
797,1:11:57,Malindi ELMORE,CAN,Vancouver (CAN),Half Marathon,11 FEB 2024,1090
798,1:11:59,Maria Sagnes WÅGAN,NOR,Barcelona (ESP),Half Marathon,11 FEB 2024,1089


In [20]:
combined_df_track_w = pd.concat([df_800m_2024, df_1500m_2024_w, df_3000m_sc_2024_w, df_5000m_2024_w, df_10k_2024_w], ignore_index=True)
combined_df_track_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:54.61,Keely HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1261
1,1:55.61,Jemma REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1242
2,1:55.78,Keely HODGKINSON,GBR,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1239
3,1:56.28,Georgia BELL,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1230
4,1:56.71,Mary MORAA,KEN,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1222
...,...,...,...,...,...,...,...
4095,33:06.73,Jennifer SANDOVAL-GUZMAN,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
4096,33:06.80,Katrina SPRATFORD-STERLING,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
4097,33:06.99,Mariya NODA,JPN,"Athletic Track, Abashiri (JPN)",10000m,10 JUL 2024,1081
4098,33:07.01,Katrina SPRATFORD-STERLING,USA,"JSerra Catholic HS, San Juan Capistrano, CA (USA)",10000m,16 MAR 2024,1081


In [21]:
combined_df_track_w['Competitor'] = combined_df_track_w['Competitor'].str.upper()
combined_df_track_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:54.61,KEELY HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1261
1,1:55.61,JEMMA REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1242
2,1:55.78,KEELY HODGKINSON,GBR,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1239
3,1:56.28,GEORGIA BELL,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1230
4,1:56.71,MARY MORAA,KEN,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1222
...,...,...,...,...,...,...,...
4095,33:06.73,JENNIFER SANDOVAL-GUZMAN,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
4096,33:06.80,KATRINA SPRATFORD-STERLING,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
4097,33:06.99,MARIYA NODA,JPN,"Athletic Track, Abashiri (JPN)",10000m,10 JUL 2024,1081
4098,33:07.01,KATRINA SPRATFORD-STERLING,USA,"JSerra Catholic HS, San Juan Capistrano, CA (USA)",10000m,16 MAR 2024,1081


In [22]:
combined_df_track_w.to_csv('combined_df_track_w.csv', index=False)

In [23]:
combined_distance_w = pd.concat([combined_df_track_w, combined_df_w_road], ignore_index=True)
combined_distance_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:54.61,KEELY HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1261
1,1:55.61,JEMMA REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1242
2,1:55.78,KEELY HODGKINSON,GBR,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1239
3,1:56.28,GEORGIA BELL,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1230
4,1:56.71,MARY MORAA,KEN,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1222
...,...,...,...,...,...,...,...
4895,1:11:56,Mio KURODA,JPN,"Ishin Me-Life Stadium, Yamaguchi (JPN)",Half Marathon,11 FEB 2024,1090
4896,1:11:56,Jacelyn GRUPPEN,NED,Venlo (NED),Half Marathon,24 MAR 2024,1090
4897,1:11:57,Malindi ELMORE,CAN,Vancouver (CAN),Half Marathon,11 FEB 2024,1090
4898,1:11:59,Maria Sagnes WÅGAN,NOR,Barcelona (ESP),Half Marathon,11 FEB 2024,1089


In [24]:
combined_distance_w.to_csv('combined_distance_w.csv', index=False)

## Step 2

In [28]:
def remove_all_indoor(df):
    new_df = df[~df['Location'].str.contains(r'\(i\)', case=False)]
    return new_df
outdoor_df_w = remove_all_indoor(combined_distance_w)
outdoor_df_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,1:54.61,KEELY HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1261
1,1:55.61,JEMMA REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1242
2,1:55.78,KEELY HODGKINSON,GBR,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1239
3,1:56.28,GEORGIA BELL,GBR,"Olympic Stadium, London (GBR)",800m,20 JUL 2024,1230
4,1:56.71,MARY MORAA,KEN,"Hayward Field, Eugene, OR (USA)",800m,25 MAY 2024,1222
...,...,...,...,...,...,...,...
4895,1:11:56,Mio KURODA,JPN,"Ishin Me-Life Stadium, Yamaguchi (JPN)",Half Marathon,11 FEB 2024,1090
4896,1:11:56,Jacelyn GRUPPEN,NED,Venlo (NED),Half Marathon,24 MAR 2024,1090
4897,1:11:57,Malindi ELMORE,CAN,Vancouver (CAN),Half Marathon,11 FEB 2024,1090
4898,1:11:59,Maria Sagnes WÅGAN,NOR,Barcelona (ESP),Half Marathon,11 FEB 2024,1089


In [29]:
outdoor_df_w["Results Score"] = outdoor_df_w["Results Score"].astype(int)
outdoor_df_w["Results Score"].dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outdoor_df_w["Results Score"] = outdoor_df_w["Results Score"].astype(int)


dtype('int64')

In [30]:
outdoor_df_sorted_w = outdoor_df_w.sort_values(by="Results Score", ascending=False)
outdoor_df_sorted_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
3800,28:54.14,BEATRICE CHEBET,KEN,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1309
3801,29:05.92,GUDAF TSEGAY,ETH,"Hayward Field, Eugene, OR (USA)",10000m,25 MAY 2024,1298
1400,3:49.04,FAITH KIPYEGON,KEN,"Stade Charléty, Paris (FRA)",1500m,07 JUL 2024,1295
1401,3:50.30,GUDAF TSEGAY,ETH,"Egret Stadium, Xiamen (CHN)",1500m,20 APR 2024,1285
1402,3:50.83,JESSICA HULL,AUS,"Stade Charléty, Paris (FRA)",1500m,07 JUL 2024,1280
...,...,...,...,...,...,...,...
4097,33:06.99,MARIYA NODA,JPN,"Athletic Track, Abashiri (JPN)",10000m,10 JUL 2024,1081
4095,33:06.73,JENNIFER SANDOVAL-GUZMAN,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",10000m,11 MAY 2024,1081
4094,33:06.55,MCKAYLIE CAESAR,USA,"Cobb Track and Angell Field, Palo Alto, CA (USA)",10000m,29 MAR 2024,1081
4092,33:06.35,KAHO HORIO,JPN,"Prefectural Stadium, Kumagaya (JPN)",10000m,18 MAY 2024,1081


In [32]:
outdoor_df_sorted_w['Date'] = pd.to_datetime(outdoor_df_sorted_w['Date'], format = '%d %b %Y')
filtered_df = outdoor_df_sorted_w[outdoor_df_sorted_w['Date']<'2024-08-01'] # we want any performances before the Olympics so
filtered_df.sort_values(by=['Competitor', 'Discipline', 'Date'], ascending=[True, True, False], inplace=True)

# Drop duplicates, keeping the last (most recent) entry for each competitor
outdoor_df_sorted_by_latest_w = filtered_df.drop_duplicates(subset=['Competitor','Discipline'], keep='first')

outdoor_df_sorted_by_latest_w

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.sort_values(by=['Competitor', 'Discipline', 'Date'], ascending=[True, True, False], inplace=True)


Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
2983,9:48.26,. PRITI,IND,"Dean A. Hayes Track and Soccer Stadium, Murfre...",3000m Steeplechase,2024-05-31,1124
4058,32:55.91,. SEEMA,IND,"Mt. Hood Community College, Gresham, OR (USA)",10000m,2024-06-08,1090
3677,15:30.70,. SEEMA,IND,"Drake Stadium, Los Angeles, CA (USA)",5000m,2024-05-17,1104
1202,2:03.08,AALIYAH MILLER,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",800m,2024-07-20,1108
963,2:02.50,AALIYAH MOORE,GUY,"John McDonnell Field, Fayetteville, AR (USA)",800m,2024-05-25,1118
...,...,...,...,...,...,...,...
1619,4:03.90,ÁGUEDA MARQUÉS,ESP,"Estadi Olímpic Camilo Cano, La Nucia (ESP)",1500m,2024-06-30,1174
953,2:02.47,ÁGUEDA MARQUÉS,ESP,"Stade Jean Jacoby, Schifflange (LUX)",800m,2024-07-21,1118
3179,10:00.67,ŞEVVAL ÖZDOĞAN,TUR,"Boudewijnstadion, Bruxelles (BEL)",3000m Steeplechase,2024-05-26,1093
2057,4:09.34,ŞILAN AYYILDIZ,TUR,"Drake Stadium, Des Moines, IA (USA)",1500m,2024-04-27,1132


In [33]:
outdoor_df_sorted_by_latest_w.to_csv('outdoor_df_sorted_by_latest_w.csv', index=False)

## Step 3

### Selenium Config

In [36]:
!pip install selenium



In [37]:
!apt-get update
!apt-get install chromium chromium-driver

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Waiting for headers] [Connected to r2u.stat.il                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                                    Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois

In [38]:
from selenium import webdriver

In [39]:
!apt-get install chromium-driver

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'chromium-chromedriver' instead of 'chromium-driver'
The following additional packages will be installed:
  apparmor chromium-browser libfuse3-3 liblzo2-2 libudev1 snapd squashfs-tools systemd-hwe-hwdb
  udev
Suggested packages:
  apparmor-profiles-extra apparmor-utils fuse3 zenity | kdialog
The following NEW packages will be installed:
  apparmor chromium-browser chromium-chromedriver libfuse3-3 liblzo2-2 snapd squashfs-tools
  systemd-hwe-hwdb udev
The following packages will be upgraded:
  libudev1
1 upgraded, 9 newly installed, 0 to remove and 58 not upgraded.
Need to get 28.5 MB of archives.
After this operation, 118 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 apparmor amd64 3.0.4-2ubuntu2.3 [595 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblzo2-2 amd64 2.10-2build3 [53.7 kB]
Get:3 http://ar

In [40]:
def web_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--verbose")
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920, 1200")
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    return driver



In [41]:
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

In [39]:
urls = [
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=800",
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=1500",
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=3KSC",
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=5000",
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=10K",
    "https://worldathletics.org/competitions/olympic-games/paris24/athletes?competitionGroup=olympic-games&urlSlug=paris24&sexCode=W&disciplineCode=MAR"
]
events = [
    "800m",
    "1500m",
    "3000m SC",
    "5000m",
    "10000m",
    "Marathon",
]


In [40]:
def fetch_athlete_data(driver, urls, events):
    all_athletes = []

    for url, event in zip(urls, events):
        driver.get(url)

        # Wait for the table to be present on the page
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table.styles_table__oNcJD"))
            )
        except Exception as e:
            print(f"An error occurred while waiting for the table for event {event}:", e)
            continue

        # Optional: Pause to ensure all JavaScript content is loaded
        time.sleep(5)  # Adjust the sleep time as necessary

        # Get the page source after rendering
        raw_html = driver.page_source

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(raw_html, 'html.parser')

        # Find the table
        table = soup.find('table', class_='styles_table__oNcJD')
        if table:
            for row in table.find_all('tr')[1:]:  # Skip the header row
                cols = row.find_all('td')
                if len(cols) > 1:
                    athlete = cols[1].text.strip()
                    nationality = cols[3].text.strip()
                    birthdate = cols[4].text.strip()
                    all_athletes.append({
                        'Athlete': athlete,
                        'Nat': nationality,
                        'Birthdate': birthdate,
                        'Event': event
                    })
        else:
            print(f"Table not found for event {event}")

    # Convert to DataFrame
    df = pd.DataFrame(all_athletes)
    return df


In [45]:
driver = web_driver()

In [46]:
df_oly_w = fetch_athlete_data(driver, urls, events)
df_oly_w


Unnamed: 0,Athlete,Nat,Birthdate,Event
0,"AKINS, Nia",USA,07 Jul 1998,800m
1,"AL ROUMI, Amal",KUW,22 Aug 1992,800m
2,"ALEMU, Habitam",ETH,09 Jul 1997,800m
3,"ALMANZA, Rose Mary",CUB,13 Jul 1992,800m
4,"ALMASRI, Layla",PLE,26 Jun 1999,800m
...,...,...,...,...
284,"WIKSTRÖM, Carolina",SWE,04 Sep 1993,Marathon
285,"WOLDU, Mekdes",FRA,20 Oct 1992,Marathon
286,"XIA, Yuyu",CHN,01 Mar 1998,Marathon
287,"YAREMCHUK, Sofiia",ITA,03 Jun 1994,Marathon


In [47]:
df_oly_w.to_csv('df_w_distance_without_formatting.csv', index=False)

In [8]:
df_oly_w = pd.read_csv('df_w_distance_without_formatting.csv')
df_oly_w

Unnamed: 0,Athlete,Nat,Birthdate,Event
0,"AKINS, Nia",USA,07 Jul 1998,800m
1,"AL ROUMI, Amal",KUW,22 Aug 1992,800m
2,"ALEMU, Habitam",ETH,09 Jul 1997,800m
3,"ALMANZA, Rose Mary",CUB,13 Jul 1992,800m
4,"ALMASRI, Layla",PLE,26 Jun 1999,800m
...,...,...,...,...
284,"WIKSTRÖM, Carolina",SWE,04 Sep 1993,Marathon
285,"WOLDU, Mekdes",FRA,20 Oct 1992,Marathon
286,"XIA, Yuyu",CHN,01 Mar 1998,Marathon
287,"YAREMCHUK, Sofiia",ITA,03 Jun 1994,Marathon


In [9]:
# prompt: Join the athlete names and swap the names that are after the comma to before the comma and make everything into block letters.

def format_names(df):
  df['Athlete'] = df['Athlete'].str.upper()
  split_names = df['Athlete'].str.split(', ', n=1, expand=True)
  df['Athlete'] = split_names[1].str.cat(split_names[0], sep=' ').fillna(df['Athlete'])
  return df

df_oly_w_formatted = format_names(df_oly_w)
df_oly_w_formatted


Unnamed: 0,Athlete,Nat,Birthdate,Event
0,NIA AKINS,USA,07 Jul 1998,800m
1,AMAL AL ROUMI,KUW,22 Aug 1992,800m
2,HABITAM ALEMU,ETH,09 Jul 1997,800m
3,ROSE MARY ALMANZA,CUB,13 Jul 1992,800m
4,LAYLA ALMASRI,PLE,26 Jun 1999,800m
...,...,...,...,...
284,CAROLINA WIKSTRÖM,SWE,04 Sep 1993,Marathon
285,MEKDES WOLDU,FRA,20 Oct 1992,Marathon
286,YUYU XIA,CHN,01 Mar 1998,Marathon
287,SOFIIA YAREMCHUK,ITA,03 Jun 1994,Marathon


In [13]:
df_oly_w_formatted['Event'] = df_oly_w_formatted.groupby('Athlete')['Event'].transform(lambda x: ', '.join(sorted(x.unique())))
df_oly_w_formatted = df_oly_w_formatted.drop_duplicates(subset='Athlete')
df_oly_w_formatted

Unnamed: 0,Athlete,Nat,Birthdate,Event
0,NIA AKINS,USA,07 Jul 1998,800m
1,AMAL AL ROUMI,KUW,22 Aug 1992,800m
2,HABITAM ALEMU,ETH,09 Jul 1997,800m
3,ROSE MARY ALMANZA,CUB,13 Jul 1992,800m
4,LAYLA ALMASRI,PLE,26 Jun 1999,800m
...,...,...,...,...
284,CAROLINA WIKSTRÖM,SWE,04 Sep 1993,Marathon
285,MEKDES WOLDU,FRA,20 Oct 1992,Marathon
286,YUYU XIA,CHN,01 Mar 1998,Marathon
287,SOFIIA YAREMCHUK,ITA,03 Jun 1994,Marathon


In [53]:
df_oly_w_formatted.to_csv('df_oly_w_formatted.csv', index=False)


In [54]:
outdoor_df_sorted_by_latest_w = pd.read_csv('outdoor_df_sorted_by_latest_w.csv')
outdoor_df_sorted_by_latest_w

Unnamed: 0,Mark,Competitor,Nat,Location,Discipline,Date,Results Score
0,9:48.26,. PRITI,IND,"Dean A. Hayes Track and Soccer Stadium, Murfre...",3000m Steeplechase,2024-05-31,1124
1,32:55.91,. SEEMA,IND,"Mt. Hood Community College, Gresham, OR (USA)",10000m,2024-06-08,1090
2,15:30.70,. SEEMA,IND,"Drake Stadium, Los Angeles, CA (USA)",5000m,2024-05-17,1104
3,2:03.08,AALIYAH MILLER,USA,"Jack Kemp Stadium, Los Angeles, CA (USA)",800m,2024-07-20,1108
4,2:02.50,AALIYAH MOORE,GUY,"John McDonnell Field, Fayetteville, AR (USA)",800m,2024-05-25,1118
...,...,...,...,...,...,...,...
1866,4:03.90,ÁGUEDA MARQUÉS,ESP,"Estadi Olímpic Camilo Cano, La Nucia (ESP)",1500m,2024-06-30,1174
1867,2:02.47,ÁGUEDA MARQUÉS,ESP,"Stade Jean Jacoby, Schifflange (LUX)",800m,2024-07-21,1118
1868,10:00.67,ŞEVVAL ÖZDOĞAN,TUR,"Boudewijnstadion, Bruxelles (BEL)",3000m Steeplechase,2024-05-26,1093
1869,4:09.34,ŞILAN AYYILDIZ,TUR,"Drake Stadium, Des Moines, IA (USA)",1500m,2024-04-27,1132


In [55]:
# prompt: make all names in the competitor field in block letters (capital)

outdoor_df_sorted_by_latest_w['Competitor'] = outdoor_df_sorted_by_latest_w['Competitor'].str.upper()


In [56]:
merged_df = pd.merge(outdoor_df_sorted_by_latest_w, df_oly_w_formatted, left_on='Competitor', right_on='Athlete', how='inner')
merged_df.drop(columns=['Athlete'], inplace = True)
merged_df

Unnamed: 0,Mark,Competitor,Nat_x,Location,Discipline,Date,Results Score,Nat_y,Birthdate,Event
0,4:05.45,ABBEY CALDWELL,AUS,"Zdzisław Krzyszkowiak Stadium, Bydgoszcz (POL)",1500m,2024-06-20,1162,AUS,03 Jul 2001,800m
1,1:59.35,ABBEY CALDWELL,AUS,"Stadion Allmend, Luzern (SUI)",800m,2024-07-16,1174,AUS,03 Jul 2001,800m
2,15:28.04,AGATE CAUNE,LAT,"Stadio Olimpico, Roma (ITA)",5000m,2024-06-07,1109,LAT,07 Aug 2004,5000m
3,3:58.05,AGATHE GUILLEMOT,FRA,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1221,FRA,11 Jul 1999,1500m
4,2:01.05,AGATHE GUILLEMOT,FRA,"Stade des Maradas, Cergy-Pontoise (FRA)",800m,2024-05-12,1144,FRA,11 Jul 1999,1500m
...,...,...,...,...,...,...,...,...,...,...
374,4:12.85,YUME GOTO,JPN,"Denka Big Swan Stadium, Niigata (JPN)",1500m,2024-06-28,1104,JPN,25 Feb 2000,1500m
375,2:25:45,YUYU XIA,CHN,Nagoya (JPN),Marathon,2024-03-10,1175,CHN,01 Mar 1998,Marathon
376,2:26:42,ZHANNA MAMAZHANOVA,KAZ,Rotterdam (NED),Marathon,2024-04-14,1165,KAZ,26 Jan 1994,Marathon
377,4:03.90,ÁGUEDA MARQUÉS,ESP,"Estadi Olímpic Camilo Cano, La Nucia (ESP)",1500m,2024-06-30,1174,ESP,19 Mar 1999,1500m


In [58]:
merged_df.to_csv('merged_df_w_new.csv', index=False)


In [59]:
sorted_df = merged_df.sort_values(by='Results Score', ascending=False)
sorted_df

Unnamed: 0,Mark,Competitor,Nat_x,Location,Discipline,Date,Results Score,Nat_y,Birthdate,Event
32,28:54.14,BEATRICE CHEBET,KEN,"Hayward Field, Eugene, OR (USA)",10000m,2024-05-25,1309,KEN,05 Mar 2000,"10000m, 5000m"
141,29:05.92,GUDAF TSEGAY,ETH,"Hayward Field, Eugene, OR (USA)",10000m,2024-05-25,1298,ETH,23 Jan 1997,"10000m, 1500m, 5000m"
111,3:49.04,FAITH KIPYEGON,KEN,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1295,KEN,10 Jan 1994,"1500m, 5000m"
142,3:50.30,GUDAF TSEGAY,ETH,"Egret Stadium, Xiamen (CHN)",1500m,2024-04-20,1285,ETH,23 Jan 1997,"10000m, 1500m, 5000m"
165,3:50.83,JESSICA HULL,AUS,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1280,AUS,22 Oct 1996,1500m
...,...,...,...,...,...,...,...,...,...,...
28,1:11:48,ANGIE ORJUELA,COL,"Houston, TX (USA)",Half Marathon,2024-01-14,1093,COL,09 May 1989,Marathon
261,1:11:49,MÉLODY JULIEN,FRA,"Stadio Olimpico, Roma (ITA)",Half Marathon,2024-06-09,1093,FRA,13 May 1999,Marathon
308,2:34:08,REBECCA CHEPTEGEI,UGA,Doha (QAT),Marathon,2024-02-16,1092,UGA,22 Feb 1991,Marathon
247,1:11:57,MALINDI ELMORE,CAN,Vancouver (CAN),Half Marathon,2024-02-11,1090,CAN,13 Mar 1980,Marathon


### Calculating Exponential Loss Decay

In [60]:
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df["Days Since Performance"] = (pd.to_datetime('2024-08-01') - merged_df['Date']).dt.days #to make this more precise the actual event date can be used but can be aggregated to 1 aug for now

In [61]:
lambda_decay = 0.001  # ideally this is changeable

# Calculate the weighted score using exponential decay
merged_df['Weighted Score'] = merged_df['Results Score'] * np.exp(-lambda_decay * merged_df['Days Since Performance'])

In [62]:
sorted_df = merged_df.sort_values(by='Weighted Score', ascending=False)
sorted_df

Unnamed: 0,Mark,Competitor,Nat_x,Location,Discipline,Date,Results Score,Nat_y,Birthdate,Event,Days Since Performance,Weighted Score
111,3:49.04,FAITH KIPYEGON,KEN,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1295,KEN,10 Jan 1994,"1500m, 5000m",25,1263.026336
165,3:50.83,JESSICA HULL,AUS,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1280,AUS,22 Oct 1996,1500m,25,1248.396687
187,1:54.61,KEELY HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,2024-07-20,1261,GBR,03 Mar 2002,800m,12,1245.958430
164,1:55.61,JEMMA REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,2024-07-20,1242,GBR,06 Mar 1998,800m,12,1227.185067
199,3:53.79,LAURA MUIR,GBR,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1256,GBR,09 May 1993,1500m,25,1224.989250
...,...,...,...,...,...,...,...,...,...,...,...,...
256,1:10:21,MERITXELL SOLER,ESP,Santa Pola (ESP),Half Marathon,2024-01-21,1122,ESP,20 Jul 1992,Marathon,193,925.068775
308,2:34:08,REBECCA CHEPTEGEI,UGA,Doha (QAT),Marathon,2024-02-16,1092,UGA,22 Feb 1991,Marathon,167,924.049976
63,1:10:46,CAROLINA WIKSTRÖM,SWE,Santa Pola (ESP),Half Marathon,2024-01-21,1114,SWE,04 Sep 1993,Marathon,193,918.472919
247,1:11:57,MALINDI ELMORE,CAN,Vancouver (CAN),Half Marathon,2024-02-11,1090,CAN,13 Mar 1980,Marathon,172,917.757299


In [63]:
sorted_df.to_csv('sorted_df_w_new.csv', index=False)


## Post Olympics Form Analysis

In [64]:
df = pd.read_csv('sorted_df_w_new.csv')
df

Unnamed: 0,Mark,Competitor,Nat_x,Location,Discipline,Date,Results Score,Nat_y,Birthdate,Event,Days Since Performance,Weighted Score
0,3:49.04,FAITH KIPYEGON,KEN,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1295,KEN,10 Jan 1994,"1500m, 5000m",25,1263.026336
1,3:50.83,JESSICA HULL,AUS,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1280,AUS,22 Oct 1996,1500m,25,1248.396687
2,1:54.61,KEELY HODGKINSON,GBR,"Olympic Stadium, London (GBR)",800m,2024-07-20,1261,GBR,03 Mar 2002,800m,12,1245.958430
3,1:55.61,JEMMA REEKIE,GBR,"Olympic Stadium, London (GBR)",800m,2024-07-20,1242,GBR,06 Mar 1998,800m,12,1227.185067
4,3:53.79,LAURA MUIR,GBR,"Stade Charléty, Paris (FRA)",1500m,2024-07-07,1256,GBR,09 May 1993,1500m,25,1224.989250
...,...,...,...,...,...,...,...,...,...,...,...,...
374,1:10:21,MERITXELL SOLER,ESP,Santa Pola (ESP),Half Marathon,2024-01-21,1122,ESP,20 Jul 1992,Marathon,193,925.068775
375,2:34:08,REBECCA CHEPTEGEI,UGA,Doha (QAT),Marathon,2024-02-16,1092,UGA,22 Feb 1991,Marathon,167,924.049976
376,1:10:46,CAROLINA WIKSTRÖM,SWE,Santa Pola (ESP),Half Marathon,2024-01-21,1114,SWE,04 Sep 1993,Marathon,193,918.472919
377,1:11:57,MALINDI ELMORE,CAN,Vancouver (CAN),Half Marathon,2024-02-11,1090,CAN,13 Mar 1980,Marathon,172,917.757299


In [21]:
def get_olympic_event_results(url, discipline):
    results = []

    # Fetch the webpage
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')

    # Locate the table containing the results
    table = soup.find('table')

    # Ensure the table exists
    if table:
        # Iterate over each row in the table (skipping the header row)
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) > 1:
                place = cols[0].text.strip()
                nationality = cols[2].text.strip()
                competitor = cols[3].text.strip()
                mark = cols[4].text.strip()


                results.append({
                    'Place': place,
                    'Competitor': competitor,
                    'Nat': nationality,
                    'Mark': mark,
                }) #what about the results score? would probably have to scrape data (again) for post OLY results lmao

    # Convert the results into a DataFrame
    df = pd.DataFrame(results)

    # Insert the discipline column
    df.insert(4, 'Discipline', discipline)

    return df


In [22]:
url = "https://worldathletics.org/competitions/olympic-games/paris24/results/women/1500-metres/final/result"
discipline = "1500m"
df_1500_results = get_olympic_event_results(url, discipline)

df_1500_results

Unnamed: 0,Place,Competitor,Nat,Mark,Discipline
0,1,Faith KIPYEGON,KEN,3:51.29 OR,1500m
1,2,Jessica HULL,AUS,3:52.56,1500m
2,3,Georgia BELL,GBR,3:52.61 NR,1500m
3,4,Diribe WELTEJI,ETH,3:52.75 PB,1500m
4,5,Laura MUIR,GBR,3:53.37 PB,1500m
5,6,Susan Lokayo EJORE,KEN,3:56.07 PB,1500m
6,7,Nikki HILTZ,USA,3:56.38,1500m
7,8,Elle ST. PIERRE,USA,3:57.52,1500m
8,9,Agathe GUILLEMOT,FRA,3:59.08,1500m
9,10,Klaudia KAZIMIERSKA,POL,4:00.12 PB,1500m


In [23]:
df_1500_results['Competitor'] = df_1500_results['Competitor'].str.upper()
df_1500_results


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline
0,1,FAITH KIPYEGON,KEN,3:51.29 OR,1500m
1,2,JESSICA HULL,AUS,3:52.56,1500m
2,3,GEORGIA BELL,GBR,3:52.61 NR,1500m
3,4,DIRIBE WELTEJI,ETH,3:52.75 PB,1500m
4,5,LAURA MUIR,GBR,3:53.37 PB,1500m
5,6,SUSAN LOKAYO EJORE,KEN,3:56.07 PB,1500m
6,7,NIKKI HILTZ,USA,3:56.38,1500m
7,8,ELLE ST. PIERRE,USA,3:57.52,1500m
8,9,AGATHE GUILLEMOT,FRA,3:59.08,1500m
9,10,KLAUDIA KAZIMIERSKA,POL,4:00.12 PB,1500m


In [32]:
def find_index_in_larger_df(competitor_name, larger_df):
    try:
        return (larger_df[larger_df['Competitor'] == competitor_name].index[0])+1  # Get the first occurrence
    except IndexError:
        return None  # Return None if the competitor is not found



In [33]:
def get_filtered_event_df(df, event):
    return df[df['Event'].str.contains(event)]


In [44]:
def fetch_athlete_data(driver, urls):
    all_athletes = []

    for url in urls:
        driver.get(url)

        # Wait for the table to be present on the page
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "records-table"))
            )
        except Exception as e:
            print(f"An error occurred while waiting for the table at {url}:", e)
            continue


        time.sleep(5)

        raw_html = driver.page_source

        soup = BeautifulSoup(raw_html, 'html.parser')


        table = soup.find('table', class_='records-table')
        if table:
            for row in table.find_all('tr')[1:]:
                cols = row.find_all('td')
                if len(cols) > 1:
                    rank = cols[0].text.strip()
                    athlete = cols[1].text.strip()
                    nationality = cols[2].text.strip()
                    score = cols[3].text.strip()
                    event_list = cols[4].text.strip()

                    all_athletes.append({
                        'Rank': rank,
                        'Athlete': athlete,
                        'Nat': nationality,
                        'Score': score,
                        'Event List': event_list
                    })
        else:
            print(f"Table not found at {url}")

    # Convert to DataFrame
    df = pd.DataFrame(all_athletes)
    return df


# Example usage:
# Assuming driver is a Selenium WebDriver instance that has been properly configured





In [45]:
def complete_analysis(result_url, event, ranking_urls):
    df_results = get_olympic_event_results(result_url, event)
    filtered_event_df = get_filtered_event_df(df, event)
    df_results['Competitor'] = df_results['Competitor'].str.upper()
    df_results['Index in rerank'] = df_results['Competitor'].apply(find_index_in_larger_df, args=(filtered_event_df,)) #okay this doesn't work quite well now
    driver = web_driver()
    df_rankings = fetch_athlete_data(driver, ranking_urls)
    df_rankings = df_rankings.rename(columns={'Athlete': 'Competitor'})
    df_rankings['Competitor'] = df_rankings['Competitor'].str.upper()
    df_results['Index in IAAF rank'] = df_results['Competitor'].apply(find_index_in_larger_df, args=(df_rankings,))
    df_results['Index in rerank'] = pd.to_numeric(df_results['Index in rerank'], errors='coerce')
    df_results['Index in IAAF rank'] = pd.to_numeric(df_results['Index in IAAF rank'], errors='coerce')
    df_results['Place'] = pd.to_numeric(df_results['Place'], errors='coerce')

# Calculate differences and squared errors
    df_results['Diff from Rerank'] = (df_results['Index in rerank'] - df_results['Place']).fillna(0)
    df_results['Diff from IAAF'] = (df_results['Index in IAAF rank'] - df_results['Place']).fillna(0)

    df_results['Mean Squared Error Rerank'] = (df_results['Diff from Rerank'] ** 2)
    df_results['Mean Squared Error IAAF'] = (df_results['Diff from IAAF'] ** 2)

    # Calculate root mean squared errors
    rmse_rerank = np.sqrt(df_results['Mean Squared Error Rerank'].mean())
    rmse_iaaf = np.sqrt(df_results['Mean Squared Error IAAF'].mean())

    print("Root Mean Squared Error (Rerank):", rmse_rerank)
    print("Root Mean Squared Error (IAAF):", rmse_iaaf)


    return df_results


In [46]:
complete_analysis("https://worldathletics.org/competitions/olympic-games/paris24/results/women/1500-metres/final/result", "1500m", ["https://worldathletics.org/world-rankings/1500m/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/1500m/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"] )

Root Mean Squared Error (Rerank): 35.36830030785572
Root Mean Squared Error (IAAF): 18.463928798245153


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1,FAITH KIPYEGON,KEN,3:51.29 OR,1500m,1,1.0,0,0.0,0,0.0
1,2,JESSICA HULL,AUS,3:52.56,1500m,2,3.0,0,1.0,0,1.0
2,3,GEORGIA BELL,GBR,3:52.61 NR,1500m,7,13.0,4,10.0,16,100.0
3,4,DIRIBE WELTEJI,ETH,3:52.75 PB,1500m,40,2.0,36,-2.0,1296,4.0
4,5,LAURA MUIR,GBR,3:53.37 PB,1500m,5,4.0,0,-1.0,0,1.0
5,6,SUSAN LOKAYO EJORE,KEN,3:56.07 PB,1500m,18,20.0,12,14.0,144,196.0
6,7,NIKKI HILTZ,USA,3:56.38,1500m,52,19.0,45,12.0,2025,144.0
7,8,ELLE ST. PIERRE,USA,3:57.52,1500m,16,26.0,8,18.0,64,324.0
8,9,AGATHE GUILLEMOT,FRA,3:59.08,1500m,26,23.0,17,14.0,289,196.0
9,10,KLAUDIA KAZIMIERSKA,POL,4:00.12 PB,1500m,79,48.0,69,38.0,4761,1444.0


In [47]:
complete_analysis("https://worldathletics.org/competitions/olympic-games/paris24/results/women/3000-metres-steeplechase/final/result", "3000m SC", ["https://worldathletics.org/world-rankings/3000msc/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/3000msc/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"] )

Root Mean Squared Error (Rerank): 79.25360139030487
Root Mean Squared Error (IAAF): 6.668333125052067


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1,WINFRED MUTILE YAVI,BRN,8:52.76 OR,3000m SC,,,0.0,0.0,0.0,0.0
1,2,PERUTH CHEMUTAI,UGA,8:53.34 NR,3000m SC,39.0,4.0,37.0,2.0,1369.0,4.0
2,3,FAITH CHEROTICH,KEN,8:55.15 PB,3000m SC,103.0,3.0,100.0,0.0,10000.0,0.0
3,4,ALICE FINOT,FRA,8:58.67 AR,3000m SC,12.0,5.0,8.0,1.0,64.0,1.0
4,5,SEMBO ALMAYEW,ETH,9:00.83 SB,3000m SC,76.0,9.0,71.0,4.0,5041.0,16.0
5,6,BEATRICE CHEPKOECH,KEN,9:04.24,3000m SC,74.0,2.0,68.0,-4.0,4624.0,16.0
6,7,ELIZABETH BIRD,GBR,9:04.35 NR,3000m SC,27.0,11.0,20.0,4.0,400.0,16.0
7,8,LOMI MULETA,ETH,9:06.07 PB,3000m SC,130.0,13.0,122.0,5.0,14884.0,25.0
8,9,NORAH JERUTO,KAZ,9:08.97 SB,3000m SC,158.0,,149.0,0.0,22201.0,0.0
9,10,LEA MEYER,GER,9:09.59 PB,3000m SC,38.0,22.0,28.0,12.0,784.0,144.0


In [48]:
complete_analysis("https://worldathletics.org/competitions/olympic-games/paris24/results/women/800-metres/final/result", "800m", ["https://worldathletics.org/world-rankings/800m/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/800m/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"])

Root Mean Squared Error (Rerank): 109.54565258375159
Root Mean Squared Error (IAAF): 17.240939649566666


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1,KEELY HODGKINSON,GBR,1:56.72,800m,3,1,2,0,4,0
1,2,TSIGE DUGUMA,ETH,1:57.15 PB,800m,221,9,219,7,47961,49
2,3,MARY MORAA,KEN,1:57.42,800m,132,2,129,-1,16641,1
3,4,SHAFIQUA MALONEY,VIN,1:57.66,800m,116,27,112,23,12544,529
4,5,RÉNELLE LAMOTE,FRA,1:58.19,800m,13,16,8,11,64,121
5,6,WORKNESH MESELE,ETH,1:58.28,800m,114,21,108,15,11664,225
6,7,JULIETTE WHITTAKER,USA,1:58.50,800m,75,45,68,38,4624,1444
7,8,PRUDENCE SEKGODISO,RSA,1:58.79,800m,58,11,50,3,2500,9


In [49]:
complete_analysis("https://worldathletics.org/competitions/olympic-games/paris24/results/women/5000-metres/final/result", "5000m", ["https://worldathletics.org/world-rankings/5000m/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/5000m/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"])

Root Mean Squared Error (Rerank): 80.6330422345579
Root Mean Squared Error (IAAF): 20.781602440620407


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1,BEATRICE CHEBET,KEN,14:28.56,5000m,6,1,5,0,25,0
1,2,FAITH KIPYEGON,KEN,14:29.60 SB,5000m,1,12,-1,10,1,100
2,3,SIFAN HASSAN,NED,14:30.61 SB,5000m,87,7,84,4,7056,16
3,4,NADIA BATTOCLETTI,ITA,14:31.64 NR,5000m,36,13,32,9,1024,81
4,5,MARGARET CHELIMO KIPKEMBOI,KEN,14:32.23 SB,5000m,23,22,18,17,324,289
5,6,EJGAYEHU TAYE,ETH,14:32.98,5000m,20,3,14,-3,196,9
6,7,MEDINA EISA,ETH,14:35.43,5000m,112,4,105,-3,11025,9
7,8,KAROLINE BJERKELI GRØVDAL,NOR,14:43.21,5000m,89,17,81,9,6561,81
8,9,GUDAF TSEGAY,ETH,14:45.21 SB,5000m,8,2,-1,-7,1,49
9,10,KARISSA SCHWEIZER,USA,14:45.57,5000m,43,38,33,28,1089,784


In [50]:
complete_analysis("https://worldathletics.org/en/competitions/olympic-games/paris24/results/women/10000-metres/final/result", "10000m", ["https://worldathletics.org/world-rankings/10000m/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/10000m/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"])

Root Mean Squared Error (Rerank): 90.43937195712938
Root Mean Squared Error (IAAF): 16.418282492392436


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1.0,BEATRICE CHEBET,KEN,30:43.25,10000m,6.0,,5.0,0.0,25.0,0.0
1,2.0,NADIA BATTOCLETTI,ITA,30:43.35 NR,10000m,36.0,14.0,34.0,12.0,1156.0,144.0
2,3.0,SIFAN HASSAN,NED,30:44.12 SB,10000m,87.0,9.0,84.0,6.0,7056.0,36.0
3,4.0,MARGARET CHELIMO KIPKEMBOI,KEN,30:44.58,10000m,23.0,22.0,19.0,18.0,361.0,324.0
4,5.0,LILIAN KASAIT RENGERUK,KEN,30:45.04,10000m,22.0,11.0,17.0,6.0,289.0,36.0
5,6.0,GUDAF TSEGAY,ETH,30:45.21,10000m,8.0,1.0,2.0,-5.0,4.0,25.0
6,7.0,FOTYEN TESFAY,ETH,30:46.93,10000m,14.0,16.0,7.0,9.0,49.0,81.0
7,8.0,WEINI KELATI FREZGHI,USA,30:49.98,10000m,44.0,18.0,36.0,10.0,1296.0,100.0
8,9.0,KARISSA SCHWEIZER,USA,30:51.99 SB,10000m,43.0,37.0,34.0,28.0,1156.0,784.0
9,10.0,TSIGIE GEBRESELAMA,ETH,30:54.57,10000m,17.0,5.0,7.0,-5.0,49.0,25.0


In [65]:
complete_analysis("https://worldathletics.org/en/competitions/olympic-games/paris24/results/women/marathon/final/result", "Marathon", ["https://worldathletics.org/world-rankings/marathon/women?regionType=world&page=1&rankDate=2024-07-23&limitByCountry=0", "https://worldathletics.org/world-rankings/marathon/women?regionType=world&page=2&rankDate=2024-07-23&limitByCountry=0"])

Root Mean Squared Error (Rerank): 205.6568410331547
Root Mean Squared Error (IAAF): 54.27230892898805


Unnamed: 0,Place,Competitor,Nat,Mark,Discipline,Index in rerank,Index in IAAF rank,Diff from Rerank,Diff from IAAF,Mean Squared Error Rerank,Mean Squared Error IAAF
0,1.0,SIFAN HASSAN,NED,2:22:55 OR,Marathon,90.0,2.0,89.0,1.0,7921.0,1.0
1,2.0,TIGST ASSEFA,ETH,2:22:58,Marathon,70.0,1.0,68.0,-1.0,4624.0,1.0
2,3.0,HELLEN OBIRI,KEN,2:23:10 PB,Marathon,346.0,11.0,343.0,8.0,117649.0,64.0
3,4.0,SHARON LOKEDI,KEN,2:23:14 PB,Marathon,,40.0,0.0,36.0,0.0,1296.0
4,5.0,AMANE BERISO SHANKULE,ETH,2:23:57,Marathon,199.0,3.0,194.0,-2.0,37636.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
86,,MELAT YISAK KEJETA,GER,DNF,Marathon,217.0,122.0,0.0,0.0,0.0,0.0
87,,JACKLINE SAKILU,TAN,DNF,Marathon,237.0,66.0,0.0,0.0,0.0,0.0
88,,JOAN CHELIMO MELLY,ROU,DNF,Marathon,193.0,145.0,0.0,0.0,0.0,0.0
89,,SINEAD DIVER,AUS,DNF,Marathon,,,0.0,0.0,0.0,0.0


In [71]:
import pandas as pd

data = {
    'WOMEN Event': ['800m', '1500m', '3000m SC', '5000m', '10000m', 'Marathon'],
    'MSE Exponential Decay Rerank (WOMEN)': [109.54565258375159, 35.36830030785572, 79.25360139030487, 80.6330422345579, 90.43937195712938, 205.6568410331547],
    'MSE IAAF Rank (WOMEN)': [17.240939649566666, 18.463928798245153, 6.668333125052067, 20.781602440620407, 16.418282492392436, 54.27230892898805]
}

RMSEdf = pd.DataFrame(data)
RMSEdf


Unnamed: 0,WOMEN Event,MSE Exponential Decay Rerank (WOMEN),MSE IAAF Rank (WOMEN)
0,800m,109.545653,17.24094
1,1500m,35.3683,18.463929
2,3000m SC,79.253601,6.668333
3,5000m,80.633042,20.781602
4,10000m,90.439372,16.418282
5,Marathon,205.656841,54.272309


Comparing between the men's and womens one below:

In [72]:
import pandas as pd

dataMEN = {
    'Event': ['800m', '1500m', '3000m SC', '5000m', '10000m', 'Marathon'],
    'MSE Exponential Decay Rerank (MEN)': [52.15361924162119, 49.83556293785928, 104.02974334294976, 163.25732616165973, 131.43833929655776, 245.42829784422995],
    'MSE IAAF Rank (MEN)': [41.11265012134343, 13.231905884389192, 14.49784466739798, 29.077482697097423, 28.996168329242707, 66.1949048802003]
}

RMSEdfMEN = pd.DataFrame(dataMEN)
RMSEdfMEN


Unnamed: 0,Event,MSE Exponential Decay Rerank (MEN),MSE IAAF Rank (MEN)
0,800m,52.153619,41.11265
1,1500m,49.835563,13.231906
2,3000m SC,104.029743,14.497845
3,5000m,163.257326,29.077483
4,10000m,131.438339,28.996168
5,Marathon,245.428298,66.194905


In [75]:

import pandas as pd
RMSEdf.rename(columns={'WOMEN Event': 'Event'}, inplace=True)

combined_RMSEdf = pd.merge(RMSEdf, RMSEdfMEN, on='Event', how='outer')

combined_RMSEdf


Unnamed: 0,Event,MSE Exponential Decay Rerank (WOMEN),MSE IAAF Rank (WOMEN),MSE Exponential Decay Rerank (MEN),MSE IAAF Rank (MEN)
0,800m,109.545653,17.24094,52.153619,41.11265
1,1500m,35.3683,18.463929,49.835563,13.231906
2,3000m SC,79.253601,6.668333,104.029743,14.497845
3,5000m,80.633042,20.781602,163.257326,29.077483
4,10000m,90.439372,16.418282,131.438339,28.996168
5,Marathon,205.656841,54.272309,245.428298,66.194905


# Claude Config and setup

I wanted to try out Claude 3.5 sonnet just to see how code would be generated and generally (given that it was zero shot) it seems rather accurate! Might just modify the code since it's rather accurate.

In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m18.

In [None]:
# Import necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from google.colab import files
import io
import time
import requests
import math



In [None]:

# Set up Selenium WebDriver for Google Colab
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.81)] [Connected to cloud.r-project.org (18.160.21                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 12.7 kB/128 kB 10%] [Waiting for headers] [Waiting for headers] [Waiting for headers                                                                                                    Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [7

In [None]:

# Step 1: Get all athlete performances >1100 points for 800m -> Marathon
def get_performances(event):
    url = f"https://www.worldathletics.org/records/all-time-toplists/{event}/outdoor/women/senior"
    driver.get(url)

    # Wait for the table to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "records-table")))

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', class_='records-table')

    performances = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) >= 5:
            performance = cols[2].text.strip()
            points = int(cols[4].text.strip())
            if points > 1100:
                performances.append((performance, points))
            else:
                break  # Stop when points fall below 1100

    return performances

events = ['800m', '1500m', '5000m', '10000m', 'marathon']
all_performances = {}

for event in events:
    all_performances[event] = get_performances(event)

# Step 2: Get all outdoor performances and filter out indoor performances
df1 = pd.DataFrame([(event, perf[0], perf[1]) for event, perfs in all_performances.items() for perf in perfs],
                   columns=['Event', 'Performance', 'Points'])

# Step 3: Get list of female athletes who competed in the events through Selenium
def get_athletes(event):
    url = f"https://www.worldathletics.org/records/all-time-toplists/{event}/outdoor/women/senior"
    driver.get(url)

    # Wait for the table to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "records-table")))

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', class_='records-table')

    athletes = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) >= 3:
            athlete = cols[1].text.strip()
            athletes.append(athlete)

    return athletes

all_athletes = []
for event in events:
    all_athletes.extend(get_athletes(event))

df2 = pd.DataFrame(all_athletes, columns=['Athlete'])
df2 = df2.drop_duplicates()

# Step 4: Process names and combine DF1 and DF2
# Assuming the athlete names are in the same format in both dataframes
df1['Athlete'] = df1['Performance'].str.split(' ', n=1).str[1]
df = pd.merge(df1, df2, on='Athlete', how='inner')

print(df.head())
print(f"Total number of performances: {len(df)}")

I think with better prompting Claude's potential is great! but unfortunately needs more customisation, should probably have added some examples.