## Predicting CrossFit Games Ranking based on Benchmark Workout Performance
A machine learning model that analyzes over 1 million observations of CrossFit athletes' benchmark workout data to predict ranking in the CrossFit Games

The CrossFit Games website (https://games.crossfit.com/athletes) has publicly available data on athletes’ statistics for 14 benchmark workouts: Back Squat, Chad, Clean and Jerk, Deadlift, Fight Gone Bad, Filthy 50, Fran, Grace, Helen, Max Pull-ups, Run 5k, Snatch, Sprint 400m. These stats are self-reported by the athlete; it is an optional input when an athlete signs up for the CrossFit Games. It also contains information on athletes' affiliate gym, region, division, games level, qualifier event, and rank. 

In [1]:
# Import Libraries
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## Scrape athlete data from the [CrossFit Games](https://games.crossfit.com/athletes) website

#### Explore the structure of one athlete's profile

In [2]:
url = "https://games.crossfit.com/athlete/591912"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)

<!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en-US"><!--<![endif]-->
<head>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<title>Athlete: Laura Horvath | CrossFit Games</title>
<meta content="English" name="language"/>
<meta content="CrossFit LLC." name="author"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="index, follow" name="robots"/>
<link href="https://games.crossfit.com/athlete/591912" rel="canonical"/>
<meta content="22565487675,156147724427094,948262938543680" property="fb:pages">
<meta content="CrossFit Games | The Fittest on Earth" property="og:title">
<meta content="https://assets.crossfit.com/build/img/sites/games/logos/share.jpg" property="og:image">
<meta content="https://a

In [3]:
# Create empty lists to store relevant athlete data
athlete_names = []
athlete_affiliates = []
country_names = []
competitions = []
placements = []
regions = []
divisions = []

In [4]:
athlete = soup.find_all('div', class_='athlete-name')
athlete

[<div class="athlete-name">
 <h3>
 <span>Laura</span>
 <span>Horvath</span>
 </h3>
 </div>]

In [5]:
# Extract relevant fields
for athlete in athlete:
    athlete_name = ' '.join(span.text for span in athlete.find_all('span'))
    athlete_affiliate = athlete.find_next('div', class_='athlete-affiliate').a.text.strip()
    country_name = athlete.find_previous('span', class_='country-name').text.strip()
    competition = athlete.find_next('span', class_='competition').text.strip()
    placement = athlete.find_next('span', class_='placement').a.text.strip()
    region = athlete.find_next('span', class_='region').text.strip()
    division = athlete.find_next('span', class_='division').text.strip()
    
    # Append data to lists
    athlete_names.append(athlete_name)
    athlete_affiliates.append(athlete_affiliate)
    country_names.append(country_name)
    competitions.append(competition)
    placements.append(placement)
    regions.append(region)
    divisions.append(division)

athlete_data = {
    'Name': athlete_names,
    'Affiliate': athlete_affiliates,
    'Country': country_names,
    'Competition': competitions,
    'Placement': placements,
    'Region': regions,
    'Division': divisions
}

athlete_df = pd.DataFrame(athlete_data)
athlete_df

Unnamed: 0,Name,Affiliate,Country,Competition,Placement,Region,Division
0,Laura Horvath,CrossFit Glasshouse,Hungary,2023 games\n1st\nworldwide,1st,worldwide,Women


In [6]:
# Find all the tables inside stats-section to extract benchmark workout data
tables = soup.select('.stats-section table')
tables

[<table class="stats">
 <tbody>
 <tr>
 <th class="stats-header" scope="row"> Back Squat </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Chad1000x </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Clean and Jerk </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Deadlift </th>
 <td> --</td>
 </tr>
 </tbody>
 </table>,
 <table class="stats">
 <tbody>
 <tr>
 <th class="stats-header" scope="row"> Fight Gone Bad </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Filthy 50 </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Fran </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> Grace </th>
 <td> --</td>
 </tr>
 </tbody>
 </table>,
 <table class="stats">
 <tbody>
 <tr>
 <th class="stats-header" scope="row"> Helen </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row"> L1 Benchmark </th>
 <td> --</td>
 </tr>
 <tr>
 <th class="stats-header" scope="row

In [7]:
exercises = []
results = []

# Iterate through each table and extract stats-header and result values
for table in tables:
    rows = table.select('tr')
    for row in rows:
        exercise = row.select_one('.stats-header').text.strip()
        result = row.select_one('td').text.strip()
        exercises.append(exercise)
        results.append(result)

# Add exercise columns to the athlete DataFrame
for exercise, result in zip(exercises, results):
    athlete_df[exercise] = result

athlete_df

Unnamed: 0,Name,Affiliate,Country,Competition,Placement,Region,Division,Back Squat,Chad1000x,Clean and Jerk,...,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
0,Laura Horvath,CrossFit Glasshouse,Hungary,2023 games\n1st\nworldwide,1st,worldwide,Women,--,--,--,...,--,--,--,--,--,--,--,--,--,--


#### Expand code to scrape all athletes' benchmark and profile data

In [8]:
# Create empty lists to store athlete data
athlete_data_list = []

# Loop through every combination of the URL with a six-digit ID
for athlete_id in range(1, 1000000): # [469656,591912,621610]:
    url = f"https://games.crossfit.com/athlete/{str(athlete_id).zfill(6)}"
    response = requests.get(url)

    # Check if the page exists
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Create empty lists to store athlete data
        athlete_names = []
        athlete_affiliates = []
        country_names = []
        competitions = []
        placements = []
        regions = []
        divisions = []
        stats_level_block = []

        # Extract athlete information
        athletes = soup.find_all('div', class_='athlete-name')

        for athlete in athletes:
            athlete_name = ' '.join(span.text for span in athlete.find_all('span'))
            
            athlete_affiliate_elem = athlete.find_next('div', class_='athlete-affiliate')
            athlete_affiliate = athlete_affiliate_elem.a.text.strip() if (athlete_affiliate_elem and athlete_affiliate_elem.a) else ""
            
            country_elem = athlete.find_next('span', class_='country-name')
            country_name = country_elem.text.strip() if country_elem else ""
                        
            division_elem = athlete.find_next('span', class_='division')
            division = division_elem.text.strip() if division_elem else ""

            competition_elem = athlete.find_next('span', class_='competition')
            competition = competition_elem.text.strip() if competition_elem else ""

            placement_elem = athlete.find_next('span', class_='placement')
            placement = placement_elem.a.text.strip() if placement_elem else ""
           
            region_elem = athlete.find_next('span', class_='region')
            region = region_elem.text.strip() if region_elem else ""

            stats_level_block_elem = soup.select_one('.stats-level-block ul.tab-nav li.active a')
            stats_level_block_text = stats_level_block_elem.text.strip() if stats_level_block_elem else ""
            stats_level_block.append(stats_level_block_text)

            # Append data to lists
            athlete_names.append(athlete_name)
            athlete_affiliates.append(athlete_affiliate)
            country_names.append(country_name)
            competitions.append(competition)
            placements.append(placement)
            regions.append(region)
            divisions.append(division)

        # Create a DataFrame for athlete information
        athlete_data = {
            'Athlete': athlete_names,
            'Affiliate': athlete_affiliates,
            'Country': country_names,
            'Competition': competitions,
            'Placement': placements,
            'Region': regions,
            'Division': divisions,
            'Division_2': stats_level_block
        }

        athlete_df = pd.DataFrame(athlete_data)

        # Create empty lists to store benchmark workout data
        exercises = []
        results = []

        # Find all the tables inside stats-section
        tables = soup.select('.stats-section table')

        # Iterate through each table and extract stats-header and result values
        for table in tables:
            rows = table.select('tr')
            for row in rows:
                exercise = row.select_one('.stats-header').text.strip()
                result = row.select_one('td').text.strip()
                exercises.append(exercise)
                results.append(result)

        # Add exercise columns to the athlete DataFrame
        for exercise, result in zip(exercises, results):
            athlete_df[exercise] = result

        athlete_data_list.append(athlete_df)

final_df = pd.concat(athlete_data_list, ignore_index=True)
pd.set_option('display.max_columns', None)
final_df.to_csv('data_raw.csv')

# Replace blank values with NaN
final_df.replace('--', np.nan, inplace=True)
final_df.replace('', np.nan, inplace=True)

final_df

Unnamed: 0,Athlete,Affiliate,Country,Competition,Placement,Region,Division,Division_2,Back Squat,Chad1000x,Clean and Jerk,Deadlift,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
0,Vincent Zadnancin,,,,,,,,,,,,,,,,,,,,,
1,Loris Moretto,,,,,,,,,,,,,,,,,,,,,
2,Gael Perez,,,,,,,,,,,,,,,,,,,,,
3,Gary Worth,,,,,,,,,,,,,,,,,,,,,
4,Sjoerd Braamhaar,Twee12 CrossFit,,,,Europe,Men,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39842,Annelie Van Deventer,,,,,Africa,Women,,,,,,,,,,,,,,,
39843,Stephan Taling,CA CrossFit,,,,Netherlands,Men,,100 kg,,70 kg,170 kg,,,,,,,,,50 kg,
39844,Fournier Yoann,CrossFit Sainte-Marie,,,,Africa Middle East,Men,,,,,,,,,,,,,,,
39845,Daisy van de Sluis,CrossFit Tiel,,,,Netherlands,Women,,,,,,,,,,,,,,,


In [9]:
# Drop any row with no data
df = final_df.dropna(subset=final_df.columns[1:], how='all')

# Drop any row where there is no benchmark workout data
benchmark_workouts = ['Back Squat', 'Chad1000x', 'Clean and Jerk',
                      'Deadlift', 'Fight Gone Bad', 'Filthy 50', 'Fran', 'Grace', 'Helen',
                      'L1 Benchmark', 'Max Pull-ups', 'Run 5k', 'Snatch', 'Sprint 400m']
df_ = df.dropna(subset=benchmark_workouts, how='all')

df_.to_csv('data.csv')
df_

Unnamed: 0,Athlete,Affiliate,Country,Competition,Placement,Region,Division,Division_2,Back Squat,Chad1000x,Clean and Jerk,Deadlift,Fight Gone Bad,Filthy 50,Fran,Grace,Helen,L1 Benchmark,Max Pull-ups,Run 5k,Snatch,Sprint 400m
24,Nicolas Freund,,,,,,,,130 kg,,,155 kg,,,,,,,40,,,1:29
33,Brett Wood,,,,,West Coast,Men,,,,315 lb,,,,,,,,,,255 lb,
39,Thierry Martel,CrossFit Eight,,,,Samoa,Men,,180 kg,,155 kg,1 kg,1,10000:00,2:25,1:44,10000:00,,1,10000:00,130 kg,1:00
83,Eveline Meister,CrossFit Zug,,,,Europe South,Women,,68 kg,,52 kg,100 kg,,34:06,8:21,3:27,14:04,,,26:30,38 kg,
84,Alexey Gorbik,,,,,Russian Federation,Men,,160 lb,,120 lb,165 lb,,,3:30,,,,54,21:00,100 lb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39817,Tristan Bi,,,,,Asia,Men,,185 kg,,135 kg,210 kg,,,2:40,,,,40,19:00,110 kg,1:00
39824,Rachel Sullivan,,,,,Europe,Women,,78 lb,,50 kg,135 lb,,,,,,,,,30 kg,
39827,Emilia Delacour,CrossFit Biarritz,,,,Europe,Women,,90 kg,,65 kg,110 kg,,,6:50,,,,25,,50 kg,
39828,Carl Edwards,,,2023 open\n1594th\nworldwide,1594th,worldwide,Men (60-64),Men (60-64),95 lb,,75 lb,135 lb,,,10:44,,,,,23:00,65 lb,
