* Scrape https://eugene.diamondleague.com/program-results-eugene/ for: event name, gender, last name, first name, country, SB, PB, DOB, HTML link

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Site root; relative hrefs scraped from the pages are appended to it.
BASE_URL = 'https://eugene.diamondleague.com'
# Program/results index page, expressed relative to the site root.
EUGENE_URL = BASE_URL + '/program-results-eugene/'

In [3]:
def make_request_and_get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``'html.parser'`` backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the requested content.
    r.raise_for_status()

    return BeautifulSoup(r.text, 'html.parser')

In [4]:
# Download and parse the Eugene program/results index page.
soup = make_request_and_get_soup(EUGENE_URL)

In [5]:
# Text of every <div class="name"> on the index page, in document order.
event_elems = soup.find_all("div", class_="name")
event_names = [event_elem.text for event_elem in event_elems]
    

In [6]:
# Every <div class="links"> on the index page, in document order.
link_elems = soup.find_all("div", class_="links")

In [7]:
# One entry per 'links' div so the list keeps the same length and order as
# link_elems: an absolute URL when the div has a usable anchor, NaN otherwise.
event_hrefs = []
for link in link_elems:
    a_elem = link.find('a')
    # Guard against a missing anchor AND an anchor without an href attribute
    # (indexing a_elem['href'] would raise KeyError in the latter case).
    if a_elem is not None and a_elem.get('href'):
        event_hrefs.append('{}{}'.format(BASE_URL, a_elem['href']))
    else:
        # float('nan') is the same value as numpy.nan; the `pd.np` alias
        # used previously no longer exists in pandas >= 2.0.
        event_hrefs.append(float('nan'))

In [8]:
# Start from an empty frame; its columns are attached in the following cells.
df = pd.DataFrame()

In [9]:
# Attach the scraped event titles and event page links as columns.
for column_name, column_values in (
    ('event_name', event_names),
    ('event_hrefs', event_hrefs),
):
    df[column_name] = column_values

In [10]:
def get_athlete_hrefs(event_href):
    """Collect the athlete entries listed on one event page.

    Args:
        event_href: Absolute URL of the event page, or NaN/None for events
            that have no page link.

    Returns:
        A list with one ``[href, sb, pb]`` entry per athlete, where ``href``
        is the absolute athlete-page URL and ``sb`` / ``pb`` are whatever the
        'column sb' / 'column pb' lookups return (a tag object or ``None``).
        Returns NaN when ``event_href`` is not a string or the page cannot be
        fetched.
    """
    # Events without a link carry NaN instead of a URL: skip the request.
    if not isinstance(event_href, str):
        return float('nan')

    try:
        event_soup = make_request_and_get_soup(event_href)
    except Exception:
        # Best effort: one unreachable event must not abort the whole scrape.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt through
        # so a long-running scrape can be stopped.
        return float('nan')

    athlete_info = []
    athlete_elems = event_soup.find_all('div', {'class': 'column name'})
    # The first match is skipped -- presumably the table's header row;
    # TODO confirm against the page markup.
    for elem in athlete_elems[1:]:
        link = elem.find('a')
        if link is None or not link.get('href'):
            # A row without a usable athlete link is dropped on its own
            # rather than discarding every athlete of the event.
            continue
        href = '{}{}'.format(BASE_URL, link['href'])
        # NOTE(review): these lookups only search *inside* the 'column name'
        # div, and they store tag objects rather than text. If the sb/pb
        # columns are siblings of the name column both values are always
        # None -- verify against the page markup.
        sb = elem.find('div', {'class': 'column sb'})
        pb = elem.find('div', {'class': 'column pb'})
        athlete_info.append([href, sb, pb])

    return athlete_info
    
    

In [11]:
# Fetch every event page (one HTTP request per row) and store what
# get_athlete_hrefs returns for it: a list of [href, sb, pb] entries, or NaN.
df['athlete_infos'] = df['event_hrefs'].apply(get_athlete_hrefs)

In [12]:
# Reshape so that every athlete entry ends up on a row of its own.
# 1) spread each list of athlete entries over numbered columns
athlete_columns = df['athlete_infos'].apply(pd.Series)
# 2) put the event columns back next to them (aligned on the index)
wide = athlete_columns.merge(df, left_index=True, right_index=True)
wide = wide.drop(columns=["athlete_infos"])
# 3) unpivot the numbered columns into a single 'athlete_info' column
long = wide.melt(id_vars=['event_name', 'event_hrefs'], value_name="athlete_info")
# 4) discard the helper column and the rows left empty by padding
df = long.drop(columns="variable").dropna()

In [14]:
def return_list_index(x, index=None):
    """Return the element of *x* stored under *index*, i.e. ``x[index]``."""
    selected = x[index]
    return selected

# Unpack the [href, sb, pb] triples into one column per position, then
# discard the packed column.
for position, column_name in enumerate(('athlete_href', 'athlete_sb', 'athlete_pb')):
    df[column_name] = df['athlete_info'].apply(return_list_index, index=position)
df.drop(columns='athlete_info', inplace=True)

In [15]:
# Exploratory fetch of a single athlete page; `s` is not referenced by any
# later cell visible in this notebook.
s = make_request_and_get_soup('https://eugene.diamondleague.com/athletes/14377443.html')

In [16]:
def get_athlete_data(athlete_href):
    """Fetch one athlete page and extract name, birthdate and country.

    Args:
        athlete_href: Absolute URL of the athlete page.

    Returns:
        A 4-item ``pd.Series`` ordered as
        ``[first_name, last_name, birthdate, country]``. Each value that
        cannot be determined is NaN; all four are NaN when ``athlete_href``
        is not a string, the page cannot be fetched, or it has no ``<h1>``.
    """
    nan = float('nan')

    def all_missing():
        return pd.Series([nan, nan, nan, nan])

    def text_or_nan(tag):
        # A missing element yields NaN for that field only.
        return tag.text if tag is not None else nan

    if not isinstance(athlete_href, str):
        return all_missing()

    try:
        athlete_soup = make_request_and_get_soup(athlete_href)
    except Exception:
        # Best effort: one unreachable athlete page must not abort the
        # scrape. Unlike a bare `except:`, KeyboardInterrupt still
        # propagates so the run can be stopped.
        return all_missing()

    header = athlete_soup.find('h1')
    if header is None:
        return all_missing()

    first_name = last_name = nan
    name_elem = header.find('div')
    if name_elem is not None:
        # split() without an argument ignores leading, trailing and repeated
        # whitespace. The first token is the first name; everything after it
        # is kept as the last name so multi-word surnames are not truncated.
        name_parts = name_elem.text.split()
        if name_parts:
            first_name = name_parts[0]
            if len(name_parts) > 1:
                last_name = ' '.join(name_parts[1:])

    birthdate = text_or_nan(header.find('div', {'class': 'birthdate'}))
    country = text_or_nan(header.find('div', {'class': 'country'}))

    return pd.Series([first_name, last_name, birthdate, country])
    



In [None]:
# Fetch each athlete page (one HTTP request per row) and spread the 4-item
# Series returned by get_athlete_data over four new columns.
df[['first_name', 'last_name', 'birthdate', 'country']] = df['athlete_href'].apply(get_athlete_data)

In [None]:
def clean_bd(x):
    """Return birthdate string *x* with every '.' replaced by '/'.

    Values that cannot be processed this way (``None``, NaN, numbers, ...)
    are returned as NaN.
    """
    try:
        return x.replace('.', '/')
    except (AttributeError, TypeError):
        # AttributeError: x has no .replace (None, float NaN, int, ...).
        # TypeError: x has a .replace that does not accept two strings.
        return float('nan')
    
    
# Rewrite the birthdate column value by value: '.' separators become '/',
# and values clean_bd cannot process become NaN.
df['birthdate'] = df['birthdate'].apply(clean_bd)
    

In [None]:
# Display the last rows of the assembled table as a quick visual check.
df.tail()