* Scrape https://eugene.diamondleague.com/program-results-eugene/ for eventname,gender,lastname,firstname,Country,SB,PB,DOB,html link

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
BASE_URL = 'https://eugene.diamondleague.com'
EUGENE_URL = 'https://eugene.diamondleague.com/program-results-eugene/'

In [3]:
def make_request_and_get_soup(url):
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    
    return s

In [4]:
soup = make_request_and_get_soup(EUGENE_URL)

In [5]:
event_names = []
event_elems = soup.findAll("div", {'class': 'name'})
for elem in event_elems:
    event_names.append(elem.text)
    

In [6]:
link_elems = soup.findAll("div", {'class': 'links'})

In [7]:
event_hrefs = []
for link in link_elems:
    a_elem = link.find('a')
    if a_elem:
        event_hrefs.append('{}{}'.format(BASE_URL, a_elem['href']))
    else:
        event_hrefs.append(pd.np.nan)

In [8]:
df = pd.DataFrame()

In [9]:
df['event_name'] = event_names
df['event_hrefs'] = event_hrefs

In [10]:
def get_athlete_hrefs(event_href):
    try:
        event_soup = make_request_and_get_soup(event_href)

        athlete_info = []
        athlete_href_elems = event_soup.findAll('div', {'class': 'column name'})
        athlete_sb_elems = event_soup.findAll('div', {'class': 'column sb'})
        athlete_pb_elems = event_soup.findAll('div', {'class': 'column pb'})
        
        
        for i in range(1, len(athlete_href_elems)):            
            href = '{}{}'.format(BASE_URL, athlete_href_elems[i].find('a')['href'])
            sb = athlete_sb_elems[i].text
            pb = athlete_pb_elems[i].text
            
            athlete_info.append([href, sb, pb])
        
        return athlete_info
    
    except:
        return pd.np.nan
    
    

In [11]:
df['athlete_infos'] = df['event_hrefs'].apply(get_athlete_hrefs)

In [12]:
df.head()

Unnamed: 0,event_name,event_hrefs,athlete_infos
0,Men's Pole Vault,https://eugene.diamondleague.com/program-resul...,[[https://eugene.diamondleague.com/athletes/14...
1,Men High School 100m,,
2,Women High School 100m,,
3,Women's Shot Put,https://eugene.diamondleague.com/program-resul...,[[https://eugene.diamondleague.com/athletes/14...
4,400m Hurdles Men,https://eugene.diamondleague.com/program-resul...,[[https://eugene.diamondleague.com/athletes/14...


In [13]:
# split each athlete onto their own row.
df = df.athlete_infos.apply(pd.Series) \
    .merge(df, right_index = True, left_index = True) \
    .drop(["athlete_infos"], axis = 1) \
    .melt(id_vars = ['event_name', 'event_hrefs'], value_name = "athlete_info") \
    .drop("variable", axis = 1) \
    .dropna()

In [14]:
def return_list_index(x, index=None):
    return x[index]

df['athlete_href'] = df['athlete_info'].apply(return_list_index, index=0)
df['athlete_sb'] = df['athlete_info'].apply(return_list_index, index=1)
df['athlete_pb'] = df['athlete_info'].apply(return_list_index, index=2)
df.drop(columns='athlete_info', inplace=True)

In [15]:
s = make_request_and_get_soup('https://eugene.diamondleague.com/athletes/14377443.html')

In [16]:
def get_athlete_data(athlete_href):
    try:
        athlete_soup = make_request_and_get_soup(athlete_href)
        header = athlete_soup.find('h1')
        
        full_name = header.find('div').text
        full_name_list = full_name.split(' ')
        first_name = full_name_list[0]
        last_name = full_name_list[1]
        
        birthdate = header.find('div', {'class': 'birthdate'}).text
        country = header.find('div', {'class': 'country'}).text
        
        return pd.Series([first_name, last_name, birthdate, country])
    
    except:
        return pd.Series([pd.np.nan, pd.np.nan, pd.np.nan, pd.np.nan])
    



In [17]:
df[['first_name', 'last_name', 'birthdate', 'country']] = df['athlete_href'].apply(get_athlete_data)

In [18]:
def clean_bd(x):
    try: 
        return x.replace('.', '/')
    except: 
        return pd.np.nan
    
    
df['birthdate'] = df['birthdate'].apply(clean_bd)
    

In [22]:
df.drop(columns=['event_hrefs'], inplace=True)
df.sort_values('event_name', inplace=True)

In [24]:
df = df[['event_name', 'last_name', 'first_name', 'country', 'athlete_sb', 'athlete_pb', 'birthdate', 'athlete_href']]

In [25]:
df.to_csv('pre_stats.csv', index=False)

Unnamed: 0,event_name,last_name,first_name,country,athlete_sb,athlete_pb,birthdate,athlete_href
143,1 Mile Men,KAAZOUZI,Brahim,Morocco,,,15/06/1990,https://eugene.diamondleague.com/athletes/1447...
125,1 Mile Men,INGEBRIGTSEN,Jakob,Norway,,3:52.28,19/09/2000,https://eugene.diamondleague.com/athletes/1465...
107,1 Mile Men,INGEBRIGTSEN,Filip,Norway,,3:53.23,20/04/1993,https://eugene.diamondleague.com/athletes/1445...
35,1 Mile Men,CENTROWITZ,Matthew,United States,,3:50.53,18/10/1989,https://eugene.diamondleague.com/athletes/1423...
161,1 Mile Men,Motonei,Elijah,Kenya,,3:49.08,05/01/1993,https://eugene.diamondleague.com/athletes/1459...
71,1 Mile Men,ENGELS,Craig,United States,,3:55.12,01/05/1994,https://eugene.diamondleague.com/athletes/1446...
197,1 Mile Men,TEFERA,Samuel,Ethiopia,,3:51.26,23/10/1999,https://eugene.diamondleague.com/athletes/1479...
17,1 Mile Men,BIRGEN,Bethwell,Kenya,,3:50.42,06/08/1988,https://eugene.diamondleague.com/athletes/1436...
179,1 Mile Men,SOULEIMAN,Ayanleh,Djibouti,,3:47.32,03/12/1992,https://eugene.diamondleague.com/athletes/1437...
215,1 Mile Men,WILLIS,Nick,New Zealand,,3:49.83,25/04/1983,https://eugene.diamondleague.com/athletes/1421...
