# **World Championship 2022**
## **49er and 49erFX**

In [4]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [5]:
options = Options()
options.add_argument("--headless")

Check your chrome version searching for chrome://version. If your chrome version is 115 or newer, you can install chromedriver here https://googlechromelabs.github.io/chrome-for-testing/#stable. Else, check it here https://chromedriver.chromium.org/downloads. After downloading, unzip the file, move it to the same folder as this notebook and inform the path in the next cell.

In [6]:
PATH_TO_CHROMEDRIVER = 'chromedriver-mac-x64/chromedriver'

In [3]:
URL = 'https://49er.org/events/2022-world-championship/#result-49'

driver = Chrome(PATH_TO_CHROMEDRIVER, options=options)
driver.get(URL)
html = driver.page_source

soup = BeautifulSoup(html, "html.parser")

In [4]:
data = []

for row in soup.find_all('tr'):
    resultados = []
    for cell in row.find_all('td'):
        # if cell contains <strike> tag, add () around it
        if cell.find('strike'):
            cell.string = '(' + cell.string + ')'
        resultados.append(cell.text)
    data.append(resultados)

In [5]:
# turn data into a pandas dataframe
df = pd.DataFrame(data)

# change column names
df.columns = ['Posição Geral', 'Sail Number', 'Nome Competidor', 'Nett', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 
              'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Pontuação Total']

# drop Sail Number column
df.drop('Sail Number', axis=1, inplace=True)

In [6]:
df_49er = df.iloc[1:66, :]
df_49erFX = df.iloc[67:103, :]

In [7]:
# turn all the Q1, Q2, ..., Q17 columns into rows
df_49er = pd.melt(df_49er, id_vars=['Posição Geral', 'Nome Competidor', 'Nett', 'Pontuação Total'], 
                  value_vars=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 
                              'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 
                              'Q13', 'Q14', 'Q15', 'Q16', 'Q17'])

df_49erFX = pd.melt(df_49erFX, id_vars=['Posição Geral', 'Nome Competidor', 'Nett', 'Pontuação Total'],
                    value_vars=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 
                                'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 
                                'Q13', 'Q14', 'Q15', 'Q16', 'Q17'])

In [8]:
# rename column 'variable' to 'Flotilha'
df_49er.rename(columns={'variable': 'Flotilha',
                        'value': 'Pontuação Regata'}, inplace=True)

df_49erFX.rename(columns={'variable': 'Flotilha',
                          'value': 'Pontuação Regata'}, inplace=True)

In [9]:
# define functions to rename fleet names
def rename_fleet_49er(x):
    if x == 'Q17':
        x = 'MR'
    elif x == 'Q16':
        x = 'P'
    else:
        x = 'G'
    return x

def rename_fleet_49erFX(x):
    if x == 'Q16':
        x = 'MR'
    else:
        x = 'G'
    return x

df_49er['Flotilha'] = df_49er['Flotilha'].apply(rename_fleet_49er)
df_49erFX['Flotilha'] = df_49erFX['Flotilha'].apply(rename_fleet_49erFX)

In [10]:
df_49er['Nome Competição'] = ['World Championship 2022'] * len(df_49er)
df_49er['ID Competição'] = [12] * len(df_49er)
df_49er['Classe Vela'] = ['49er'] * len(df_49er)
df_49er['Punição'] = [''] * len(df_49er)

df_49erFX['Nome Competição'] = ['World Championship 2022'] * len(df_49erFX)
df_49erFX['ID Competição'] = [12] * len(df_49erFX)
df_49erFX['Classe Vela'] = ['49erFX'] * len(df_49erFX)
df_49erFX['Punição'] = [''] * len(df_49erFX)

In [11]:
df_49er.head()

Unnamed: 0,Posição Geral,Nome Competidor,Nett,Pontuação Total,Flotilha,Pontuação Regata,Nome Competição,ID Competição,Classe Vela,Punição
0,1,Bart LAMBRIEXFloris van de WERKEN,85.0,100.0,G,12.0,World Championship 2022,12,49er,
1,2,Diego BOTINFlorian TRITTEL PAUL,107.0,132.0,G,6.0,World Championship 2022,12,49er,
2,3,Sime FANTELAMihovil FANTELA,109.0,134.0,G,3.0,World Championship 2022,12,49er,
3,4,Isaac McHARDIEWilliam McKENZIE,116.0,138.0,G,(22.0),World Championship 2022,12,49er,
4,5,Łukasz PRZYBYTEKJacek PIASECKI,117.0,138.0,G,3.0,World Championship 2022,12,49er,


In [12]:
df_49erFX.head()

Unnamed: 0,Posição Geral,Nome Competidor,Nett,Pontuação Total,Flotilha,Pontuação Regata,Nome Competição,ID Competição,Classe Vela,Punição
0,1,Odile van AANHOLTAnnette DUETZ,84.0,113.0,G,2.0,World Championship 2022,12,49erFX,
1,2,Vilma BOBECKRebecca NETZLER,103.0,134.0,G,5.0,World Championship 2022,12,49erFX,
2,3,Támara ECHEGOYENPaula BARCELÓ,127.0,151.0,G,4.0,World Championship 2022,12,49erFX,
3,4,Aleksandra MELZACKASandra JANKOWIAK,151.0,183.0,G,17.0,World Championship 2022,12,49erFX,
4,5,Stephanie ROBLEMaggie SHEA,155.0,187.0,G,14.0,World Championship 2022,12,49erFX,


In [13]:
column_order = ['Nome Competidor', 'ID Competição', 'Classe Vela', 
                'Pontuação Regata', 'Flotilha', 'Posição Geral', 
                'Punição', 'Pontuação Total', 'Nett', 'Nome Competição']

df_49er = df_49er[column_order]
df_49erFX = df_49erFX[column_order]

In [14]:
df_49er.to_csv('scraped-data/World Championship 2022_49er.csv', index=False)
df_49erFX.to_csv('scraped-data/World Championship 2022_49erFX.csv', index=False)

---

In [8]:
URL = 'https://nacra17.org/event/2022-world-championship/#nacra17-results'

driver = Chrome(PATH_TO_CHROMEDRIVER, options=options)
driver.get(URL)
html = driver.page_source

soup = BeautifulSoup(html, "html.parser")

In [9]:
data = []

for row in soup.find_all('tr'):
    resultados = []
    for cell in row.find_all('td'):
        # if cell contains <strike> tag, add () around it
        if cell.find('strike'):
            cell.string = '(' + cell.string + ')'
        resultados.append(cell.text)
    data.append(resultados)

In [10]:
# turn data into a pandas dataframe
df = pd.DataFrame(data)

# change column names
df.columns = ['Posição Geral', 'Sail Number', 'Nome Competidor', 'Nett', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 
              'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Pontuação Total']

# drop Sail Number column
df.drop('Sail Number', axis=1, inplace=True)

In [11]:
# turn all the Q1, Q2, ..., Q17 columns into rows
df = pd.melt(df, id_vars=['Posição Geral', 'Nome Competidor', 'Nett', 'Pontuação Total'], 
             value_vars=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 
                         'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 
                         'Q13', 'Q14', 'Q15', 'Q16', 'Q17'])

In [12]:
# rename column 'variable' to 'Flotilha'
df.rename(columns={'variable': 'Flotilha',
                   'value': 'Pontuação Regata'}, inplace=True)

In [13]:
def rename_fleet_49erFX(x):
    if x == 'Q17':
        x = 'MR'
    else:
        x = 'G'
    return x

df['Flotilha'] = df['Flotilha'].apply(rename_fleet_49erFX)

In [15]:
df['Nome Competição'] = ['World Championship 2022'] * len(df)
df['ID Competição'] = [12] * len(df)
df['Classe Vela'] = ['Nacra17'] * len(df)
df['Punição'] = [''] * len(df)

In [16]:
df.head()

Unnamed: 0,Posição Geral,Nome Competidor,Nett,Pontuação Total,Flotilha,Pontuação Regata,Nome Competição,ID Competição,Classe Vela,Punição
0,,,,,G,,World Championship 2022,12,Nacra17,
1,1.0,Ruggero TITACaterina BANTI,26.0,46.0,G,1.0,World Championship 2022,12,Nacra17,
2,2.0,Gianluigi UGOLINIMaria GIUBILEI,86.0,115.0,G,3.0,World Championship 2022,12,Nacra17,
3,3.0,Sinem KURTBAYAkseli KESKINEN,101.0,115.0,G,5.0,World Championship 2022,12,Nacra17,
4,4.0,John GIMSONAnna BURNET,104.0,122.0,G,4.0,World Championship 2022,12,Nacra17,


In [None]:
df.to_excel('scraped-data/World Championship 2022_Nacra17.xlsx', index=False)