In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import product
from google.colab import files

In [23]:
def get_page_content(url, headers, parameters):
  response = requests.get(url, headers = headers, params = parameters).text
  soup = BeautifulSoup(response, 'html.parser')
  return soup

def parse_athletes(soup, kimono, category, gender, belt, division):
  table = soup.find('table')
  if not table:
    return None

  athletes = []
  rows = table.find_all('tr')
  if not rows:
    return None

  for row in rows:
    photo_cell = row.find('td', class_ = 'photo reduced')
    name_cell = row.find('td', class_ = 'name-academy')
    points_cell = row.find('td', class_ = 'pontuation')
    rank_cell = row.find('td', class_ = 'position')

    photo = photo_cell.find('img')['src']
    name_tag = name_cell.find('div', class_ = 'name').find('a')
    name = name_tag.get_text(strip=True)
    details = DOMAIN + name_tag['href']
    points = points_cell.get_text(strip=True)
    rank = rank_cell.get_text(strip=True)

    athlete = {
        'photo': photo,
        'name': name,
        'details': details,
        'points': points,
        'rank': rank,
        'kimono': kimono,
        'category': category,
        'gender': gender,
        'belt': belt,
        'division': division
    }

    athletes.append(athlete)
  return athletes

def list_filters(soup, filter_id):
  filters = soup.find(id = filter_id).find_all('option')
  return [item['value'] for item in filters[1:]]

In [24]:
DOMAIN = 'https://ibjjf.com'
URL = f'{DOMAIN}/2024-athletes-ranking'
HEADERS = {'User-Agent': 'Mozilla/5.0'}
PARAMETERS = {
    'utf8': '✓',
    'filters[s]': 'ranking-geral-gi',
    'filters[ranking_category]': 'adult',
    'filters[gender]': 'male',
    'filters[belt]': 'black',
    'filters[weight]': None,
    'page': 1
    }

soup_filters = get_page_content(URL, HEADERS, PARAMETERS)
kimono = list_filters(soup_filters, 'filters_s')
category = list_filters(soup_filters, 'filters_ranking_category')
gender = list_filters(soup_filters, 'filters_gender')
belt = list_filters(soup_filters, 'filters_belt')
division = list_filters(soup_filters, 'weight_filter')

print(kimono, category, gender, belt, division)

['ranking-geral-gi', 'ranking-geral-no-gi'] ['adult', 'master', 'juvenile', 'kids'] ['male', 'female'] ['black', 'brown', 'purple', 'blue', 'white'] ['rooster', 'lightfeather', 'feather', 'light', 'middle', 'mediumheavy', 'heavy', 'superheavy', 'ultraheavy', 'openclass']


In [25]:
all_athletes = []

for k, c, g, b, d in product(kimono, category, gender, belt, division):
  page = 1
  while page <= 2:
  #while True: (To obtain the whole page)
    print(f'Scraping: {k}, {c}, {g}, {b}, {d} for page {page}')
    PARAMETERS['filters[s]']: k
    PARAMETERS['filters[ranking_category]']: c
    PARAMETERS['filters[gender]']: g
    PARAMETERS['filters[belt]']: b
    PARAMETERS['filters[weight]']: d
    PARAMETERS['page']: page
    soup_athletes = get_page_content(URL, HEADERS, PARAMETERS)
    athletes = parse_athletes(soup_athletes, k, c, g, b, d)
    if athletes is None:
      break
    all_athletes.extend(athletes)
    page += 1

Scraping: ranking-geral-gi, adult, male, black, rooster for page 1
Scraping: ranking-geral-gi, adult, male, black, rooster for page 2
Scraping: ranking-geral-gi, adult, male, black, lightfeather for page 1
Scraping: ranking-geral-gi, adult, male, black, lightfeather for page 2
Scraping: ranking-geral-gi, adult, male, black, feather for page 1
Scraping: ranking-geral-gi, adult, male, black, feather for page 2
Scraping: ranking-geral-gi, adult, male, black, light for page 1
Scraping: ranking-geral-gi, adult, male, black, light for page 2
Scraping: ranking-geral-gi, adult, male, black, middle for page 1
Scraping: ranking-geral-gi, adult, male, black, middle for page 2
Scraping: ranking-geral-gi, adult, male, black, mediumheavy for page 1
Scraping: ranking-geral-gi, adult, male, black, mediumheavy for page 2
Scraping: ranking-geral-gi, adult, male, black, heavy for page 1
Scraping: ranking-geral-gi, adult, male, black, heavy for page 2
Scraping: ranking-geral-gi, adult, male, black, superh

In [26]:
df_athletes = pd.json_normalize(all_athletes)

In [27]:
df_athletes

Unnamed: 0,photo,name,details,points,rank,kimono,category,gender,belt,division
0,https://api.ibjjfdb.com/Athletes/134127/Rankin...,Erich Munis dos Santos,https://ibjjf.com/athletes/erich-santos,877.0,1,ranking-geral-gi,adult,male,black,rooster
1,https://api.ibjjfdb.com/Athletes/88408/Ranking...,Fellipe Andrew Leandro Silva,https://ibjjf.com/athletes/fellipe-silva,661.0,2,ranking-geral-gi,adult,male,black,rooster
2,https://api.ibjjfdb.com/Athletes/56444/Ranking...,Adam Wardzinski,https://ibjjf.com/athletes/adam-wardzinski,591.5,3,ranking-geral-gi,adult,male,black,rooster
3,https://api.ibjjfdb.com/Athletes/82191/Ranking...,Diego Oliveira Batista,https://ibjjf.com/athletes/diego-batista-65fb1...,532.5,4,ranking-geral-gi,adult,male,black,rooster
4,https://api.ibjjfdb.com/Athletes/133443/Rankin...,Victor Hugo Costa Marques,https://ibjjf.com/athletes/victor-marques-bbe0...,468.0,5,ranking-geral-gi,adult,male,black,rooster
...,...,...,...,...,...,...,...,...,...,...
15995,https://api.ibjjfdb.com/Athletes/125159/Rankin...,Micael Ferreira Galvâo,https://ibjjf.com/athletes/micael-galvao,459,6,ranking-geral-no-gi,kids,female,white,openclass
15996,https://api.ibjjfdb.com/Athletes/29697/Ranking...,Meyram Maquiné Alves,https://ibjjf.com/athletes/meyram-alves,438.0,7,ranking-geral-no-gi,kids,female,white,openclass
15997,https://api.ibjjfdb.com/Athletes/49233/Ranking...,Mateus Rodrigues de Souza,https://ibjjf.com/athletes/mateus-souza,434.5,8,ranking-geral-no-gi,kids,female,white,openclass
15998,https://api.ibjjfdb.com/Athletes/49978/Ranking...,Gutemberg de Jesus Santos Pereira,https://ibjjf.com/athletes/gutemberg-pereira,419.0,9,ranking-geral-no-gi,kids,female,white,openclass


In [28]:
excel_file = '/content/drive/MyDrive/Aula Webscraping IBJJF/athletes.xlsx'
df_athletes.to_excel(excel_file, index=False)