In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import requests
from bs4 import BeautifulSoup

import datetime

### Base URL

In [2]:
base_season_url = "http://www.bvbinfo.com/season.asp"
base_season_summ_url = "http://www.bvbinfo.com/Season.asp?AssocID=3&Year="
base_tournament_url = "http://www.bvbinfo.com/Tournament.asp?ID="

In [3]:
from_year = '2000'
to_year = str(datetime.date.today().year)

### Scrape Sessions

In [4]:
def bvbinfo_season_scraper():
    url = base_season_url
    
    year_index = 0
    fivb_index = 15
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table
    table = soup.find_all('table')

    if len(table) < 2:
        return []
    rows = table[2].find_all('tr')
    header_element = rows[2]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
    print(header_text[year_index], header_text[fivb_index])
    
    if (header_text[year_index].upper() != 'YEAR') or (header_text[fivb_index].upper() != 'FIVB'):
        print(f"bvbinfo_season_scraper - Error: check website structure")
        return []
        
    season_years = []
    for cols_element in rows[3:]:
        try:
            col_element_data = cols_element.find_all('td')
            len_col_element_data = len(col_element_data)

            if len_col_element_data >= fivb_index:
                col = col_element_data[fivb_index]
                col.find('a', href=True)
                a_tag = col.find('a', href=True)
                if a_tag and 'Season.asp' in a_tag['href']:
                    value = a_tag['href'].split('=')[-1]            
                    season_years.append(value)
        except ValueError as e:
                print(f"bvbinfo_tourni_scraper - Error: {e.args[0]}")

    return season_years

In [5]:
season_years = bvbinfo_season_scraper()
season_years_df = pd.DataFrame(season_years, columns=['YEAR'])
# season_years_df.info()

Year FIVB


### Scrape Season Tournament IDs

In [6]:
def bvbinfo_tourni_scraper(year):
    url = base_season_summ_url + str(year)
    
    location_index = 1
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table
    table = soup.find_all('table')

    if len(table) < 2:
        return []
    rows = table[2].find_all('tr')
    header_element = rows[3]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
    
    if header_text[location_index].upper() != 'LOCATION':
        print(f"bvbinfo_tourni_scraper - Error: check website structure")
        return []
    tournament_ids = []

    for cols_element in rows[4:]:
        try:
            col_element_data = cols_element.find_all('td')
            len_col_element_data = len(col_element_data)
            if len_col_element_data > location_index:
                col = col_element_data[location_index]

                col.find('a', href=True)
                a_tag = col.find('a', href=True)
                if a_tag and 'Tournament.asp' in a_tag['href']:
                    value = a_tag['href'].split('=')[-1]
                    tournament_ids.append(value)                
        except ValueError as e:
                print(f"bvbinfo_tourni_scraper - Error: {e.args[0]}")
                
    return tournament_ids

In [7]:
columns = ['TOURNID']
scraped_tournid_df = pd.DataFrame(columns=columns)

years_list = season_years_df[(season_years_df['YEAR'] >= from_year) & (season_years_df['YEAR'] <= to_year)]['YEAR'].unique()
for year in years_list:
    data = bvbinfo_tourni_scraper(year)
    if data:  # Check if data is not empty
        temp_df = pd.DataFrame(data, columns=['TOURNID'])
        scraped_tournid_df = pd.concat([scraped_tournid_df, temp_df], ignore_index=True)

In [8]:
#print(scraped_tournid_df.isnull().sum())
#scraped_tournid_df.info()

### Scrape Team Tournament Information

In [9]:
columns = ['Finish', 'Player', 'Partner', 'Player_ID', 'Partner_ID', 'Country'
           , 'TourniStartDate', 'TourniEndDate', 'TournLocation', 'Event', 'TOURNID']

tournament_list = ['CHALLENGE','ELITE16']
len_columns = len(columns)

In [10]:
import re
from datetime import datetime
def extract_dates(s):
    # Regular expression pattern to match the date scenarios
    pattern = r"([A-Za-z]+) (\d+)-(?:([A-Za-z]+) )?(\d+), (\d{4})"
    match = re.search(pattern, s)
    
    if not match:
        return None, None

    start_month, start_day, end_month, end_day, year = match.groups()
    # If end_month is not provided, use start_month
    end_month = end_month or start_month
    
    start_date_str = f"{start_month} {start_day}, {year}"
    end_date_str = f"{end_month} {end_day}, {year}"
    
    # Convert to datetime objects for further processing if needed
    start_date = datetime.strptime(start_date_str, '%B %d, %Y').date()
    end_date = datetime.strptime(end_date_str, '%B %d, %Y').date()
    
    return start_date, end_date

In [11]:

def bvbinfo_team_scraper(tourn_id, top_n):
    url = base_tournament_url + str(tourn_id)

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table  
    table = soup.find_all('table')

    if len(table) < 5:
        return []
    rows = table[4].find_all('tr')
    header_element = rows[0]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
   
    # Extract category, tournament, location, and start-end date
    category = header_text[0].split("'")[0]  
    tournament = ''.join(header_text[0].split()[4:])
    location = header_text[0].split()[3]  
    startenddate = header_text[1]  

    #print(startenddate)    
    tourniStartDate, tourniEndDate = extract_dates(startenddate)        
    
    event = str(tournament) + " " + str(category)
    
    misc_info = []
    misc_info =[tourniStartDate, tourniEndDate, location, event, tourn_id]
    data = []
            
    fetch_columns = columns[:3] + [columns[5]]
    
    row_data = rows[1].find_all('td')
    for val in row_data[:4]:
        # print(val.text.strip(), fetch_columns)
        if val.text.strip() not in fetch_columns:
            print(f"bvbinfo_team_scraper - Error: check website structure")
            return []
        
    if len([1 for val in tournament_list if val in tournament.upper()]) == 0:
        return []

    for cols_element in rows[2:]:
        values = []
        players = []
        
        try:
            col_element_data = cols_element.find_all('td')
            finish_rank = col_element_data[0].text.strip()
            if not finish_rank:
                finish_rank = 999
             
            if 'Canada' in cols_element.get_text() or int(finish_rank) <= int(top_n) or int(top_n) == 0:
                for col in col_element_data[:4]: # don't need all columns
                    a_tag = col.find('a', href=True)
                    if a_tag and 'player.asp' in a_tag['href']:
                        value = a_tag['href'].split('=')[-1]
                        players.append(value)
                    #else:
                    value = col.text.strip()
                    values.append(value)
                data.append(values[:3] + players + values[3:] + misc_info)
        except ValueError as e:
                print(f"bvbinfo_team_scraper - Error: {e.args[0]} - {tourn_id}")
                 
    return data

In [12]:
scraped_team_df = pd.DataFrame(columns=columns)

for tournament_id in scraped_tournid_df['TOURNID']:
# for tournament_id in [3982, 3961]:
    data = bvbinfo_team_scraper(tournament_id, 5)
    if data:  # Check if data is not empty
        if len(data[0]) == len_columns:
            temp_df = pd.DataFrame(data, columns=columns)
            scraped_team_df = pd.concat([scraped_team_df, temp_df], ignore_index=True)

In [13]:
print(len(list(set(scraped_team_df['Player_ID'].unique() + scraped_team_df['Partner_ID'].unique()))))

142


In [14]:
scraped_team_df.to_csv("scraped_team_info.csv", index=False)