In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import requests
from bs4 import BeautifulSoup

/kaggle/input/unique-bvbinfo-tournament-id/unique_tournid.csv


In [2]:
tourn_id_df = pd.read_csv("/kaggle/input/unique-bvbinfo-tournament-id/unique_tournid.csv")
tourn_id_df.info()                 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   TOURNID  50 non-null     int64
dtypes: int64(1)
memory usage: 528.0 bytes


### Scrape Sessions

In [3]:
base_season_url = "http://www.bvbinfo.com/season.asp"

In [4]:
def bvbinfo_season_scraper():
    url = base_season_url
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table

    table = soup.find_all('table')
    # print("len: ", len(table))
    # if len(table) < 2:
        #return []
    rows = table[2].find_all('tr')
    header_element = rows[2]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
    #print(header_text[0])
    #print(header_text[15])
    season_years = []
    for cols_element in rows[3:]:
        col_element_data = cols_element.find_all('td')
        len_col_element_data = len(col_element_data)
        col_element_data

        if len_col_element_data > 15:
            #print(col_element_data[0].text.strip(),col_element_data[15].text.strip())
            col = col_element_data[15]
            col.find('a', href=True)
            a_tag = col.find('a', href=True)
            if a_tag and 'Season.asp' in a_tag['href']:
                value = a_tag['href'].split('=')[-1]            
                season_years.append(value)

    return season_years

In [5]:
season_years = bvbinfo_season_scraper()
season_years_df = pd.DataFrame(season_years, columns=['YEAR'])
season_years_df.info()

http://www.bvbinfo.com/season.asp
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   YEAR    38 non-null     object
dtypes: object(1)
memory usage: 432.0+ bytes


### Scrape Season Tournament IDs

In [6]:
base_season_summ_url = "http://www.bvbinfo.com/Season.asp?AssocID=3&Year="

In [7]:
def bvbinfo_tourni_scraper(year):
    url = base_season_summ_url + str(year)
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table

    table = soup.find_all('table')
    print("len: ", len(table))
    #if len(table) < 2:
        #return []
    rows = table[2].find_all('tr')
    header_element = rows[3]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
    # print(header_text)

    tournament_ids = []

    for cols_element in rows[4:]:
        col_element_data = cols_element.find_all('td')
        len_col_element_data = len(col_element_data)
        if len_col_element_data > 1:
            col = col_element_data[1]

            col.find('a', href=True)
            a_tag = col.find('a', href=True)
            if a_tag and 'Tournament.asp' in a_tag['href']:
                value = a_tag['href'].split('=')[-1]
                tournament_ids.append(value)
    return tournament_ids

In [8]:
columns = ['TOURNID']
scraped_tournid_df = pd.DataFrame(columns=columns)

years_list = season_years_df[(season_years_df['YEAR'] >= '2010') & (season_years_df['YEAR'] <= '2023')]['YEAR'].unique()
for year in years_list:
# for year in [2023]:
    data = bvbinfo_tourni_scraper(year)
    if data:  # Check if data is not empty
        temp_df = pd.DataFrame(data, columns=['TOURNID'])
        scraped_tournid_df = pd.concat([scraped_tournid_df, temp_df], ignore_index=True)

http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2023
len:  7
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2022
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2021
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2020
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2019
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2018
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2017
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2016
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2015
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2014
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2013
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2012
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2011
len:  6
http://www.bvbinfo.com/Season.asp?AssocID=3&Year=2010
len:  6


In [9]:
scraped_tournid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   TOURNID  688 non-null    object
dtypes: object(1)
memory usage: 5.5+ KB


### Scrape Team Tournament Information

In [10]:
base_tournament_url = "http://www.bvbinfo.com/Tournament.asp?ID="

In [11]:

def bvbinfo_team_scraper(tourn_id, top_n):
    url = base_tournament_url + str(tourn_id)
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting team information from the table
  
    table = soup.find_all('table')
    print("len: ", len(table))
    if len(table) < 5:
        return []
    rows = table[4].find_all('tr')
    header_element = rows[0]
    header_text = header_element.get_text(strip=True, separator="\n").split("\n")
    # Extract category, tournament, location, and start-end date
    category = header_text[0].split("'")[0]  
    tournament = ' '.join(header_text[0].split()[4:])
    location = header_text[0].split()[3]  
    startenddate = header_text[1]  

    event = str(tournament) + " " + str(category)
    # Split by space to get month and day range
    month, day_range, year = startenddate.split()[:3]

    # Split day range by '-'
    start_day, end_day = day_range.split('-')

    # Construct tourniStartDate and tourniEndDate
    tourniStartDate = f"{month} {start_day}, {year}"
    tourniEndDate = f"{month} {end_day}, {year}"
    
    misc_info = []
    misc_info =[tourniStartDate, tourniStartDate, location, event, tourn_id]
    data = []
    for cols_element in rows[2:]:
        values = []
        players = []
        col_element_data = cols_element.find_all('td')
        finish_rank = col_element_data[0].text.strip()
        if 'Canada' in cols_element.get_text() or int(finish_rank) <= int(top_n) or finish_rank == 0:
            for col in col_element_data[:-4]: # don't need all columns
                a_tag = col.find('a', href=True)
                if a_tag and 'player.asp' in a_tag['href']:
                    value = a_tag['href'].split('=')[-1]
                    players.append(value)
                #else:
                value = col.text.strip()
                values.append(value)
            data.append(values[:3] + players + values[3:] + misc_info)
                 
    return data


In [12]:
columns = ['Finish', 'Player', 'Partner', 'Player_ID', 'Partner_ID', 'Country'
           , 'TourniStartDate', 'TourniEndDate', 'TournLocation', 'Event', 'TOURNID']
scraped_team_df = pd.DataFrame(columns=columns)

for tournament_id in tourn_id_df['TOURNID']:
#for tournament_id in [3985]:
    data = bvbinfo_team_scraper(tournament_id, 5)
    if data:  # Check if data is not empty
        temp_df = pd.DataFrame(data, columns=columns)
        scraped_team_df = pd.concat([scraped_team_df, temp_df], ignore_index=True)

http://www.bvbinfo.com/Tournament.asp?ID=3962
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3968
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3970
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3972
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3992
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3994
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3964
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3960
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3982
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=4147
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=4149
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=4151
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3961
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3967
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3969
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3971
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3991
len:  6
http://www.bvbinfo.com/Tournament.asp?ID=3993
len:  6
http://www.bvbinfo.com/Tourn

In [13]:
scraped_team_df

Unnamed: 0,Finish,Player,Partner,Player_ID,Partner_ID,Country,TourniStartDate,TourniEndDate,TournLocation,Event,TOURNID
0,1,Raisa Schoon,Katja Stam,16919,16917,Netherlands,"March 23, 2022","March 23, 2022",Rosarito,Elite 16 Women,3962
1,2,Tina Graudina,Anastasija Samoilova,14462,14774,Latvia,"March 23, 2022","March 23, 2022",Rosarito,Elite 16 Women,3962
2,3,Talita Antunes,Rebecca Cavalcanti,5146,11610,Brazil,"March 23, 2022","March 23, 2022",Rosarito,Elite 16 Women,3962
3,4,Kelly Cheng,Betsi Flint,14478,12974,United States,"March 23, 2022","March 23, 2022",Rosarito,Elite 16 Women,3962
4,5,Duda Lisboa,Ana Patricia Silva,13802,15329,Brazil,"March 23, 2022","March 23, 2022",Rosarito,Elite 16 Women,3962
...,...,...,...,...,...,...,...,...,...,...,...
453,5,Jagoda Gruszczynska,Aleksandra Wachowicz,13339,17223,Poland,"April 13, 2023","April 13, 2023",BPT,Challenge Saquarema Women,4171
454,5,Talita Antunes,Thamela Coradelli,5146,18464,Brazil,"April 13, 2023","April 13, 2023",BPT,Challenge Saquarema Women,4171
455,5,Liliana Fernandez,Paula Soria,8380,11614,Spain,"April 13, 2023","April 13, 2023",BPT,Challenge Saquarema Women,4171
456,25,Marie-Alex Belanger,Molley McBain,21327,18610,Canada,"April 13, 2023","April 13, 2023",BPT,Challenge Saquarema Women,4171


In [14]:
scraped_team_df.to_csv("scraped_tournament_info.csv")