In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for file_name in names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

/kaggle/input/scraping-volleyball-teams/__results__.html
/kaggle/input/scraping-volleyball-teams/__notebook__.ipynb
/kaggle/input/scraping-volleyball-teams/scraped_team_info.csv
/kaggle/input/scraping-volleyball-teams/__output__.json
/kaggle/input/scraping-volleyball-teams/custom.css


In [2]:
# Prefix of a player's page URL on bvbinfo.com; the numeric player ID is
# appended to this string to form the address that gets requested.
base_url = "http://www.bvbinfo.com/player.asp?ID="

In [3]:
# Load the team-level CSV produced by a separate scraping notebook, then print
# a summary of its columns, non-null counts and dtypes.
team_df = pd.read_csv(
    "/kaggle/input/scraping-volleyball-teams/scraped_team_info.csv"
)
team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Finish           574 non-null    float64
 1   Player           611 non-null    object 
 2   Partner          611 non-null    object 
 3   Player_ID        611 non-null    int64  
 4   Partner_ID       611 non-null    int64  
 5   Country          611 non-null    object 
 6   TourniStartDate  611 non-null    object 
 7   TourniEndDate    611 non-null    object 
 8   TournLocation    611 non-null    object 
 9   Event            611 non-null    object 
 10  TOURNID          611 non-null    int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 52.6+ KB


In [4]:
# Show the rows where ID 39937 appears as the player and, separately, as the partner.
(
    team_df.loc[team_df['Player_ID'] == 39937],
    team_df.loc[team_df['Partner_ID'] == 39937],
)


(Empty DataFrame
 Columns: [Finish, Player, Partner, Player_ID, Partner_ID, Country, TourniStartDate, TourniEndDate, TournLocation, Event, TOURNID]
 Index: [],
 Empty DataFrame
 Columns: [Finish, Player, Partner, Player_ID, Partner_ID, Country, TourniStartDate, TourniEndDate, TournLocation, Event, TOURNID]
 Index: [])

In [5]:
# Empty long-format frame with one column each for the player ID, the
# attribute name and the attribute value.
scraped_df = pd.DataFrame(
    data=None,
    columns=["PLAYER_ID", "ATTRIBUTE", "VALUE"],
)
scraped_df

Unnamed: 0,PLAYER_ID,ATTRIBUTE,VALUE


In [6]:
# Every distinct ID that appears as either a player or a partner.
# Sorted so the scrape visits players in a predictable, repeatable order
# (a plain set gives an arbitrary order).
player_id_list = sorted(
    set(team_df['Player_ID'].tolist()) | set(team_df['Partner_ID'].tolist())
)

In [7]:
%%time

columns = ["PLAYER_ID", "ATTRIBUTE", "VALUE"]


def _career_total_text(total_row):
    """Return the text of the third cell of a totals row, cut at the first '('.

    Surrounding whitespace is stripped so the value is stored the same way as
    the other scraped values (name, country, labels), which are all stripped.
    Raises IndexError if the row has fewer than three cells.
    """
    return total_row.find_all('td')[2].text.split('(')[0].strip()


def _iter_vital_statistics(player_id, table_element):
    """Yield one [player_id, label, value] record per row of the nested
    'Vital Statistics' table; yield nothing if that nested table is missing.
    """
    # Guard each lookup: a missing nested table should only skip this section,
    # not abort parsing of the remaining sections of the page.
    outer_table = table_element.find('table')
    info_table = outer_table.find('table') if outer_table is not None else None
    if info_table is None:
        return
    for row in info_table.find_all('tr'):
        label = row.find(class_="clsPlayerDataLabel")
        value = row.find(class_="clsPlayerData")
        # A row is recorded even when the label and/or value cell is absent;
        # the missing part is stored as None.
        yield [
            player_id,
            label.text.strip() if label is not None else None,
            value.text.strip() if value is not None else None,
        ]


def _iter_career_summary(player_id, table_element):
    """Yield a 'Career-<category>' record per category header, then a final
    'Career-total' record taken from the last totals row.

    Raises IndexError if there are fewer totals rows than category headers,
    or no totals rows at all.
    """
    categories = table_element.find_all('td', {'class': 'clsPlayerCategoryHeader'})
    totals = table_element.find_all('tr', {'class': 'clsPlayerDataTotal'})
    for i, category in enumerate(categories):
        yield [player_id, "Career-" + category.text.strip(), _career_total_text(totals[i])]
    yield [player_id, "Career-total", _career_total_text(totals[-1])]


def _iter_player_records(player_id, html):
    """Parse one player page and yield [player_id, attribute, value] records.

    Records are yielded one at a time, so a caller that appends them as they
    arrive keeps everything parsed before a failure.

    Raises ValueError if the page has no 'clsPlayerTable' table, and
    AttributeError/IndexError if the markup inside it is not as expected.
    """
    soup = BeautifulSoup(html, "html.parser")
    player_table_element = soup.find('table', {'class': 'clsPlayerTable'})
    if player_table_element is None:
        raise ValueError("no 'clsPlayerTable' table found on the page")

    # Resolve both name and country before yielding either, so a page missing
    # one of them contributes neither record.
    name_element = player_table_element.find(class_="clsPlayerName")
    country_element = player_table_element.find(class_="clsPlayerCountry")
    name = name_element.text.strip()
    country = country_element.text.strip()
    yield [player_id, "Name", name]
    yield [player_id, "Country", country]

    for table_element in player_table_element.find_all('table'):
        header_element = table_element.find('td', {'class': "clsPlayerHeader"})
        if header_element is None:
            continue
        header_text = header_element.text.strip()
        if header_text == 'Vital Statistics':
            yield from _iter_vital_statistics(player_id, table_element)
        elif header_text == 'Career Summary':
            yield from _iter_career_summary(player_id, table_element)


# Accumulates one [player_id, attribute, value] record per scraped fact.
data = []
data_values = []  # not used in this cell; kept so the name stays defined

print(len(player_id_list))

# One session for the whole run so connections to the host can be reused.
with requests.Session() as session:
    for player_id in player_id_list:
        url = base_url + str(player_id)

        # Fetch the page; on any request failure, report it and move on.
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.Timeout as e:
            # If the request times out, skip to the next player ID
            print(f"Time out for player ID {player_id}: {e}")
            continue
        except requests.exceptions.RequestException as e1:
            # If there is another error, skip to the next player ID
            print(f"Error for player ID {player_id}: {e1}")
            continue

        # Parse the page. `except Exception` (not a bare except) lets a kernel
        # interrupt stop the loop, and the message includes the actual error.
        try:
            for record in _iter_player_records(player_id, response.content):
                data.append(record)
        except Exception as e:
            print(f"There was an Error: {player_id}: {e!r}")
            continue

# Build the scraped DataFrame from the collected records
scraped_df = pd.DataFrame(data, columns=columns)

267
CPU times: user 46 s, sys: 185 ms, total: 46.2 s
Wall time: 1min 27s


In [8]:
%time
scraped_df.to_csv("scraped_player_data.csv", index = False)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
