In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

import os
# Recursively list every file under /kaggle/input, printing one full path per
# line, so the dataset path read in the next cell can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

/kaggle/input/scraping-volleyball-teams/__results__.html
/kaggle/input/scraping-volleyball-teams/__notebook__.ipynb
/kaggle/input/scraping-volleyball-teams/scraped_team_info.csv
/kaggle/input/scraping-volleyball-teams/__output__.json
/kaggle/input/scraping-volleyball-teams/custom.css


In [2]:
# Team roster CSV produced by a separate scraping script/notebook.
team_csv_path = "/kaggle/input/scraping-volleyball-teams/scraped_team_info.csv"
team_df = pd.read_csv(team_csv_path)

# Summarise columns, dtypes and non-null counts of the loaded roster.
team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Finish           574 non-null    float64
 1   Player           611 non-null    object 
 2   Partner          611 non-null    object 
 3   Player_ID        611 non-null    int64  
 4   Partner_ID       611 non-null    int64  
 5   Country          611 non-null    object 
 6   TourniStartDate  611 non-null    object 
 7   TourniEndDate    611 non-null    object 
 8   TournLocation    611 non-null    object 
 9   Event            611 non-null    object 
 10  TOURNID          611 non-null    int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 52.6+ KB


In [3]:
# Empty long-format table: each row will hold one (player, attribute, value)
# triple. It is reassigned later once the scraping loop has collected its rows.
scraped_columns = ["PLAYER_ID", "ATTRIBUTE", "VALUE"]
scraped_df = pd.DataFrame(columns=scraped_columns)
scraped_df

Unnamed: 0,PLAYER_ID,ATTRIBUTE,VALUE


In [4]:
%%time

# Create a list to store the data
data = []
player_id_list = team_df['Player_ID'].unique()
print(len(player_id_list))

for player_id in player_id_list:
#for player_id in range(1, 30000):
    url = f"http://www.bvbinfo.com/player.asp?ID={player_id}"

    # Make a request to the URL
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.Timeout as e:
        # If the request times out, skip to the next player ID
        print(f"Time out for player ID {player_id}: {e}")
        continue

    except requests.exceptions.RequestException as e:
        # If there is another error, skip to the next player ID
        print(f"Error for player ID {player_id}: {e}")
        continue

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Get player name and country
    name_element = soup.find(class_="clsPlayerName")
    if name_element:
        name = name_element.text.strip()
        country_element = soup.find(class_="clsPlayerCountry")
        if country_element:
            country = country_element.text.strip()
            # Append player ID, name, and country to the data list
            data.append([player_id, "Name", name])
            data.append([player_id, "Country", country])
        else:
            country = ' '
            data.append([player_id, "Country", country])
    else:
        name = ' '
        data.append([player_id, "Name", name])

    # Find the element with the "Vital Statistics" string
    vital_statistics = soup.find(string="Vital Statistics")

    if vital_statistics:
        # Find the parent table of the "Vital Statistics"
        table = vital_statistics.find_parent("table")

        if table:
            # Iterate over the rows in the table
            for row in table.find_all("tr"):
                # Get the columns in the row
                columns = row.find_all("td")

                # If the row has at least 2 columns of data and without the 'Photo' in the first column
                if len(columns) >= 2 and "Photo" not in columns[0].text:

                    # Get the text in the first column (when class="clsPlayerDataLabel")
                    # Let the text in the first column become the key
                    key = columns[0].text.strip()

                    # Get the text in the second column (when class="clsPlayerData")
                    # Let the text in the second column become the value
                    value = columns[1].text.strip()

                    if len(columns) >= 3:
                        # Split the text from the second column and the third column
                        value += ';'
                        # Get the text in the third column (when class="clsPlayerData")
                        value += columns[2].text.strip()

                    # Append player ID, key, and value to the data list
                    data.append([player_id, key, value])

# Create the DataFrame from the data list
scraped_df = pd.DataFrame(data, columns=["PLAYER_ID", "ATTRIBUTE", "VALUE"])

# Display the scraped DataFrame
scraped_df.head()

142
CPU times: user 46.9 s, sys: 166 ms, total: 47.1 s
Wall time: 1min 14s


Unnamed: 0,PLAYER_ID,ATTRIBUTE,VALUE
0,14741,Name,Anders Berntsen Mol
1,14741,Country,Norway
2,14741,Birth Date,"July 2, 1997 (26 years old)"
3,14741,Home Town,"Stord, Hordaland"
4,14741,Resides,Strandvik


In [5]:
%time
scraped_df.to_csv("scraped_player_data.csv", index = False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs
