# Automatic data scraping of basketball statistics on RealGM

## Define get_data function

In [2]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import csv
import pandas as pd
import os

In [3]:
def get_data(
        league_id,
        league_name,
        league_year,
        stat_type='Averages'
):
    """Scrape RealGM international league player stats into a CSV file.

    For each position in C, PF, SF, SG, PG the stats pages are requested
    one after another (page 1, 2, ...) until a page contains no ``<table>``
    element.  The header cells of the first table found become the CSV
    header, with an extra ``Position`` column inserted at index 4; every
    data row gets the position it was scraped under at that same index.

    The result is written to
    ``Data/{league_name}_{league_year}_{stat_type}.csv`` (the ``Data``
    directory is created if it does not exist).

    Args:
        league_id: League identifier placed in the URL path.
        league_name: League slug placed in the URL path and the file name.
        league_year: Season year placed in the URL path and the file name.
        stat_type: Stat table variant placed in the URL path and the file
            name (defaults to ``'Averages'``).

    Returns:
        None.  If no page yields a table, a message is printed and no file
        is written.
    """
    positions = ['C', 'PF', 'SF', 'SG', 'PG']

    # Collect (position, table) pairs for every page that has a stat table.
    table_list = []
    for position in positions:
        page_count = 0
        while True:
            page_count += 1
            url = f"http://basketball.realgm.com/international/league/{league_id}/{league_name}/stats/{league_year}/{stat_type}/Qualified/All/points/{position}/desc/{page_count}/Regular_Season"
            print("Parsing " + url)
            # Close the HTTP response as soon as the page has been parsed.
            with urllib.urlopen(url) as page:
                soup = BeautifulSoup(page, 'html.parser')
            table = soup.find('table')
            if table is None:
                print('No table data found. Exiting...')
                break
            table_list.append((position, table))

    # Without any table there is no header to derive; bail out instead of
    # failing with an IndexError on table_list[0].
    if not table_list:
        print("No stat tables found. No file written.")
        return

    # Column names come from the header cells of the first table.
    columns = [column.text for column in table_list[0][1].find_all('th')]
    columns.insert(4, 'Position')

    output_dir = "Data"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{league_name}_{league_year}_{stat_type}.csv"

    # UTF-8 keeps non-ASCII player names intact regardless of platform and
    # matches the default encoding pandas.read_csv uses to read the file.
    with open(output_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        for position, table_data in table_list:
            for row in table_data.find_all('tr'):
                cells = row.find_all('td')
                # Header rows contain only <th> cells; skip them.
                if not cells:
                    continue
                row_data = [cell.text for cell in cells]
                row_data.insert(4, position)
                writer.writerow(row_data)

    # Report the path that was actually written.
    print(f"{len(table_list)} pages parsed. Data saved to {output_path}")

## Running the function

In [7]:

# Scrape the 2024 'Totals' tables for each league, in the listed order.
for league_id, league_name in (
    ('131', 'PBA--Commissioners-Cup'),
    ('130', 'PBA--Philippine-Cup'),
    ('105', 'Japanese-BLeague'),
    ('63', 'South-Korean-KBL'),
):
    get_data(league_id, league_name, '2024', 'Totals')

Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/C/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/C/desc/2/Regular_Season
No table data found. Exiting...
Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/PF/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/PF/desc/2/Regular_Season
No table data found. Exiting...
Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/SF/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/131/PBA--Commissioners-Cup/stats/2024/Totals/Qualified/All/points/SF/desc/2/Regular_Season
No table data found. Exiting...
Parsing http

## CSV file checking

In [8]:
# Load the scraped 'Totals' CSV of each league into its own DataFrame.

pba_df, pba_df2, japan_df, korean_df = map(
    pd.read_csv,
    (
        'Data/PBA--Commissioners-Cup_2024_Totals.csv',
        'Data/PBA--Philippine-Cup_2024_Totals.csv',
        'Data/Japanese-BLeague_2024_Totals.csv',
        'Data/South-Korean-KBL_2024_Totals.csv',
    ),
)

In [64]:
# Show the first rows of each league's DataFrame.
# The Japanese and Korean frames are loaded as japan_df / korean_df, so
# those names are used here (bleague_df / kbl_df are never defined).

print(pba_df.head())
print(japan_df.head())
print(korean_df.head())

   #                   Player  Team  GP Position   MPG   PPG   FGM   FGA  \
0  1  Rondae Hollis-Jefferson   TNT  10        C  41.2  34.6  12.9  26.2   
1  2          Thomas Robinson  NLEX   4        C  37.3  34.5  12.8  22.3   
2  3         Suleiman Braimoh   MER   5        C  42.7  32.8  11.6  25.4   
3  4        Christopher Ortiz  BLAC  11        C  41.2  25.7   9.5  21.8   
4  5           DaJuan Summers   ROS   4        C  39.6  24.8   7.5  19.5   

     FG%  ...   FTA    FT%  ORB   DRB   RPG  APG  SPG  BPG  TOV   PF  
0  0.492  ...  11.0  0.691  2.5   9.3  11.8  6.6  2.2  1.4  3.2  2.6  
1  0.573  ...  14.0  0.625  3.0  10.3  13.3  3.5  1.8  1.8  5.8  4.0  
2  0.457  ...  10.4  0.692  3.4   8.6  12.0  2.8  2.2  0.8  3.6  2.2  
3  0.438  ...   6.5  0.746  1.5   8.9  10.4  4.9  1.4  0.7  3.2  1.8  
4  0.385  ...   7.3  0.862  2.3   7.0   9.3  2.8  1.0  1.5  3.5  3.3  

[5 rows x 24 columns]
   #          Player  Team  GP Position   MPG   PPG  FGM   FGA    FG%  ...  \
0  1     John Mo

In [4]:
# Scrape the 2024 per-game 'Averages' tables for the Japanese B.League.
get_data(
    league_id='105',
    league_name='Japanese-BLeague',
    league_year='2024',
    stat_type='Averages',
)

Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/C/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/C/desc/2/Regular_Season
No table data found. Exiting...
Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/PF/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/PF/desc/2/Regular_Season
No table data found. Exiting...
Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/SF/desc/1/Regular_Season
Parsing http://basketball.realgm.com/international/league/105/Japanese-BLeague/stats/2024/Averages/Qualified/All/points/SF/desc/2/Regular_Season
No table data found. Exiting...
Parsing http://basketball.realgm.com

## Test cells

In [70]:
# Print rows of pba_df that are duplicates once the Position column is
# ignored. A bare expression in the middle of a cell is not displayed, so
# the matching rows are printed explicitly.

pba_df_drop = pba_df.drop(columns=['Position'])

print(pba_df_drop[pba_df_drop.duplicated()])

# Show rows of pba_df whose player name already appeared in an earlier row.

pba_df_names = pba_df['Player']
pba_df[pba_df_names.duplicated()]

Unnamed: 0,#,Player,Team,GP,Position,MPG,PPG,FGM,FGA,FG%,...,FTA,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF
76,10,Celedonio Trollano,SMB,15,SG,23.8,9.5,3.5,6.7,0.515,...,1.9,0.759,1.5,2.7,4.2,0.9,0.5,0.2,1.1,1.5


In [71]:
# Show every row of pba_df whose Player value is exactly 'Celedonio Trollano'.

pba_df[pba_df['Player'].eq('Celedonio Trollano')]

Unnamed: 0,#,Player,Team,GP,Position,MPG,PPG,FGM,FGA,FG%,...,FTA,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF
67,1,Celedonio Trollano,NLEX,7,SG,31.4,17.4,6.0,12.4,0.483,...,4.1,0.828,0.3,2.9,3.1,2.7,0.9,0.1,1.9,1.6
76,10,Celedonio Trollano,SMB,15,SG,23.8,9.5,3.5,6.7,0.515,...,1.9,0.759,1.5,2.7,4.2,0.9,0.5,0.2,1.1,1.5
