# 1. Set Up

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [2]:
# Season to scrape; kept as a string because it is concatenated into URLs
# and output file names further down.
year = input("Input a year to scrape between 2017 - 2019: ").strip()
# Validate explicitly instead of with `assert`: assertions are removed when
# Python runs with -O, which would let an unsupported year slip through.
if year not in ('2017', '2018', '2019'):
    raise ValueError("Input year not in range")

Input a year to scrape between 2016 - 2019: 2016


# 2. Get list of links to all games data

In [3]:
# Download the season schedule page and parse it into a BeautifulSoup tree.
url = 'https://www.asia-basket.com/Vietnam/games-schedule.asp?League=1&LName=VBA&year='+year
# A browser-like User-Agent header is sent with the request.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Use a context manager so the HTTP response is closed once it has been read.
with urlopen(req) as response:
    web_byte = response.read()
html = web_byte.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

In [4]:
# Collect the box-score link of every VBA game on the schedule page for the
# specified year. An anchor qualifies when it has non-empty text and its href
# contains the eurobasket box-score URL.
link_list = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if anchor.text
    and 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=' in anchor['href']
]
print("List of %d %s VBA games scoreboard links:"%(len(link_list), year))
link_list

List of 0 2016 VBA games scoreboard links:


[]

# 3. Build scrape functions

Two separate functions are needed because the HTML table format of the 2019 box-score pages differs from that of the earlier seasons (2017 & 2018).

In [None]:
# Used for 2019
def game_scrape_2019(soup2):
    """Parse one 2019-format box-score page and append its stats to CSV files.

    For each of the two teams on the page, the team's stats table is read,
    the made-attempted columns are split into separate integer columns, and
    the result is appended to ``vba<year>_player_data.csv`` (all rows but the
    last) and ``vba<year>_team_data.csv`` (the last row, i.e. the totals).
    The header row is written only when the target file is still empty.

    Args:
        soup2: BeautifulSoup tree of a single game's box-score page.

    Returns:
        None. The results are written to the CSV files.

    Raises:
        ValueError: if the round description, two team links, or two stats
            tables cannot be found on the page.

    Note:
        Reads the module-level ``year`` string to build the output file names.
    """
    from io import StringIO  # local import so this cell needs no extra setup

    # get team names: slice the team identifier out of each team link
    # (presumably the trailing 6 characters are an id/suffix - verify against
    # a live page).
    prefix_len = len('https://basketball.eurobasket.com/team/Vietnam/')
    teams = []
    for cell in soup2.find_all('td', {'class': 'my_top_center'}):
        team_link = cell.find_all('a')[0]['href']
        teams.append(team_link[prefix_len:-6])

    # get round detail: take the text matched after the leading "Vietnam"
    # (7 characters) and keep its first and last space-separated tokens.
    match = re.search(r"((Vietnam)(\w*-\w*)*(\w+\s)+\d*)", soup2.text)
    if match is None:
        raise ValueError("Could not find the round description on the page")
    r = match[0][7:].split(' ')
    round_name = r[0]+' '+r[-1]

    table_list = soup2.find_all('table', {'class': 'my_Title'})

    # Fail before anything is written, so a malformed page cannot leave a
    # half-recorded game in the CSV files.
    if len(teams) < 2 or len(table_list) < 2:
        raise ValueError("Expected two teams and two stats tables on the page")

    colnames = ['ROUND','TEAM','OPP','#','NAME','MIN','FGM','FGA','3PM','3PA','FTM','FTA','OREB','DREB','AST','STL','BLK','TO','PF','PTS']
    text_columns = ['ROUND','TEAM','OPP','NAME']
    team_filename = 'vba'+year+'_team_data.csv'
    player_filename = 'vba'+year+'_player_data.csv'

    # Iterate through 2 teams
    for i in [0,1]:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas versions.
        df = pd.read_html(StringIO(str(table_list[i])))[0]
        df.columns = df.keys().droplevel() # Drop multi-level index

        # Replace zero playing time
        df = df.fillna(0)
        df = df.replace("DNP - Coach's Decision",0)
        df = df.rename(columns={'AS':'AST','ST':'STL','OFF':'OREB','DEF':'DREB','F':'PF','FV':'BLK'})
        df['#'] = df['#'].astype(int)

        # Split each "made-attempted" column into two integer columns.
        three_point = df['3PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['3PM'] = three_point[0].astype(int)
        df['3PA'] = three_point[1].astype(int)

        free_throw = df['FTM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FTM'] = free_throw[0].astype(int)
        df['FTA'] = free_throw[1].astype(int)

        # Field goals = two-pointers + three-pointers.
        two_point = df['2PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FGM'] = two_point[0].astype(int)+df['3PM']
        df['FGA'] = two_point[1].astype(int)+df['3PA']

        df['ROUND'] = round_name
        df['TEAM'] = teams[i]
        # The opponent is the other of the two teams (index 1 for 0, 0 for 1).
        df['OPP'] = teams[1 - i]

        # Cast every non-text column to int (distinct loop name so the team
        # index `i` is not shadowed).
        for col in colnames:
            if col not in text_columns:
                df[col] = df[col].astype(int)
        df = df[colnames]

        # Separate total row
        players = df[:-1]
        team = df[-1:]
        team = team.reset_index(drop=True)
        # Team totals keep ROUND/TEAM/OPP plus the stats from FGM onwards.
        team = team[colnames[0:3]+colnames[6:]]

        # newline='' lets pandas control line endings (avoids blank rows on
        # Windows); the header is written only when the file is still empty.
        with open(team_filename, 'a', newline='') as f:
            team.to_csv(f, header=f.tell()==0)
        with open(player_filename, 'a', newline='') as f:
            players.to_csv(f, header=f.tell()==0)
    return

# Used for 2017 & 2018
def game_scrape_2018(soup2):
    """Parse one 2017/2018-format box-score page and append its stats to CSV files.

    For each of the two teams on the page, the team's stats table is read,
    its totals row is realigned with the player columns, the made-attempted
    columns are split into separate integer columns, and the result is
    appended to ``vba<year>_player_data.csv`` (all rows but the last) and
    ``vba<year>_team_data.csv`` (the last row, i.e. the totals). The header
    row is written only when the target file is still empty.

    Args:
        soup2: BeautifulSoup tree of a single game's box-score page.

    Returns:
        None. The results are written to the CSV files.

    Raises:
        ValueError: if the round description, two team links, or two stats
            tables cannot be found on the page.

    Note:
        Reads the module-level ``year`` string to build the output file names.
    """
    from io import StringIO  # local import so this cell needs no extra setup

    # Slice the team identifier out of each team link (presumably the
    # trailing 6 characters are an id/suffix - verify against a live page).
    prefix_len = len('http://basketball.eurobasket.com/team/Vietnam/')
    teams = []
    for cell in soup2.find_all('td', {'class': 'my_top_center'}):
        team_link = cell.find_all('a')[0]['href']
        teams.append(team_link[prefix_len:-6])

    # Round detail: take the text matched after the leading "Vietnam"
    # (7 characters) and keep its first and last space-separated tokens.
    match = re.search(r"((Vietnam)(\w*-\w*)*(\w+\s)+\d*)", soup2.text)
    if match is None:
        raise ValueError("Could not find the round description on the page")
    r = match[0][7:].split(' ')
    round_name = r[0]+' '+r[-1]

    table_list = soup2.find_all('table', {'class': 'my_Title'})

    # Fail before anything is written, so a malformed page cannot leave a
    # half-recorded game in the CSV files.
    if len(teams) < 2 or len(table_list) < 2:
        raise ValueError("Expected two teams and two stats tables on the page")

    colnames = ['ROUND','TEAM','OPP','#','NAME','MIN','FGM','FGA','3PM','3PA','FTM','FTA','OREB','DREB','AST','STL','BLK','TO','PF','PTS']
    text_columns = ['ROUND','TEAM','OPP','NAME']
    team_filename = 'vba'+year+'_team_data.csv'
    player_filename = 'vba'+year+'_player_data.csv'

    # Iterate through 2 teams
    for i in [0,1]:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas versions.
        df = pd.read_html(StringIO(str(table_list[i])), header=1)[0]

        # Replace zero playing time
        df = df.fillna(0)
        df = df.replace("DNP - Coach's Decision",0)
        df = df.rename(columns={'AS':'AST','ST':'STL','OFF':'OREB','DEF':'DREB','CM':'PF','FV':'BLK'})

        # Process total row: take the last row, drop its '+/-' column and
        # relabel every column with the name of the column to its right
        # (presumably the totals row is offset by one column relative to the
        # player rows - verify against a live page).
        df2 = df[-1:].drop('+/-',axis=1)
        cols = df.keys()
        rename = dict(zip(cols[:-1], cols[1:]))
        df2 = df2.rename(columns=rename)
        df2['NAME'] = 'Total'
        df2['#'] = 0
        # Replace the last two raw rows with the realigned totals row.
        df = pd.concat([df[:-2],df2],sort=True)

        df['#'] = df['#'].astype(int)

        # Split each "made-attempted" column into two integer columns.
        three_point = df['3PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['3PM'] = three_point[0].astype(int)
        df['3PA'] = three_point[1].astype(int)

        free_throw = df['FTM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FTM'] = free_throw[0].astype(int)
        df['FTA'] = free_throw[1].astype(int)

        # Field goals = two-pointers + three-pointers.
        two_point = df['2PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FGM'] = two_point[0].astype(int)+df['3PM']
        df['FGA'] = two_point[1].astype(int)+df['3PA']

        df['ROUND'] = round_name
        df['TEAM'] = teams[i]
        # The opponent is the other of the two teams (index 1 for 0, 0 for 1).
        df['OPP'] = teams[1 - i]

        # Cast every non-text column to int (distinct loop name so the team
        # index `i` is not shadowed).
        for col in colnames:
            if col not in text_columns:
                df[col] = df[col].astype(int)
        df = df[colnames]

        # Separate total row
        players = df[:-1]
        team = df[-1:]
        team = team.reset_index(drop=True)
        # Team totals keep ROUND/TEAM/OPP plus the stats from FGM onwards.
        team = team[colnames[0:3]+colnames[6:]]

        # newline='' lets pandas control line endings (avoids blank rows on
        # Windows); the header is written only when the file is still empty.
        with open(team_filename, 'a', newline='') as f:
            team.to_csv(f, header=f.tell()==0)
        with open(player_filename, 'a', newline='') as f:
            players.to_csv(f, header=f.tell()==0)
    return

# 4. Scrape through link list

In [None]:
# Download every game's box-score page and hand it to the parser that
# matches the season's page format.
for link in link_list:
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    # Use a context manager so each HTTP response is closed after reading.
    with urlopen(req) as response:
        web_byte = response.read()
    html = web_byte.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    # `year` comes from input() and is a string, so it must be compared with
    # the string '2019'; comparing with the int 2019 is always False and
    # would send 2019 pages to the 2017/2018 parser.
    if str(year) == '2019':
        game_scrape_2019(soup)
    else:
        game_scrape_2018(soup)
    print("Completed: "+link)