# 1. Set Up

In [1]:
import re
from io import StringIO
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Ask which VBA season to scrape. The value stays a string because it is later
# concatenated into the schedule URL and the CSV file names.
VALID_YEARS = ['2016','2017','2018','2019']
year = input("Input a year to scrape between 2016 - 2019: ").strip()
# Validate explicitly instead of with `assert`: assertions are skipped when
# Python runs with -O, which would let an invalid year through.
if year not in VALID_YEARS:
    raise ValueError("Input year not in range")

Input a year to scrape between 2016 - 2019: 2017


In [3]:
# Download and parse the schedule page of the selected season.
url = 'https://www.asia-basket.com/Vietnam/games-schedule.asp?League=1&LName=VBA&year='+year
# A browser-like User-Agent is sent with the request
# (presumably the site rejects urllib's default one - confirm).
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Read inside a `with` block so the HTTP response is closed deterministically.
with urlopen(req) as response:
    web_byte = response.read()
html = web_byte.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

# 2. Get list of links to all games data

In [4]:
# Collect the box-score link of every VBA game listed on the schedule page
# of the selected year. An anchor is kept only when it has visible text and
# its href contains the box-score URL marker below.
box_score_marker = 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game='
link_list = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if anchor.text and box_score_marker in anchor['href']
]
print("List of %d %s VBA games scoreboard links:"%(len(link_list), year))
link_list

List of 51 2017 VBA games scoreboard links:


['http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0905_30968_21682-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0907_21746_21745-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0909_16639_30968-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0909_21682_21747-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0910_21745_21746-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0912_21747_30968-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0916_21745_21747-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0916_16639_21746-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0917_21682_30968-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0919_21747_16639-Vietnam',
 'http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0921_21745_30968-Vietnam',
 'http://w

# 3. Get test game data

In [5]:
# Pick a single game from the link list to prototype the parsing on.
# Index -8 is the 8th link counted from the end of the list, so which game it
# is depends on the season that was scraped above.
url2 = link_list[-8]
req2 = Request(url2, headers={'User-Agent': 'Mozilla/5.0'})
# Read inside a `with` block so the HTTP response is closed deterministically.
with urlopen(req2) as response2:
    web_byte2 = response2.read()
html2 = web_byte2.decode('utf-8')
soup2 = BeautifulSoup(html2, 'lxml')

In [6]:
# Extract both team names from the first link inside each 'my_top_center' cell.
# The name is what remains of the href after cutting the fixed URL prefix from
# the front and the last 6 characters from the back
# (presumably a '/' plus a 5-digit team id - confirm against the page).
team_url_prefix = 'http://basketball.eurobasket.com/team/Vietnam/'
teams = [
    cell.find_all('a')[0]['href'][len(team_url_prefix):-6]
    for cell in soup2.find_all('td', {'class': 'my_top_center'})
]

teams

['Can-Tho-Catfish', 'Saigon-Heat']

In [7]:
# get round detail
# The pattern finds the page text that starts with the literal "Vietnam" and
# continues with the round label; `re` is already imported in the first cell.
round_match = re.search(r"((Vietnam)(\w*-\w*)*(\w+\s)+\d*)",soup2.text)
# Fail with a clear message instead of a TypeError on None when nothing matches.
if round_match is None:
    raise ValueError("Could not find round information in the game page")
# [7:] drops the leading "Vietnam" (7 characters); the rest is split into words.
r = round_match[0][7:].split(' ')
# Keep only the first and the last word of the label.
round_name = r[0]+' '+r[-1]
round_name

'Semi-Finals 1'

In [8]:
table_list = soup2.find_all('table',{'class':'my_Title'})

# Team 1
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases. header=1 uses the table
# row at index 1 as the column names.
df = pd.read_html(StringIO(str(table_list[0])),header=1)[0]
# Replace zero playing time
df = df.fillna(0)
df = df.replace("DNP - Coach's Decision",0)
df = df.rename(columns={'AS':'AST','ST':'STL','OFF':'OREB','DEF':'DREB','CM':'PF','FV':'BLK'})

# Build the total row from the last table row: drop '+/-' and relabel every
# remaining value with the name of the column to its right
# (presumably the totals row is shifted one cell to the left in the source table - confirm).
df2 = df[-1:].drop('+/-',axis=1)
rename = {}
for pos in range(len(df.keys())-1):
    rename[df.keys()[pos]] = df.keys()[pos+1]
df2 = df2.rename(columns=rename)
df2['NAME'] = 'Total'
df2['#'] = 0
# Drop the last two original rows and append the relabelled total row;
# sort=True sorts the column labels of the combined frame.
df = pd.concat([df[:-2],df2],sort=True)

df['#'] = df['#'].astype(int)

# Split each "made-attempted" column into two integer columns. Cells that are
# not strings (e.g. the 0 placeholders set above) become NaN after .str.split
# and are turned back into 0 by fillna.
three_point = df['3PM-A'].str.split('-',n=1,expand=True).fillna(0)
df['3PM']=three_point[0].astype(int)
df['3PA']=three_point[1].astype(int)

free_throw = df['FTM-A'].str.split('-',n=1,expand=True).fillna(0)
df['FTM']=free_throw[0].astype(int)
df['FTA']=free_throw[1].astype(int)

# Field goals = two-pointers + three-pointers.
two_point = df['2PM-A'].str.split('-',n=1,expand=True).fillna(0)
df['FGM']=two_point[0].astype(int)+df['3PM']
df['FGA']=two_point[1].astype(int)+df['3PA']

df['ROUND'] = round_name
df['TEAM'] = teams[0]
# Index -1 is the last entry of `teams`, i.e. the other team of this game.
df['OPP'] = teams[0-1]

colnames = ['ROUND','TEAM','OPP','#','NAME','MIN','FGM','FGA','3PM','3PA','FTM','FTA','OREB','DREB','AST','STL','BLK','TO','PF','PTS']
# Every column except the text ones is cast to int.
for col in colnames:
    if (col not in ['ROUND','TEAM','OPP','NAME']):
        df[col] = df[col].astype(int)
df = df[colnames]

df

Unnamed: 0,ROUND,TEAM,OPP,#,NAME,MIN,FGM,FGA,3PM,3PA,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS
0,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,21,"Hamilton, DeAngelo",45,12,29,2,9,5,5,6,16,2,3,6,5,3,31
1,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,23,"Dinh, Tan",45,8,25,2,11,3,4,4,8,0,1,2,2,4,21
2,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,65,"Le-Hieu, Thanh",45,6,11,3,7,0,0,0,6,1,3,1,5,2,15
3,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,11,"Thanh, Dinh",42,4,14,1,8,0,2,0,2,4,0,0,3,4,9
4,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,52,"Nguyen-Hoang, Tu",23,4,11,0,3,1,1,3,2,0,1,0,0,3,9
5,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,7,"Du-Minh, An",11,0,3,0,2,0,0,1,4,1,1,0,1,1,0
6,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,3,"Le, Nguycn",6,0,1,0,0,0,0,0,0,0,0,1,0,0,0
7,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,13,"Le-Van, Day",5,0,1,0,0,0,0,0,0,0,1,0,1,0,0
8,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,19,"Ngcc, Nguycn",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Semi-Finals 1,Can-Tho-Catfish,Saigon-Heat,77,"Huynh-Huu, Thang",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Player rows: everything except the last row, which is the 'Total' row
players = df.iloc[:-1]
players

In [None]:
# Team totals: only the last row, re-indexed from 0, without the per-player
# columns '#', 'NAME' and 'MIN' (positions 3-5 of colnames)
team = df.iloc[-1:].reset_index(drop=True)[colnames[:3] + colnames[6:]]
team

# 4. Build scrape function and Scrape through all links!

In [10]:
def game_scrape_2019(soup2):
    """Parse one box-score page and append its statistics to the season CSV files.

    This variant reads the stat tables with a two-level header (the top level
    is dropped) and expects the fouls column to be named 'F'.

    Parameters
    ----------
    soup2 : BeautifulSoup
        Parsed box-score page of a single game.

    Returns
    -------
    None
        For each of the two teams the last table row is appended to
        'vba<year>_team_data.csv' and the remaining rows to
        'vba<year>_player_data.csv', where <year> is the notebook-level
        `year` variable. A header line is written only when the file is empty.

    Raises
    ------
    ValueError
        If the round label cannot be found in the page text.
    """
    # Team name = href without the fixed URL prefix and without its last
    # 6 characters (presumably '/' plus a 5-digit team id - confirm).
    team_url_prefix = 'https://basketball.eurobasket.com/team/Vietnam/'
    teams = []
    for cell in soup2.find_all('td',{'class':'my_top_center'}):
        team_link = cell.find_all('a')[0]['href']
        teams.append(team_link[len(team_url_prefix):-6])

    round_match = re.search(r"((Vietnam)(\w*-\w*)*(\w+\s)+\d*)",soup2.text)
    if round_match is None:
        raise ValueError("Could not find round information in the game page")
    # [7:] drops the leading "Vietnam"; keep the first and last word of the rest.
    r = round_match[0][7:].split(' ')
    round_name = r[0]+' '+r[-1]

    table_list = soup2.find_all('table',{'class':'my_Title'})

    colnames = ['ROUND','TEAM','OPP','#','NAME','MIN','FGM','FGA','3PM','3PA','FTM','FTA','OREB','DREB','AST','STL','BLK','TO','PF','PTS']
    text_columns = ['ROUND','TEAM','OPP','NAME']
    team_filename = 'vba'+year+'_team_data.csv'
    player_filename = 'vba'+year+'_player_data.csv'

    for i in [0,1]:
        # StringIO: passing a literal HTML string to read_html is deprecated
        # in recent pandas releases.
        df = pd.read_html(StringIO(str(table_list[i])))[0]
        df.columns = df.keys().droplevel() # Drop multi-level index

        # Replace zero playing time
        df = df.fillna(0)
        df = df.replace("DNP - Coach's Decision",0)
        df = df.rename(columns={'AS':'AST','ST':'STL','OFF':'OREB','DEF':'DREB','F':'PF','FV':'BLK'})
        df['#'] = df['#'].astype(int)

        # Split each "made-attempted" column into two integer columns; cells
        # that are not strings become NaN after .str.split and are reset to 0.
        three_point = df['3PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['3PM']=three_point[0].astype(int)
        df['3PA']=three_point[1].astype(int)

        free_throw = df['FTM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FTM']=free_throw[0].astype(int)
        df['FTA']=free_throw[1].astype(int)

        # Field goals = two-pointers + three-pointers.
        two_point = df['2PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FGM']=two_point[0].astype(int)+df['3PM']
        df['FGA']=two_point[1].astype(int)+df['3PA']

        df['ROUND'] = round_name
        df['TEAM'] = teams[i]
        # i-1 selects the other team: -1 (the last entry) for i=0, and 0 for i=1.
        df['OPP'] = teams[i-1]

        # A separate loop name keeps the team index `i` from being overwritten.
        for col in colnames:
            if col not in text_columns:
                df[col] = df[col].astype(int)
        df = df[colnames]

        # The last row goes to the team file, all rows before it to the player file.
        players = df[:-1]
        team = df[-1:].reset_index(drop=True)
        team = team[colnames[0:3]+colnames[6:]]

        # newline='' lets the csv writer control line endings, as pandas
        # recommends for file objects passed to to_csv. In append mode tell()
        # is 0 only for an empty file, so the header is written once.
        with open(team_filename, 'a', newline='') as f:
            team.to_csv(f, header=f.tell()==0)
        with open(player_filename, 'a', newline='') as f:
            players.to_csv(f, header=f.tell()==0)


def game_scrape_2018(soup2):
    """Parse one box-score page and append its statistics to the season CSV files.

    This variant reads the stat tables with the row at index 1 as header,
    expects the fouls column to be named 'CM', and rebuilds the total row
    from the last table row.

    Parameters
    ----------
    soup2 : BeautifulSoup
        Parsed box-score page of a single game.

    Returns
    -------
    None
        For each of the two teams the total row is appended to
        'vba<year>_team_data.csv' and the remaining rows to
        'vba<year>_player_data.csv', where <year> is the notebook-level
        `year` variable. A header line is written only when the file is empty.

    Raises
    ------
    ValueError
        If the round label cannot be found in the page text.
    """
    # Team name = href without the fixed URL prefix and without its last
    # 6 characters (presumably '/' plus a 5-digit team id - confirm).
    team_url_prefix = 'http://basketball.eurobasket.com/team/Vietnam/'
    teams = []
    for cell in soup2.find_all('td',{'class':'my_top_center'}):
        team_link = cell.find_all('a')[0]['href']
        teams.append(team_link[len(team_url_prefix):-6])

    round_match = re.search(r"((Vietnam)(\w*-\w*)*(\w+\s)+\d*)",soup2.text)
    if round_match is None:
        raise ValueError("Could not find round information in the game page")
    # [7:] drops the leading "Vietnam"; keep the first and last word of the rest.
    r = round_match[0][7:].split(' ')
    round_name = r[0]+' '+r[-1]

    table_list = soup2.find_all('table',{'class':'my_Title'})

    colnames = ['ROUND','TEAM','OPP','#','NAME','MIN','FGM','FGA','3PM','3PA','FTM','FTA','OREB','DREB','AST','STL','BLK','TO','PF','PTS']
    text_columns = ['ROUND','TEAM','OPP','NAME']
    team_filename = 'vba'+year+'_team_data.csv'
    player_filename = 'vba'+year+'_player_data.csv'

    for i in [0,1]:
        # StringIO: passing a literal HTML string to read_html is deprecated
        # in recent pandas releases.
        df = pd.read_html(StringIO(str(table_list[i])),header=1)[0]

        # Replace zero playing time
        df = df.fillna(0)
        df = df.replace("DNP - Coach's Decision",0)
        df = df.rename(columns={'AS':'AST','ST':'STL','OFF':'OREB','DEF':'DREB','CM':'PF','FV':'BLK'})

        # Process total row: take the last table row, drop '+/-' and relabel
        # every remaining value with the name of the column to its right
        # (presumably the totals row is shifted one cell left in the source - confirm).
        columns = list(df.keys())
        shift_right = dict(zip(columns[:-1], columns[1:]))
        df2 = df[-1:].drop('+/-',axis=1).rename(columns=shift_right)
        df2['NAME'] = 'Total'
        df2['#'] = 0
        # Drop the last two original rows and append the relabelled total row;
        # sort=True sorts the column labels of the combined frame.
        df = pd.concat([df[:-2],df2],sort=True)

        df['#'] = df['#'].astype(int)

        # Split each "made-attempted" column into two integer columns; cells
        # that are not strings become NaN after .str.split and are reset to 0.
        three_point = df['3PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['3PM']=three_point[0].astype(int)
        df['3PA']=three_point[1].astype(int)

        free_throw = df['FTM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FTM']=free_throw[0].astype(int)
        df['FTA']=free_throw[1].astype(int)

        # Field goals = two-pointers + three-pointers.
        two_point = df['2PM-A'].str.split('-',n=1,expand=True).fillna(0)
        df['FGM']=two_point[0].astype(int)+df['3PM']
        df['FGA']=two_point[1].astype(int)+df['3PA']

        df['ROUND'] = round_name
        df['TEAM'] = teams[i]
        # i-1 selects the other team: -1 (the last entry) for i=0, and 0 for i=1.
        df['OPP'] = teams[i-1]

        # A separate loop name keeps the team index `i` from being overwritten.
        for col in colnames:
            if col not in text_columns:
                df[col] = df[col].astype(int)
        df = df[colnames]

        # The total row goes to the team file, all rows before it to the player file.
        players = df[:-1]
        team = df[-1:].reset_index(drop=True)
        team = team[colnames[0:3]+colnames[6:]]

        # newline='' lets the csv writer control line endings, as pandas
        # recommends for file objects passed to to_csv. In append mode tell()
        # is 0 only for an empty file, so the header is written once.
        with open(team_filename, 'a', newline='') as f:
            team.to_csv(f, header=f.tell()==0)
        with open(player_filename, 'a', newline='') as f:
            players.to_csv(f, header=f.tell()==0)

In [11]:
# Choose the parser that matches the selected season instead of hard-coding one.
# NOTE(review): assumes only 2019 pages need game_scrape_2019 and that
# 2016-2018 pages share the game_scrape_2018 layout - confirm for 2016.
scrape_game = game_scrape_2019 if year == '2019' else game_scrape_2018

# The scrape functions append to the CSV files, so running this cell twice
# duplicates every game in the output.
for link in link_list:
    print(link)
    # Loop-local names keep the schedule page's `soup`, `req` and `html` from
    # being overwritten, so earlier cells stay re-runnable after this one.
    game_req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    # Read inside a `with` block so each HTTP response is closed promptly.
    with urlopen(game_req) as response:
        game_html = response.read().decode('utf-8')
    scrape_game(BeautifulSoup(game_html, 'lxml'))

http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0905_30968_21682-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0907_21746_21745-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0909_16639_30968-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0909_21682_21747-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0910_21745_21746-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0912_21747_30968-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0916_21745_21747-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0916_16639_21746-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0917_21682_30968-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0919_21747_16639-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Game=2017_0921_21745_30968-Vietnam
http://www.eurobasket.com/Basketball-Box-Score.asp?Gam