# 1. Set Up

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# 2. Get list of links

In [2]:
# Seasons and teams covered by the scrape.
year_list = [2017, 2018, 2019]
teamName = [
    'Danang-Dragons',
    'Hanoi-Buffalos',
    'Can-Tho-Catfish',
    'Hochiminh-City-Wings',
    'Saigon-Heat',
    'Thang-Long-Warriors',
]
teamID = ['DND', 'HNB', 'CTC', 'HCW', 'SGH', 'TLW']

# Two-way lookup: hyphenated team name <-> three-letter team ID (paired by position).
teamDict = dict(zip(teamName, teamID))
teamDict2 = dict(zip(teamID, teamName))

# Base URL of each team page; the season year is appended to the `Year=` query parameter.
team_list = [
    'https://basketball.asia-basket.com/team/Vietnam/Can-Tho-Catfish/21746?Year=',
    'https://basketball.asia-basket.com/team/Vietnam/Saigon-Heat/16639?Year=',
    'https://basketball.asia-basket.com/team/Vietnam/Hochiminh-City-Wings/21747?Year=',
    'https://basketball.asia-basket.com/team/Vietnam/Thang-Long-Warriors/30968?Year=',
    'https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=',
    'https://basketball.asia-basket.com/team/Vietnam/Danang-Dragons/21682?Year=',
]

# One URL per (team, season) pair, grouped by team and ordered by year within each team.
url_list = [base_url + str(season) for base_url in team_list for season in year_list]
url_list

Test URL: https://basketball.asia-basket.com/team/Vietnam/Can-Tho-Catfish/21746?Year=2017


# 3. Build scrape function

In [6]:
# Fetch and parse a single team page at module level; this leaves `html` and `soup`
# available for inspection before the scrape function is defined.
url = 'https://basketball.asia-basket.com/team/Vietnam/Thang-Long-Warriors/30968?Year=2017'
# The request is sent with an explicit browser-like User-Agent header.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Read inside a context manager so the HTTP response is closed as soon as the body is read.
with urlopen(req) as response:
    web_byte = response.read()
html = web_byte.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

In [9]:
def profile_scrape(url):
    """Scrape one team-season roster page into a player DataFrame.

    Parameters
    ----------
    url : str
        Address of the team page to download (one entry of ``url_list``).

    Returns
    -------
    pandas.DataFrame
        One row per roster entry with the columns ``teamID``, ``year``, ``#``,
        ``first_middle``, ``last``, ``pos1``, ``pos2``, ``nat`` and ``height_cm``.

    Raises
    ------
    KeyError
        If the team name parsed from the page header is not a key of the
        module-level ``teamDict``.
    IndexError
        If the expected header cell or roster table is not found in the page.

    Notes
    -----
    Reads the module-level ``teamDict`` to translate the team name into its ID.
    """
    # Local import keeps this cell self-contained; pandas >= 2.1 deprecates passing
    # literal HTML to read_html, so the markup is wrapped in a file-like object.
    from io import StringIO

    # Start scrape pipeline
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as response:  # close the HTTP response once the body is read
        web_byte = response.read()
    html = web_byte.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')

    # Extract team name & year from the first 'tabletop' cell:
    # the first run of digits is the year, the capitalised words joined by '-' are the team name.
    header = str(soup.find_all('td', {'class': 'tabletop'})[0])
    year = int(re.findall(r'(\d+)', header)[0])
    team_name = "-".join([match[0] for match in re.findall(r'(([A-Z])\w+)', header)])

    # Locate main table: it is the 5th matching table, except on one specific page
    # where it is the 4th.
    table_idx = 4
    if url in ['https://basketball.asia-basket.com/team/Vietnam/Thang-Long-Warriors/30968?Year=2017']:
        table_idx = 3
    # Table to df
    table = soup.find_all('table', {'align': "center", 'cellpadding': "0", 'cellspacing': "0"})[table_idx]
    df = pd.read_html(StringIO(str(table)), header=0)[0]
    df.columns = df.columns.str.lower()

    # Drop the trailing `coach` row, if the last '#' cell mentions a coach
    last_number = df['#'].iloc[-1:].astype(str).str.lower()
    if last_number.str.contains('coach', regex=False).any():
        df = df[:-1]

    # Drop the `bo` and `nat` columns (`nat` is rebuilt from the flag images below)
    df = df.drop(['bo', 'nat'], axis=1)

    # Set 'Nationality': alt text of the first image in each table row
    # (only the first nationality is kept if a row shows several; rows without an image are skipped)
    nationality = []
    for row in table.find_all('tr'):
        flag = row.find('img')
        if flag is not None:
            nationality.append(flag.get('alt'))
    df['nat'] = nationality

    # Split 'Height' column: treat the "0 (0'0'')" placeholder as missing, keep the cm part
    # (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
    df['cm (inch)'] = df['cm (inch)'].replace("0 (0'0'')", np.nan)
    df['height_cm'] = df['cm (inch)'].str.split(' ', n=2, expand=True)[0].astype(float)
    df = df.drop('cm (inch)', axis=1)

    # Convert '#' column and fill in NaN with '-1'
    df['#'] = df['#'].fillna(-1).astype(int)

    # Split 'Pos' column into primary / secondary position
    pos_parts = df['pos'].str.split('/', n=2, expand=True)
    df['pos1'] = pos_parts[0]
    if pos_parts.shape[1] > 1:
        df['pos2'] = pos_parts[1]
    else:
        # No row on this page lists a second position
        df['pos2'] = np.nan
    df = df.drop('pos', axis=1)

    # Split 'Name' column
    def name_sep(strName):
        """Return (all words but the last joined by '-', last word) of a space-separated name."""
        words = strName.split(' ')
        return '-'.join(words[:-1]), words[-1]

    # Split each name once and reuse the pair for both output columns
    name_parts = [name_sep(name) for name in df['name']]
    df['first_middle'] = [parts[0] for parts in name_parts]
    df['last'] = [parts[1] for parts in name_parts]
    df = df.drop('name', axis=1)

    # Set 'teamID' & 'year' Column
    df['teamID'] = teamDict[team_name]
    df['year'] = year

    # Rearrange columns into the final layout
    colnames = ['teamID', 'year', "#", 'first_middle', 'last', 'pos1', 'pos2', 'nat', 'height_cm']
    return df[colnames]

# Smoke-test the scraper on the first generated URL; the returned frame is the cell output.
print("Test URL: ", url_list[0], sep="")
profile_scrape(url_list[0])

Unnamed: 0,teamID,year,#,first_middle,last,pos1,pos2,nat,height_cm
0,CTC,2017,21,Hamilton,DeAngelo,F,C,USA,203.0
1,CTC,2017,23,Dinh-Duy,Tan,G,F,Vietnam,192.0
2,CTC,2017,11,Thanh,Dinh,,,Vietnam,
3,CTC,2017,8,Ngcc,Nguycn,,,Vietnam,
4,CTC,2017,65,Huynh-Huu,Thang,F,,Vietnam,
5,CTC,2017,-1,Brown,Khalil,F,,USA,206.0
6,CTC,2017,8,Linh-Tran,Vu,F,,Vietnam,187.0
7,CTC,2017,-1,Le-Van,Day,F,,Vietnam,185.0
8,CTC,2017,6,Nguycn,Hoang,,,Vietnam,
9,CTC,2017,12,Hucnh,Nguycn,,,Vietnam,


# 4. Scrape & Save to csv

In [11]:
# Scrape every team-season page and append each roster to a single CSV file.
# NOTE: the file is opened in append mode, so re-running this cell adds the same rows
# again; delete player_profile.csv first when a clean export is needed.
for url in url_list:
    print("Started: "+url)
    df = profile_scrape(url)
    print("Completed: "+url)
    # newline='' disables newline translation on the text handle, as pandas asks for
    # file objects passed to to_csv; without it Windows output gets doubled carriage returns.
    with open('player_profile.csv', 'a', encoding='utf-8', newline='') as f:
        # The position is 0 only while the file is still empty, so the header is written once.
        df.to_csv(f, header=f.tell()==0)

Started: https://basketball.asia-basket.com/team/Vietnam/Thang-Long-Warriors/30968?Year=2019
Completed: https://basketball.asia-basket.com/team/Vietnam/Thang-Long-Warriors/30968?Year=2019
Started: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2017
Completed: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2017
Started: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2018
Completed: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2018
Started: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2019
Completed: https://basketball.asia-basket.com/team/Vietnam/Hanoi-Buffalos/21745?Year=2019
Started: https://basketball.asia-basket.com/team/Vietnam/Danang-Dragons/21682?Year=2017
Completed: https://basketball.asia-basket.com/team/Vietnam/Danang-Dragons/21682?Year=2017
Started: https://basketball.asia-basket.com/team/Vietnam/Danang-Dragons/21682?Year=2018
Completed: h