# MGT 8803 Project
## Basketball Reference Web Scraper

### Fred Sackfield

This notebook scrapes Twitter handles from player pages on basketball-reference.com. The handles are written to a CSV file, which is then used as input for an R script that collects Twitter data for each player.

In [1]:
#import necessary libraries
import requests
from bs4 import BeautifulSoup
from urllib.error import URLError, HTTPError
import csv
from datetime import datetime
from time import sleep


In [2]:
def create_soup(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` when the request fails or the
        server answers with an HTTP error status. The failure is printed
        rather than raised.
    """
    try:
        page = requests.get(url, timeout=10)
        # requests does not raise on 4xx/5xx by itself; without this call an
        # error page would be parsed as if it were the requested content.
        page.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print('ERROR CODE: ', e.response.status_code)
        return None
    except requests.exceptions.RequestException as e:
        # Connection failures, timeouts, malformed URLs, etc. The urllib
        # exception classes are never raised by requests, so they must not
        # be the ones caught here.
        print('ERROR REASON: ', e)
        return None

    return BeautifulSoup(page.text, 'html.parser')
    

In [3]:
#Step 1 - create a list of all team-specific urls

home_url = 'https://www.basketball-reference.com'

home_soup = create_soup(home_url)
if home_soup is None:
    # create_soup is written to return None on a failed download; stop with a
    # clear message instead of an AttributeError on the next line.
    raise RuntimeError('Could not load ' + home_url)

team_hrefs = []

for team in home_soup.find_all('th',{'data-stat':'team_name'}):
    # Cells that carry an aria-label attribute are skipped
    # (presumably column headers - confirm against the page markup).
    if 'aria-label' in team.attrs:
        continue

    # Only keep cells that actually contain a link to a team page.
    team_link = team.find('a')
    if team_link is not None and team_link.has_attr('href'):
        team_hrefs.append(team_link['href'])

len(team_hrefs)


30

In [8]:
## store the player data in a list of dictionaries, which have the following keys:
## id, fullname, team, age, exp, college, handle

SEASON_ROW_ID = 'per_game.2020'   # id of the per-game stats row used for filtering
MIN_MINUTES_PER_GAME = 10         # players below this threshold are skipped
REQUEST_DELAY_SECONDS = 2         # pause before every page request


def _age_in_years(birth_date, on_date):
    """Return the number of completed calendar years between two dates."""
    had_birthday = (on_date.month, on_date.day) >= (birth_date.month, birth_date.day)
    return on_date.year - birth_date.year - (0 if had_birthday else 1)


def _minutes_per_game(player_soup):
    """Return minutes per game from the season row, or None if it cannot be read."""
    season_row = player_soup.find('tr', {'id': SEASON_ROW_ID})
    if season_row is None:
        return None
    minutes_cell = season_row.find('td', {'data-stat': 'mp_per_g'})
    if minutes_cell is None:
        return None
    try:
        return float(minutes_cell.text)
    except ValueError:
        # Empty or non-numeric cell.
        return None


def _twitter_handle(player_soup):
    """Return the text of the first link after the 'Twitter' label, or None."""
    label = player_soup.find(string='Twitter')
    if label is None:
        return None
    handle_link = label.find_next('a')
    return handle_link.text if handle_link is not None else None


players = []
today = datetime.today()  # computed once; only year/month/day are used below

for href in team_hrefs:
    team_code = href.split('/')[2]

    sleep(REQUEST_DELAY_SECONDS)
    team_soup = create_soup(home_url+href)
    roster_div = team_soup.find('div',{'id':'div_roster'}) if team_soup is not None else None
    roster = roster_div.find('tbody') if roster_div is not None else None
    if roster is None:
        print('Skipping '+team_code+': roster table not found')
        continue

    for player in roster.find_all('tr'):
        player_link = player.find('a')
        if player_link is None:
            # Row without a player link; nothing to scrape.
            continue
        pref = player_link.attrs['href']

        # Throttle player pages too, not only team pages, so that a whole
        # roster is not requested in one burst.
        sleep(REQUEST_DELAY_SECONDS)
        player_soup = create_soup(home_url+pref)
        if player_soup is None:
            continue

        #filter out players who don't have Twitter or who play < 10mpg
        minutes = _minutes_per_game(player_soup)
        if minutes is None or minutes < MIN_MINUTES_PER_GAME:
            continue
        handle = _twitter_handle(player_soup)
        if handle is None:
            continue

        player_cols = player.find_all('td')
        bday = datetime.strptime(player_cols[4].text, "%B %d, %Y")
        exp_text = player_cols[6].text

        players.append({
            'id': pref.split('/')[3].split('.')[0],
            'fullname': player_link.text,
            'team': team_code,
            # Calendar-accurate age; dividing a day count by 365 drifts with
            # leap years and is wrong around birthdays.
            'age': _age_in_years(bday, today),
            # 'R' is mapped to zero years of experience.
            'exp': int(exp_text) if exp_text != 'R' else 0,
            'college': player_cols[7].text,
            'handle': handle,
        })
    print('Collected players from '+team_code)
        

Collected players from MIL
Collected players from TOR
Collected players from BOS
Collected players from MIA
Collected players from IND
Collected players from PHI
Collected players from BRK
Collected players from ORL
Collected players from WAS
Collected players from CHO
Collected players from CHI
Collected players from NYK
Collected players from DET
Collected players from ATL
Collected players from CLE
Collected players from LAL
Collected players from LAC
Collected players from DEN
Collected players from UTA
Collected players from OKC
Collected players from HOU
Collected players from DAL
Collected players from MEM
Collected players from POR
Collected players from NOP
Collected players from SAC
Collected players from SAS
Collected players from PHO
Collected players from MIN
Collected players from GSW


In [9]:
# Report the number of collected player records.
player_count = len(players)
print(player_count)

383


In [None]:
## data cleaning - if player attended multiple colleges, take the most recent one
for p in players:
    # Split once; the last comma-separated entry is the one that is kept.
    colleges = p['college'].split(',')
    if len(colleges) > 1:
        # strip() removes the whitespace that surrounds the entry after
        # splitting on ',' (e.g. "A, B" would otherwise yield " B").
        p['college'] = colleges[-1].strip()

In [20]:
#write player data to a csv file

player_columns = ['id','fullname','team','age','exp','college','handle']

csv_file = "players.csv"
try:
    # newline='' is the mode the csv module documents for files it writes to;
    # the writer emits its own line terminators.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=player_columns)
        writer.writeheader()
        writer.writerows(players)
except OSError as e:
    # Still best-effort (no re-raise), but report which file failed and why
    # instead of discarding the details.
    print("I/O error while writing "+csv_file+":", e)