In [1]:
import re       # regex -- sequence of characters forming a search pattern
import requests # sending HTTP requests and communicate with web servers -- download data from web
from bs4 import BeautifulSoup # pulling/parsing data from HTML, XML files

import pandas as pd 
import numpy as np 
from time import sleep 

We are going to use [Sports-reference.com](https://www.sports-reference.com/), where you can get player-level statistics for free.

<img src="http://www.cwhenrypta.org/wp-content/uploads/2014/10/sports-reference-logo.png" width=25% align="left"/>
<br/>
<br/>
<br/>

<br/>Let's start with one specific player: `Larry Bird`. <br/>
<br/>His information is listed at: https://www.basketball-reference.com/players/b/birdla01.html <br/>

In [2]:
# Download the player's page once; the cells below run regexes against the raw HTML string.
target_url = "https://www.basketball-reference.com/players/b/birdla01.html"

# timeout: without one, requests may wait indefinitely on a stalled connection.
# raise_for_status: fail loudly on a 4xx/5xx reply instead of silently parsing an
# error page (which would make every regex below return an empty list).
page = requests.get(target_url, timeout=30)
page.raise_for_status()

response = page.text  # the HTML as a str (despite the name, not a Response object)
response[:500]        # preview only -- displaying the whole document floods the output



### 1. Scrape Player's full name and page unique identifier -- birdla01

In [3]:
# The r'' prefix makes a raw string: backslashes are passed to the regex engine as-is
# instead of being processed by Python as string escapes.
# Inside the pattern, \n is a newline and \t is a tab, so the match relies on the exact
# line break and indentation between <h1> and <span> in the page source.
# (.+?) lazily captures the text between <span> and </span>.

full_name = re.compile(r'\n\t<h1>\n\t\t<span>(.+?)<\/span>').findall(response)
full_name

['Larry Bird']

In [4]:
# The player ID is the file name of the page URL: .../<letter>/<player_id>.html
#   /[A-Za-z]/  -- the single-letter directory segment
#   ([^/]+?)    -- capture the file name; it may not contain a slash
#   \.html      -- a literal ".html" (an unescaped "." would match any character)
p_id = re.findall(r'/[A-Za-z]/([^/]+?)\.html', target_url)
p_id

['birdla01']

### 2. Scrape NBA Debut Date

In [5]:
# Pattern walk-through:
#   NBA Debut: </strong>  -- the literal label text followed by the closing </strong> tag
#   [^<]*                 -- zero or more characters that are not "<", i.e. whatever
#                            sits between </strong> and the opening <a> tag
#   <a href="[^"]+">      -- the link tag; [^"]+ is one or more characters that are not
#                            a double quote, which consumes the href value
#   (.+?)</a>             -- lazily capture the link text up to the closing </a>
nba_debut = re.compile(r'NBA Debut: </strong>[^<]*<a href="[^"]+">(.+?)</a>').findall(response)
nba_debut

['October 12, 1979']

### 3. Scrape Shooting Hand

In [6]:
# Pattern walk-through:
#   Shoots:    -- the literal label text
#   \s*        -- zero or more whitespace characters (space, tab, newline)
#   </strong>  -- the literal closing tag, again followed by optional whitespace (\s*)
#   (\w+)      -- capture one or more word characters (letters, digits, underscore)
shooting_hand = re.compile(r'Shoots:\s*</strong>\s*(\w+)').findall(response)
shooting_hand

['Right']

### 4. Scrape Date of birth

In [7]:
# Pattern walk-through:
#   data-birth="  -- the literal attribute name and its opening quote
#   (.+?)         -- "." is any character except a newline, "+" means one or more, and
#                    the trailing "?" makes the match lazy: it stops at the first '">'
dob = re.compile(r'data-birth="(.+?)">').findall(response)
dob

['1956-12-07']

### 5. Scrape Draft Team

In [9]:
# The draft link has the form /teams/<ABBR>/draft.html. Match any team abbreviation
# ([^/"]+) instead of hard-coding BOS, so the cell also works for players drafted by
# other teams.
#   \.html        -- a literal ".html" (an unescaped "." would match any character)
#   \s*(.*?)\s*   -- capture the link text without surrounding whitespace
draft_team = re.findall(r'Draft:\s*</strong>\s*<a href="/teams/[^/"]+/draft\.html">\s*(.*?)\s*</a>', response)
draft_team

['Boston Celtics']