In [1]:
import requests
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import time
import pandas as pd

In [2]:
# Download the box-score page that the rest of the notebook parses.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# page flow into the parsing cells below.
webpage = requests.get(
    'https://www.basketball-reference.com/boxscores/201810190ORL.html',
    timeout=30,
)
webpage.raise_for_status()

In [3]:
# Pieces of the box-score URL for the game being scraped.
team = 'ORL'  # team code that appears at the end of the URL
year = 2018   # calendar date of the game
month = 10
day = 19

In [4]:
# Build the box-score URL as <YYYY><MM><DD>0<TEAM>.html.
# Month and day are zero-padded to two digits so that single-digit values
# (e.g. month 1, day 5) still produce an eight-digit date. The "0" before the
# team code is a fixed part of the URL, written as a plain character rather
# than as an f-string expression.
web_name = f'https://www.basketball-reference.com/boxscores/{year}{month:02d}{day:02d}0{team}.html'

In [5]:
web_name

'https://www.basketball-reference.com/boxscores/201810190ORL.html'

In [6]:
# Fetch the page through the templated URL to confirm the URL pattern works.
# The timeout and status check make a stalled connection or an HTTP error
# fail in this cell rather than going unnoticed.
web_many = requests.get(web_name, timeout=30)
web_many.raise_for_status()

In [7]:
# now that I have the framework, I can create a list or dictionary to loop through to run the scrapes

In [8]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=webpage.text, features='html.parser')

In [9]:
# Peek at the first <tr> elements on the page. Limiting the search to two rows
# is enough to reach the column-header row that the next cell reads.
soup.find_all('tr', limit=2)

[<tr class="over_header"><th></th>
 <th aria-label="" class=" over_header center" colspan="20" data-stat="header_tmp">Basic Box Score Stats</th>
 </tr>, <tr>
 <th aria-label="Starters" class=" poptip sort_default_asc center" data-stat="player" scope="col">Starters</th>
 <th aria-label="Minutes Played" class=" poptip center" data-over-header="Basic Box Score Stats" data-stat="mp" data-tip="Minutes Played" scope="col">MP</th>
 <th aria-label="Field Goals" class=" poptip center" data-over-header="Basic Box Score Stats" data-stat="fg" data-tip="Field Goals" scope="col">FG</th>
 <th aria-label="Field Goal Attempts" class=" poptip center" data-over-header="Basic Box Score Stats" data-stat="fga" data-tip="Field Goal Attempts" scope="col">FGA</th>
 <th aria-label="Field Goal Percentage" class=" poptip center" data-over-header="Basic Box Score Stats" data-stat="fg_pct" data-tip="Field Goal Percentage" scope="col">FG%</th>
 <th aria-label="3-Point Field Goals" class=" poptip center" data-over-he

## Data I need to extract: 
FGM, 3P, FGA, FT, ORB, Opp DRB, TOV, FTA

In [10]:
# Column labels: the second <tr> on the page (index 1) is where the header
# cells live, so read the text of every <th> inside it.
headers_four_factors = [
    header_cell.get_text()
    for header_cell in soup.find_all('tr', limit=2)[1].find_all('th')
]

In [11]:
print(headers_four_factors)

['Starters', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-']


## This has everything except the opponent's DRB, but I can get that from the matchup

In [12]:
# Skip the first two <tr> elements (the over-header and the column-header
# row); everything after them is treated as a data row.
rows = soup.find_all('tr')[2:]
# One list of cell texts per row. Rows with no <td> cells (such as the repeated
# 'Reserves' header row) come out as empty lists. The player name sits in a
# <th> rather than a <td>, so it is not part of these lists; it is collected
# separately and attached to the stats afterwards.
player_stats1 = [
    [cell.get_text() for cell in row.find_all('td')]
    for row in rows
]

In [13]:
# Stat cells (<td>) of the first player row in the box score.
rows[0].find_all('td')

[<td class="right " csk="1799" data-stat="mp">29:59</td>,
 <td class="right " data-stat="fg">6</td>,
 <td class="right " data-stat="fga">8</td>,
 <td class="right " data-stat="fg_pct">.750</td>,
 <td class="right " data-stat="fg3">3</td>,
 <td class="right " data-stat="fg3a">4</td>,
 <td class="right " data-stat="fg3_pct">.750</td>,
 <td class="right iz" data-stat="ft">0</td>,
 <td class="right iz" data-stat="fta">0</td>,
 <td class="right iz" data-stat="ft_pct"></td>,
 <td class="right iz" data-stat="orb">0</td>,
 <td class="right " data-stat="drb">6</td>,
 <td class="right " data-stat="trb">6</td>,
 <td class="right " data-stat="ast">5</td>,
 <td class="right " data-stat="stl">2</td>,
 <td class="right iz" data-stat="blk">0</td>,
 <td class="right " data-stat="tov">2</td>,
 <td class="right iz" data-stat="pf">0</td>,
 <td class="right " data-stat="pts">15</td>,
 <td class="right " data-stat="plus_minus">+32</td>]

In [14]:
# Confirms the suspicion: the player name is stored in the row's <th> element,
# not in any of its <td> cells.
rows[0].find_all('th')

[<th class="left " csk="Batum,Nicolas" data-append-csv="batumni01" data-stat="player" scope="row"><a href="/players/b/batumni01.html">Nicolas Batum</a></th>]

In [15]:
# Text of every <th> in each row. Player rows give a one-item list holding the
# name; repeated header rows give the full list of column labels instead.
player_names1 = [
    [head_cell.get_text() for head_cell in row.find_all('th')]
    for row in rows
]

In [16]:
print(player_names1[:10])

[['Nicolas Batum'], ['Kemba Walker'], ['Jeremy Lamb'], ['Cody Zeller'], ['Marvin Williams'], ['Reserves', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-'], ['Malik Monk'], ['Michael Kidd-Gilchrist'], ['Willy Hernangómez'], ['Tony Parker']]


In [17]:
# Box-score frame built from the scraped <td> values. The first header label
# ('Starters') is dropped because the name column is not among the <td> cells.
stats = pd.DataFrame(data=player_stats1, columns=headers_four_factors[1:])

In [18]:
stats[:10]

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,29:59,6.0,8.0,0.75,3.0,4.0,0.75,0.0,0.0,,0.0,6.0,6.0,5.0,2.0,0.0,2.0,0.0,15.0,32.0
1,26:46,8.0,16.0,0.5,5.0,10.0,0.5,5.0,5.0,1.0,0.0,2.0,2.0,5.0,1.0,0.0,3.0,0.0,26.0,34.0
2,25:05,2.0,7.0,0.286,0.0,2.0,0.0,4.0,4.0,1.0,1.0,6.0,7.0,1.0,1.0,1.0,0.0,0.0,8.0,19.0
3,22:45,3.0,8.0,0.375,0.0,0.0,,2.0,2.0,1.0,3.0,5.0,8.0,2.0,1.0,0.0,0.0,2.0,8.0,17.0
4,20:29,3.0,7.0,0.429,2.0,6.0,0.333,0.0,0.0,,1.0,3.0,4.0,0.0,0.0,0.0,1.0,0.0,8.0,13.0
5,,,,,,,,,,,,,,,,,,,,
6,23:37,4.0,12.0,0.333,2.0,6.0,0.333,1.0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,3.0,11.0,5.0
7,22:29,5.0,8.0,0.625,0.0,2.0,0.0,2.0,4.0,0.5,2.0,7.0,9.0,5.0,0.0,2.0,1.0,1.0,12.0,24.0
8,17:18,2.0,5.0,0.4,1.0,1.0,1.0,1.0,2.0,0.5,3.0,2.0,5.0,2.0,1.0,0.0,2.0,4.0,6.0,15.0
9,16:12,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,3.0,3.0,6.0,0.0,0.0,2.0,2.0,0.0,3.0


In [19]:
# Separate frame holding the <th> texts (player names and header labels).
player = pd.DataFrame(data=player_names1)

In [20]:
# Keep only column 0 of the <th> texts (the player name, or a header label on
# header rows) and trim it to the number of rows in `stats` so the two line up.
# The column is rebuilt from `player_names1` instead of slicing `player`
# itself, which keeps the cell safe to re-run: once `player` is a Series,
# `player[0]` would be a single name string rather than a column.
player = pd.DataFrame(player_names1)[0][:len(stats)]

In [21]:
player.shape

(66,)

In [22]:
stats.shape

(66, 20)

In [23]:
# Attach the player names to the stats frame as a new 'Player' column.
# Note: this mutates `stats` in place.
stats['Player'] = player

In [24]:
stats[:3]

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Player
0,29:59,6,8,0.75,3,4,0.75,0,0,,...,6,6,5,2,0,2,0,15,32,Nicolas Batum
1,26:46,8,16,0.5,5,10,0.5,5,5,1.0,...,2,2,5,1,0,3,0,26,34,Kemba Walker
2,25:05,2,7,0.286,0,2,0.0,4,4,1.0,...,6,7,1,1,1,0,0,8,19,Jeremy Lamb


In [25]:
# I need to create a unique identifier that I can scrape from rows

Now I need to make sure I clean up the fields so that I do not have blank rows

# Need to capture which teams are playing, define the unique identifier (date + home team + away team), and then work out how to cross-reference those databases

In [26]:
# Every element with class "scorebox"; the first one carries the team names,
# scores, and game date/venue used in the cells below.
rows2 = soup.find_all(class_='scorebox')

In [27]:
rows2

[<div class="scorebox">
 <div>
 <div itemprop="performer" itemscope="" itemtype="https://schema.org/Organization">
 <div class="media-item logo loader">
 <img alt="2019 Charlotte Hornets Logo" class="teamlogo" itemscope="image" src="https://d2p3bygnnzw9w3.cloudfront.net/req/201908281/tlogo/bbr/CHO-2019.png"/>
 <p><a href="http://www.sportslogos.net/">via Sports Logos.net</a></p>
 <p><a href="https://www.sports-reference.com/blog/2016/06/redesign-team-and-league-logos-courtesy-sportslogos-net/">About logos</a></p>
 </div>
 <strong>
 <a href="/teams/CHO/2019.html" itemprop="name">Charlotte Hornets</a>
 </strong>
 </div>
 <div class="scores">
 <div class="score">120</div>
 </div><div>1-1</div>
 <div class="prevnext">
 <a class="button2 prev" href="/boxscores/201810170CHO.html">Prev Game</a>
 <a class="button2 next" href="/boxscores/201810200MIA.html">Next Game</a>
 </div>
 </div>
 <div>
 <div itemprop="performer" itemscope="" itemtype="https://schema.org/Organization">
 <div class="media-

In [28]:
# Team names are the text of the <strong> elements inside the scorebox.
overall_teams = [name_tag.get_text() for name_tag in rows2[0].find_all('strong')]

In [29]:
# Drop the newlines that surround each team name in the scraped text.
overall_teams = [team_name.strip('\n') for team_name in overall_teams]
overall_teams

['Charlotte Hornets', 'Orlando Magic']

In [30]:
# Final scores are the text of the elements with class "scores".
overall_score = [score_tag.get_text() for score_tag in rows2[0].find_all(class_='scores')]

In [31]:
# Drop the surrounding newlines; the scores stay as strings.
overall_score = [score_text.strip('\n') for score_text in overall_score]
overall_score

['120', '88']

In [32]:
# Inspect the scorebox_meta block, which holds the tip-off time, date and venue.
rows2[0].find_all(class_='scorebox_meta')

[<div class="scorebox_meta">
 <div>7:00 PM, October 19, 2018</div><div>Amway Center, Orlando, Florida</div>
 <div><em>Logos <a href="http://www.sportslogos.net/">via Sports Logos.net</a>
             / <a href="//www.sports-reference.com/blog/2016/06/redesign-team-and-league-logos-courtesy-sportslogos-net/">About logos</a></em></div>
 </div>]

In [33]:
# Raw text of the scorebox_meta block (time, date, venue and logo credits).
overall_date = [meta_tag.get_text() for meta_tag in rows2[0].find_all(class_='scorebox_meta')]

In [34]:
overall_date

['\n7:00 PM, October 19, 2018Amway Center, Orlando, Florida\nLogos via Sports Logos.net\n            / About logos\n']

In [35]:
# Trim the leading and trailing newlines from the meta text.
overall_date = [meta_text.strip('\n') for meta_text in overall_date]

In [36]:
# Split on commas: the pieces are the time, the month/day, and the year
# (which is still fused to the start of the venue name).
overall_date_2 = [meta_text.split(',') for meta_text in overall_date]

In [37]:
overall_date_2[0][:4]

['7:00 PM', ' October 19', ' 2018Amway Center', ' Orlando']

In [38]:
overall_date_2[0][2][:5].strip(' ')

'2018'

In [39]:
def date_adjustment(scorebox=None):
    """Pull the tip-off time, month/day and year out of a box-score scorebox.

    Args:
        scorebox: Parsed scorebox element to read. Any object with a
            ``findAll(class_=...)`` method whose results offer ``getText()``
            works. When omitted, the notebook-level ``rows2[0]`` is used, so
            existing zero-argument calls behave as before.

    Returns:
        A three-item list of strings ``[time, month_day, year]``, for example
        ``['7:00 PM', ' October 19', '2018']``. The month/day piece keeps the
        leading space left over from splitting on commas.

    Raises:
        ValueError: If the scorebox has no ``scorebox_meta`` element, or its
            text does not contain at least three comma-separated pieces.
    """
    if scorebox is None:
        # Fall back to the scorebox scraped earlier in the notebook.
        scorebox = rows2[0]

    meta_blocks = scorebox.findAll(class_='scorebox_meta')
    if not meta_blocks:
        raise ValueError("no 'scorebox_meta' element found in the scorebox")

    # Text looks like '\n7:00 PM, October 19, 2018Amway Center, ...'.
    pieces = meta_blocks[0].getText().strip('\n').split(',')
    if len(pieces) < 3:
        raise ValueError(
            f"expected 'time, month day, year...' in scorebox_meta text, got {pieces!r}"
        )

    # The third piece starts with ' 2018' fused to the venue name, so keep the
    # first five characters and drop the leading space.
    return [pieces[0], pieces[1], pieces[2][:5].strip(' ')]

In [40]:
date_adjustment()

['7:00 PM', ' October 19', '2018']

In [41]:
# Join the three date parts into one string. Each part is stripped first
# because the month/day piece comes back with a leading space (' October 19'),
# which would otherwise leave a double space after the time.
date_list = ' '.join(part.strip() for part in date_adjustment())

In [42]:
# One copy of the game date per team, so the list lines up row-for-row with
# `overall_teams`. Building it in a single expression keeps the cell safe to
# re-run; appending in separate cells would grow the list on every re-run and
# break the later column assignment.
date_list_1 = [date_list] * len(overall_teams)

In [45]:
date_list_1

['7:00 PM  October 19 2018', '7:00 PM  October 19 2018']

In [46]:
# Game summary frame: start with one row per team.
teams_scores = pd.DataFrame({'Team_Name': overall_teams})

In [47]:
# Add the final scores (still strings); `overall_score` was scraped in the same
# order as `overall_teams`. Note: this mutates `teams_scores` in place.
teams_scores['Score'] = overall_score

In [48]:
# Add the game date to every team row; `date_list_1` must have one entry per
# row. Note: this mutates `teams_scores` in place.
teams_scores['Date'] = date_list_1

In [49]:
teams_scores

Unnamed: 0,Team_Name,Score,Date
0,Charlotte Hornets,120,7:00 PM October 19 2018
1,Orlando Magic,88,7:00 PM October 19 2018


In [50]:
# all this code should be written into a function and then run the function for each scrape

# Now I need to create a unique identifier that allows me to link my box score stats with my team summary info
This could actually be based on the search criteria for the original website scrape, i.e. the URL

##### Then I will need to figure out how the data on the website is structured to scrape each boxscore individually

# Then I need to think about how to feed the data into a database: a PostgreSQL database using a Docker instance

In [51]:
# Look at the "Prev Game" links in the scorebox; each href carries a box-score
# id (for example 201810170CHO).
rows2[0].find_all(class_="button2 prev")

[<a class="button2 prev" href="/boxscores/201810170CHO.html">Prev Game</a>,
 <a class="button2 prev" href="/boxscores/201810170ORL.html">Prev Game</a>]

In [52]:
# The link text is only the label; the box-score id lives in the href instead.
[link.get_text() for link in rows2[0].find_all(class_="button2 prev")]

['Prev Game', 'Prev Game']

In [53]:
# I need to extract data from the rows2 scrape segment to create the unique identifier
# and add to my dataframe