In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup # Most common library for dealing with html
import matplotlib.pyplot as plt
%matplotlib inline


# Using the requests library

In [32]:
res = requests.get('https://rldaggie.github.io/sample-html')

### Status Codes

In [33]:
res.status_code

200

### Creating a `BeautifulSoup` object

In [34]:
soup = BeautifulSoup(res.content)

In [35]:
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>The title</title>
<style media="screen">
      tbody tr {
        color: red;
      }
    </style>
</head>
<body>
<h1 class="foobar" id="title">This is an h1</h1>
<div>
<h1 class="foobar">This is yet another heading.</h1>

      Something inside the div
    </div>
<h3>Todo List</h3>
<ol class="todo">
<li class="foobar">Take out trash</li>
<li>Pay billz</li>
<li class="foobar">Feed dog</li>
</ol>
<h3>Completed</h3>
<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>
<p class="foobar">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. <span>Duis aute irure dolor</span> in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. <em>Excepteu

# `soup.find()`

Returns either:

1. A soup object of the first match
2. `None`

In [40]:
found = soup.find('h1')

In [41]:
found.attrs

{'class': ['foobar'], 'id': 'title'}

In [43]:
found.attrs['class']

['foobar']

### Can have multiple classes but only one ID

# `soup.find_all()`

Returns a **_LIST_** of soup objects that match your query

In [45]:
soup.find_all('h1')

[<h1 class="foobar" id="title">This is an h1</h1>,
 <h1 class="foobar">This is yet another heading.</h1>]

In [46]:
type(soup.find_all('h1'))

bs4.element.ResultSet

In [50]:
for element in soup.find_all('h1'):
    print(f'Element text:         {element.text}')
    print(f'Element attributes:   {element.attrs}')

Element text:       This is an h1
Element attributes: {'class': ['foobar'], 'id': 'title'}
Element text:       This is yet another heading.
Element attributes: {'class': ['foobar']}


In [51]:
[f.text for f in soup.find_all('h1')]

['This is an h1', 'This is yet another heading.']

# Creating a `pandas` DataFrame from a scrape

### Todo List

In [52]:
people = [
    {'name': 'Jireh', 'role': 'instructor'},
    {'name': 'Ryan', 'role': 'associate'},
]

pd.DataFrame(people)

Unnamed: 0,name,role
0,Jireh,instructor
1,Ryan,associate


In [53]:
soup.find('ol', {'class': 'done'})
# finds ordered list (ol) where 'class' is 'done'

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [59]:
done_tasks = soup.find('ol', {'class': 'done'})
done_tasks.find('li')

<li>Mow lawn</li>

### Previvously, html is the root.

### Now, ol does not have any parent aka is the root.

In [63]:
done_tasks.find_all('li')

[<li>Mow lawn</li>,
 <li class="foobar"><span>Take out compost</span></li>,
 <li><span>Create scraping lecture</span></li>]

In [62]:
[t.text for t in done_tasks.find_all('li')]

['Mow lawn', 'Take out compost', 'Create scraping lecture']

In [69]:
for i in done_tasks.find_all('li'):
    print(i.text)

Mow lawn
Take out compost
Create scraping lecture


In [76]:
d = []
for element in done_tasks.find_all('li'):
    d.append({'task': element.text})

In [78]:
d

[{'task': 'Mow lawn'},
 {'task': 'Take out compost'},
 {'task': 'Create scraping lecture'}]

In [77]:
pd.DataFrame(d)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### GA Directory

In [80]:
table = soup.find('table', {'id': 'directory'})

In [81]:
table

<table id="directory">
<thead>
<tr>
<th>Name</th>
<th>Role</th>
</tr>
</thead>
<tbody>
<tr class="student">
<th><a href="mailto:praveen@ga.co">Praveen



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:fred@ga.co">Fred



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:homer@ga.co">Homer



          </a></th>
<td><span class="foobar">Student</span></td>
</tr>
<tr class="student">
<th><a href="mailto:kyle@ga.co">Kyle



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:sam@ga.co">Sam



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:javier@ga.co">Javier



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:nengkuan@ga.co">Nengkuan



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:kieth@ga.co">Kieth



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href=

In [90]:
# Find one name first
# Use text.strip() to find name
table.find('tbody').find('tr').find('th').text.strip()

'Praveen'

In [93]:
# Then find all
for x in table.find('tbody').find_all('tr'):
    print(x.find('th').text.strip())

Praveen
Fred
Homer
Kyle
Sam
Javier
Nengkuan
Kieth
Bola
Steve
Nichole
Riley


In [104]:
# Find one email
table.find('tbody').find('th').find('a').attrs['href'].replace('mailto', '')

':praveen@ga.co'

In [107]:
# Then find all
for x in table.find('tbody').find_all('tr'):
    print(x.find('a').attrs['href'].replace('mailto:', ''))

praveen@ga.co
fred@ga.co
homer@ga.co
kyle@ga.co
sam@ga.co
javier@ga.co
nengkuan@ga.co
kieth@ga.co
bola@ga.co
steve@ga.co
nichole@ga.co
riley@ga.co


In [110]:
# Find one role
# This is more straightforward and will get the exact way it was written on the table
table.find('tbody').find('td').text

'Student'

In [111]:
# Then find all
for x in table.find('tbody').find_all('tr'):
    print(x.find('td').text)

Student
Student
Student
Student
Student
Student
Student
Student
Student
Student
Instructor
Instructor


In [113]:
# Combining all of it:
result = []
for element in table.find('tbody').find_all('tr'):
    row = {}
    row['name'] = element.find('th').text.strip()
    row['email'] = element.find('a').attrs['href'].replace('mailto:', '')
    row['role'] = element.find('td').text
    result.append(row)

In [114]:
result

[{'name': 'Praveen', 'email': 'praveen@ga.co', 'role': 'Student'},
 {'name': 'Fred', 'email': 'fred@ga.co', 'role': 'Student'},
 {'name': 'Homer', 'email': 'homer@ga.co', 'role': 'Student'},
 {'name': 'Kyle', 'email': 'kyle@ga.co', 'role': 'Student'},
 {'name': 'Sam', 'email': 'sam@ga.co', 'role': 'Student'},
 {'name': 'Javier', 'email': 'javier@ga.co', 'role': 'Student'},
 {'name': 'Nengkuan', 'email': 'nengkuan@ga.co', 'role': 'Student'},
 {'name': 'Kieth', 'email': 'kieth@ga.co', 'role': 'Student'},
 {'name': 'Bola', 'email': 'bola@ga.co', 'role': 'Student'},
 {'name': 'Steve', 'email': 'steve@ga.co', 'role': 'Student'},
 {'name': 'Nichole', 'email': 'nichole@ga.co', 'role': 'Instructor'},
 {'name': 'Riley', 'email': 'riley@ga.co', 'role': 'Instructor'}]

In [115]:
pd.DataFrame(result)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


### Basketball Reference

In [116]:
bball = requests.get('https://www.basketball-reference.com')

In [118]:
bball.status_code

200

In [123]:
soup = BeautifulSoup(bball.content)

In [124]:
soup.find_all('table')

[<table class="suppress_all sortable stats_table" data-cols-to-freeze=",1" id="confs_standings_E">
 <caption> Table</caption>
 <colgroup><col/><col/><col/><col/><col/></colgroup>
 <thead>
 <tr>
 <th aria-label="East" class="poptip sort_default_asc left" data-stat="team_name" scope="col">East</th>
 <th aria-label=" " class="poptip center" data-stat="franchise_text" scope="col"> </th>
 <th aria-label=" " class="poptip center" data-stat="payroll_text" scope="col"> </th>
 <th aria-label="Wins" class="poptip right" data-stat="wins" data-tip="Wins" scope="col">W</th>
 <th aria-label="Losses" class="poptip right" data-stat="losses" data-tip="Losses" scope="col">L</th>
 </tr>
 </thead>
 <tbody><tr class="full_table"><th class="left" data-stat="team_name" scope="row"><a href="/teams/PHI/2021.html" title="Philadelphia 76ers">PHI</a> * <span class="seed">(1) </span></th><td class="center" data-stat="franchise_text"><a href="/teams/PHI/" title="Philadelphia 76ers Franchise Index">F</a></td><td cla

In [139]:
soup.find_all('table', {'id': 'confs_standings_W'})

<tbody><tr class="full_table"><th class="left" data-stat="team_name" scope="row"><a href="/teams/UTA/2021.html" title="Utah Jazz">UTA</a> * <span class="seed">(1) </span></th><td class="center" data-stat="franchise_text"><a href="/teams/UTA/" title="Utah Jazz Franchise Index">F</a></td><td class="right" data-stat="payroll_text"><a href="/contracts/UTA.html" title="Utah Jazz Team Payroll">$</a></td><td class="right" data-stat="wins">52</td><td class="right" data-stat="losses">20</td></tr>
<tr class="full_table"><th class="left" data-stat="team_name" scope="row"><a href="/teams/PHO/2021.html" title="Phoenix Suns">PHO</a> * <span class="seed">(2) </span></th><td class="center" data-stat="franchise_text"><a href="/teams/PHO/" title="Phoenix Suns Franchise Index">F</a></td><td class="right" data-stat="payroll_text"><a href="/contracts/PHO.html" title="Phoenix Suns Team Payroll">$</a></td><td class="right" data-stat="wins">51</td><td class="right" data-stat="losses">21</td></tr>
<tr class="f

In [133]:
soup.find('table', {'id': 'confs_standings_W'}).find('tbody').find('tr').find('a').text

'UTA'

In [134]:
soup.find('table', {'id': 'confs_standings_W'}).find('tbody').find('a').attrs['title']

'Utah Jazz'

In [135]:
soup.find('table', {'id': 'confs_standings_W'}).find('tbody').find('tr').find('td', {'data-stat': 'wins'}).text

'52'

In [136]:
soup.find('table', {'id': 'confs_standings_W'}).find('tbody').find('tr').find('td', {'data-stat': 'losses'}).text

'20'

In [145]:
soup.find('table', {'id': 'confs_standings_W'}).find('tbody').find('tr').find('span').text.strip().replace('(', '').replace(')', '')

'1'

In [None]:
# .find('a').text                                                         ---> short form name
# .find('a').attrs['title']                                               ---> long form name
# .find('tr').find('td', {'data-stat': 'wins'}).text                      ---> wins
# .find('tr').find('td', {'data-stat': 'losses'}).text                    ---> losses
# .find('tr').find('span').text.strip().replace('(', '').replace(')', '') ---> rank



In [150]:
teams = []

for conf in ['E', 'W']:
    table = soup.find('table' , {'id': 'confs_standings_' + conf})
    for element in table.find('tbody').find_all('tr'):
        team = {}
        team['conference'] = conf
        team['shortname'] = element.find('a').text
        team['longname'] = element.find('a').attrs['title']
        team['wins'] = int(element.find('td', {'data-stat': 'wins'}).text)
        team['losses'] = int(element.find('td', {'data-stat': 'losses'}).text)
        team['rank'] = int(element.find('span').text.strip().replace('(', '').replace(')', ''))
        teams.append(team)

teams    

[{'conference': 'E',
  'shortname': 'PHI',
  'longname': 'Philadelphia 76ers',
  'wins': 49,
  'losses': 23,
  'rank': 1},
 {'conference': 'E',
  'shortname': 'BRK',
  'longname': 'Brooklyn Nets',
  'wins': 48,
  'losses': 24,
  'rank': 2},
 {'conference': 'E',
  'shortname': 'MIL',
  'longname': 'Milwaukee Bucks',
  'wins': 46,
  'losses': 26,
  'rank': 3},
 {'conference': 'E',
  'shortname': 'NYK',
  'longname': 'New York Knicks',
  'wins': 41,
  'losses': 31,
  'rank': 4},
 {'conference': 'E',
  'shortname': 'ATL',
  'longname': 'Atlanta Hawks',
  'wins': 41,
  'losses': 31,
  'rank': 5},
 {'conference': 'E',
  'shortname': 'MIA',
  'longname': 'Miami Heat',
  'wins': 40,
  'losses': 32,
  'rank': 6},
 {'conference': 'E',
  'shortname': 'BOS',
  'longname': 'Boston Celtics',
  'wins': 36,
  'losses': 36,
  'rank': 7},
 {'conference': 'E',
  'shortname': 'WAS',
  'longname': 'Washington Wizards',
  'wins': 34,
  'losses': 38,
  'rank': 8},
 {'conference': 'E',
  'shortname': 'IND',
 

In [151]:
pd.DataFrame(teams)

Unnamed: 0,conference,shortname,longname,wins,losses,rank
0,E,PHI,Philadelphia 76ers,49,23,1
1,E,BRK,Brooklyn Nets,48,24,2
2,E,MIL,Milwaukee Bucks,46,26,3
3,E,NYK,New York Knicks,41,31,4
4,E,ATL,Atlanta Hawks,41,31,5
5,E,MIA,Miami Heat,40,32,6
6,E,BOS,Boston Celtics,36,36,7
7,E,WAS,Washington Wizards,34,38,8
8,E,IND,Indiana Pacers,34,38,9
9,E,CHO,Charlotte Hornets,33,39,10
