In [1]:
# imports
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [2]:
url = 'https://rldaggie.github.io/sample-html/' # open on your web browser to relate downstream outputs
# requests waits indefinitely by default; a timeout keeps this cell from hanging
# forever if the server never answers.
res = requests.get(url, timeout=10) # get contents by hitting url

### Status Codes

In [3]:
res.status_code # to confirm that the request has succeeded

200

### Creating a `BeautifulSoup` object

In [4]:
res.text # returns html

'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <title>The title</title>\n\n    <style media="screen">\n      tbody tr {\n        color: red;\n      }\n    </style>\n  </head>\n  <body>\n    <h1 class="foobar" id="title">This is an h1</h1>\n\n    <div>\n      <h1 class="foobar">This is yet another heading.</h1>\n\n      Something inside the div\n    </div>\n\n    <h3>Todo List</h3>\n    <ol class="todo">\n      <li class="foobar">Take out trash</li>\n      <li>Pay billz</li>\n      <li class="foobar">Feed dog</li>\n    </ol>\n\n    <h3>Completed</h3>\n    <ol class=\'done\'>\n      <li>Mow lawn</li>\n      <li class="foobar"><span>Take out compost</span></li>\n      <li><span>Create scraping lecture</span></li>\n    </ol>\n\n    <p class=\'foobar\'>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo

In [5]:
# Turn the HTML string into a searchable tree, using lxml as the parser backend.
soup = BeautifulSoup(res.text, features='lxml')
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>The title</title>
<style media="screen">
      tbody tr {
        color: red;
      }
    </style>
</head>
<body>
<h1 class="foobar" id="title">This is an h1</h1>
<div>
<h1 class="foobar">This is yet another heading.</h1>

      Something inside the div
    </div>
<h3>Todo List</h3>
<ol class="todo">
<li class="foobar">Take out trash</li>
<li>Pay billz</li>
<li class="foobar">Feed dog</li>
</ol>
<h3>Completed</h3>
<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>
<p class="foobar">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. <span>Duis aute irure dolor</span> in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. <em>Excepteu

# `soup.find()`

Returns either:

1. A `Tag` object for the **first match**
2. `None` if nothing matches

In [6]:
h1 = soup.find('h1') # first <h1> match; compare with the 'soup' output above to see it is the first <h1> in the page
h1

<h1 class="foobar" id="title">This is an h1</h1>

In [7]:
type(h1) # find() hands back a bs4 Tag object (here wrapping the <h1> heading element)

bs4.element.Tag

In [8]:
h1.text # to get text 'within' tag

'This is an h1'

In [9]:
h1.attrs # to get attributes associated with the <h1> tag (attributes provide additional information about HTML elements)

{'class': ['foobar'], 'id': 'title'}

In [10]:
# find() yields None when the page has no <h1>, so check before reading .text
if h1 is not None:
    print(h1.text)

This is an h1


# `soup.find_all()`

Returns a **_LIST_** of **_ALL_** `Tag` objects that match your query (an empty list if nothing matches)

In [11]:
h1_tags = soup.find_all('h1')
h1_tags

[<h1 class="foobar" id="title">This is an h1</h1>,
 <h1 class="foobar">This is yet another heading.</h1>]

In [12]:
[tag.text for tag in h1_tags] # list comprehension collecting the 'text' of every <h1> tag (nothing is filtered out)

['This is an h1', 'This is yet another heading.']

In [13]:
[tag.attrs for tag in h1_tags] # list comprehension collecting the 'attributes' dict of every <h1> tag (nothing is filtered out)

[{'class': ['foobar'], 'id': 'title'}, {'class': ['foobar']}]

# Creating a `pandas` DataFrame from a scrape

In [14]:
# Refresher: pandas builds a DataFrame from a list of row dicts,
# where each dict maps column name -> value for one row.
people = [
    dict(name='Bethany', market='BOS'),
    dict(name='Tucker', market='NYC'),
]

pd.DataFrame(people)

Unnamed: 0,name,market
0,Bethany,BOS
1,Tucker,NYC


In [15]:
# Pull out the first ordered list whose class is 'done'
ol = soup.find('ol', class_='done')
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [16]:
# Goal: a dataframe with a single 'task' column holding the text of every <li> in the list.
# Each <li> becomes one {'task': ...} record; the list of records is what pandas consumes.
todos = [{'task': item.text} for item in ol.find_all('li')]

# Show each record, then the full list, to make the intermediate shapes visible
for todo in todos:
    print(f"todo_dict: {todo}")

print(f"todos_list: {todos}")
pd.DataFrame(todos)

todo_dict: {'task': 'Mow lawn'}
todo_dict: {'task': 'Take out compost'}
todo_dict: {'task': 'Create scraping lecture'}
todos_list: [{'task': 'Mow lawn'}, {'task': 'Take out compost'}, {'task': 'Create scraping lecture'}]


Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### Directory

In [17]:
# Locate the <table> element whose id attribute is "directory"
table = soup.find('table', id='directory')
table

<table id="directory">
<thead>
<tr>
<th>Name</th>
<th>Role</th>
</tr>
</thead>
<tbody>
<tr class="student">
<th><a href="mailto:praveen@ga.co">Praveen



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:fred@ga.co">Fred



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:homer@ga.co">Homer



          </a></th>
<td><span class="foobar">Student</span></td>
</tr>
<tr class="student">
<th><a href="mailto:kyle@ga.co">Kyle



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:sam@ga.co">Sam



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:javier@ga.co">Javier



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:nengkuan@ga.co">Nengkuan



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href="mailto:kieth@ga.co">Kieth



          </a></th>
<td>Student</td>
</tr>
<tr class="student">
<th><a href=

In [18]:
# Same pattern as the todo list: build one record per table row, then hand the
# list of records to pandas.
people = []
for row in table.find('tbody').find_all('tr'): # first <tbody>, then every <tr> inside it
    link = row.find('a') # look the anchor up once: it carries both the name and the email
    person = {
        'name': link.text.strip(), # anchor text is padded with newlines/spaces, so strip it
        'email': link.attrs['href'].replace('mailto:', ''), # drop the 'mailto:' scheme from the href
        'role': row.find('td').text.strip() # stripped for consistency with 'name'
    }

    people.append(person)

pd.DataFrame(people)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


### Basketball Reference

In [19]:
url = 'https://www.basketball-reference.com/'
# requests waits indefinitely by default; a timeout keeps this cell from hanging
# forever if the site is slow or unresponsive.
res = requests.get(url, timeout=10)

In [20]:
res.status_code

200

In [21]:
# Parse the response body with the lxml parser; this rebinds `soup` to the new page.
soup = BeautifulSoup(res.content, features='lxml')
soup

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
<head id="suppress_all_ads">
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202111121" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://quantcast.mgr.consensu.org'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js')
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, fir

In [22]:
# Collect one record per team from the standings table of each conference ('E' and 'W'),
# then build a single DataFrame from all records.
teams = []
for conf in ['E', 'W']:
    # Each conference's table is identified by id 'confs_standings_<conf>'
    table = soup.find('table', {'id': f'confs_standings_{conf}'})
    for row in table.find('tbody').find_all('tr'):
        link = row.find('a') # look the anchor up once: text is the slug, title is the full name
        team = {
            'slug': link.text,
            'name': link.attrs['title'],
            # Scraped cell text is a string; convert to int so the columns are numeric
            # (sorting and arithmetic would otherwise operate on text).
            'wins': int(row.find('td', {'data-stat': 'wins'}).text),
            'losses': int(row.find('td', {'data-stat': 'losses'}).text),
            'conference': conf
        }

        teams.append(team)

df = pd.DataFrame(teams)

In [23]:
df.head() # first five rows; they belong to conference 'E' because that table is scraped first

Unnamed: 0,slug,name,wins,losses,conference
0,WAS,Washington Wizards,10,3,E
1,CHI,Chicago Bulls,10,4,E
2,BRK,Brooklyn Nets,10,4,E
3,MIA,Miami Heat,9,5,E
4,CLE,Cleveland Cavaliers,9,6,E


In [24]:
df.tail() # last five rows; they belong to conference 'W' because that table is scraped last

Unnamed: 0,slug,name,wins,losses,conference
25,OKC,Oklahoma City Thunder,5,8,W
26,MIN,Minnesota Timberwolves,4,9,W
27,SAS,San Antonio Spurs,4,9,W
28,NOP,New Orleans Pelicans,2,13,W
29,HOU,Houston Rockets,1,13,W
