In [1]:
# Import Libraries

import requests
from bs4 import BeautifulSoup

## Scrape Google Homepage For "About" Link

In [2]:
# Request the Google homepage and keep the response object.
# An explicit timeout is passed because requests does not time out on its
# own; without it this cell could hang forever on an unresponsive server.
result = requests.get("https://www.google.com/", timeout=10)

# Make sure website is being accessed (200 = good, 404 = bad)
print(result.status_code)

200


In [4]:
# Look at the HTTP response headers to confirm the expected page was served
response_headers = result.headers
print(response_headers)

{'Date': 'Fri, 14 Jun 2019 20:05:16 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2019-06-14-20; expires=Sun, 14-Jul-2019 20:05:16 GMT; path=/; domain=.google.com, NID=185=VZyRYyO9Q1SbF5pxx9GrEuMBI49TYEGscHE-yZ5YKQnGne9Aa6sBlEnZ43gG3XkJwZzsccqoluAwbeaD9siCojBwHNAMdcXIaBZOF6tifLMAsWXbwe1d4QOELDUXQle39slmvto5hHPIvUGU6l1ckPM5cEQIapeo4PQo06d-78E; expires=Sat, 14-Dec-2019 20:05:16 GMT; path=/; domain=.google.com; HttpOnly', 'Alt-Svc': 'quic=":443"; ma=2592000; v="46,44,43,39"', 'Transfer-Encoding': 'chunked'}

In [7]:
# Pull the body of the response out of `result` and keep it in `src`
# so it can be handed to the HTML parser in a later cell
src = result.content

In [9]:
# Parse the downloaded page source with BeautifulSoup, using the lxml parser.
# The resulting `soup` object is what the following cells search through.
soup = BeautifulSoup(markup=src, features="lxml")

In [12]:
# Collect every anchor ("a") element on the page; these are the page's links
links = soup.find_all(name="a")
print(links)

[<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>, <a class="gb1" href="https://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a>, <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>, <a class="gb1" href="https://www.youtube.com/?gl=US&amp;tab=w1">YouTube</a>, <a class="gb1" href="https://news.google.com/nwshp?hl=en&amp;tab=wn">News</a>, <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>, <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>, <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>, <a class="gb4" href="http://www.google.com/history/optout?hl=en">Web History</a>, <a class="gb4" href="/preferences?hl=en">Settings</a>, <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;passive=true&amp;continue=https://www.google.com/" id="gb_70" target="_top">Sign in</a>, <a href="/advanced_search?hl=en&amp;authuser=0">Advanc

In [13]:
# Extract certain desired links (links with the word "About" in this example)
# Loop through all links in var "links"
for link in links:
    if "About" in link.text:  # See if "About" is in the visible link text
        print(link)           # Print the whole tag for the matching link
        # .get() returns None when the anchor has no href attribute,
        # whereas link.attrs["href"] would raise KeyError and stop the loop
        print(link.get("href"))
    

<a href="/intl/en/about.html">About Google</a>
/intl/en/about.html


## Scrape White House Briefings & Statements Page for Links

In [14]:
# Fetch the Briefings & Statements page. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
result = requests.get("https://www.whitehouse.gov/briefings-statements/", timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page
result.raise_for_status()
# Store content from that webpage
src = result.content
# Process content with BeautifulSoup module
soup = BeautifulSoup(src, 'lxml')

In [15]:
# Create list to store links we find
urls = []

# The links of interest on this page sit inside "h2" headings
for h2_tag in soup.find_all("h2"):  # visit every h2 heading
    # First 'a' tag inside the heading that actually carries an href
    a_tag = h2_tag.find('a', href=True)
    if a_tag is None:
        # Heading without a usable link: skip it instead of failing on None
        continue
    urls.append(a_tag["href"])  # keep the link target

# Make sure list is populated
urls

['https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-working-improve-health-insurance-coverage-american-workers-help-small-businesses/',
 'https://www.whitehouse.gov/briefings-statements/new-annual-data-released-white-house-office-national-drug-control-policy-shows-poppy-cultivation-potential-heroin-production-remain-record-high-levels-mexico/',
 'https://www.whitehouse.gov/briefings-statements/statement-press-secretary-visit-prime-minister-justin-trudeau-canada/',
 'https://www.whitehouse.gov/briefings-statements/presidential-message-244th-birthday-united-states-army/',
 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-second-chance-hiring/',
 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-working-lunch-governors-workforce-freedom-mobility/',
 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-press-gaggle-5/',
 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-penc

## Create and Use BeautifulSoup Objects to Extract Content

#### Create a "webpage"

In [17]:
# Create basic/fake HTML site to use for this exercise
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; their names:
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<b class="boldest">Extremely bold</b>
<blockquote class="boldest">Extremely bold</blockquote>
<b id="1">Test 1</b>
<b another-attribute="1" id="verybold">Test 2</b>
"""

# Save file ("publish" webpage).
# The encoding is stated explicitly so the file on disk does not depend on
# the platform's default text encoding.
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(html_doc)

#### View HTML for created "webpage"

In [18]:
# Parse the sample document into a BeautifulSoup object
soup = BeautifulSoup(html_doc, features="lxml")

# prettify() returns the parsed document as indented HTML text; print it
pretty_html = soup.prettify()
print(pretty_html)

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; their names:
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
  <b class="boldest">
   Extremely bold
  </b>
  <blockquote class="boldest">
   Extremely bold
  </blockquote>
  <b id="1">
   Test 1
  </b>
  <b another-attribute="1" id="verybold">
   Test 2
  </b>
 </body>
</html>


#### Find specific tags

In [19]:
# Find 1st occurrence of bold content (HTML 'b' tag) via attribute access
first_bold = soup.b
print(first_bold)

<b>The Dormouse's story</b>


In [21]:
# Alternative way to find the 1st occurrence of an HTML 'b' tag: find()
first_bold = soup.find(name="b")
print(first_bold)

<b>The Dormouse's story</b>


In [20]:
# Find 1st occurrence of paragraph content (HTML 'p' tag)
first_paragraph = soup.p
print(first_paragraph)

<p class="title"><b>The Dormouse's story</b></p>


In [22]:
# Find all occurrences of bold content (HTML 'b' tags)
bold_tags = soup.find_all(name="b")
print(bold_tags)

[<b>The Dormouse's story</b>, <b class="boldest">Extremely bold</b>, <b id="1">Test 1</b>, <b another-attribute="1" id="verybold">Test 2</b>]


In [23]:
# Show the tag name of the 1st bold ('b') element
first_bold = soup.b
print(first_bold.name)

b


In [24]:
# Alter the name of 1st 'b' tag
# Note: this changes `soup` itself, so the renamed element is no longer
# matched by later searches for 'b' tags

# Grab the first 'b' element and show it before the change
tag = soup.b
print(tag)

# Assigning to .name rewrites the element's tag name; print to show the result
tag.name = "blockquote"
print(tag)

<b>The Dormouse's story</b>
<blockquote>The Dormouse's story</blockquote>


#### Find attributes

In [39]:
# Select the 'b' tag whose id is "1".
# Looking it up by attribute rather than by list position gives the same tag
# whether or not the first 'b' was already renamed by an earlier cell.
tag = soup.find('b', id='1')
print(tag)

<b id="1">Test 1</b>


In [40]:
# Tag attributes are read with dictionary-style indexing; show the id
tag_id = tag['id']
print(tag_id)

1


In [42]:
# Access another tag: the 'b' element whose id is "verybold".
# Selecting by id avoids depending on how many 'b' tags precede it, which
# changes when an earlier cell renames one of them.
tag = soup.find('b', id='verybold')
print(tag)

<b another-attribute="1" id="verybold">Test 2</b>


In [32]:
# Read this tag's id attribute the same way as before and show it
tag_id = tag['id']
print(tag_id)

verybold


In [33]:
# Any attribute can be read by name; show 'another-attribute'
other_value = tag['another-attribute']
print(other_value)

1


In [41]:
# .attrs holds all of the tag's attributes together
all_attributes = tag.attrs

# Show the tag itself, then its attributes
print(tag)
print(all_attributes)

<b another-attribute="1" id="verybold">Test 2</b>
{'another-attribute': '1', 'id': 'verybold'}


In [43]:
# Change values of attributes
# The tag is printed before and after the assignment so the change is visible
print(tag)
tag['another-attribute'] = 2 # Overwrite the existing 'another-attribute' value with 2
print(tag)

<b another-attribute="1" id="verybold">Test 2</b>
<b another-attribute="2" id="verybold">Test 2</b>


In [44]:
# Delete an attribute from a tag
# The tag is printed before and after the deletion so the change is visible
print(tag)
del tag['id'] # Remove the 'id' attribute from this tag
print(tag)

<b another-attribute="2" id="verybold">Test 2</b>
<b another-attribute="2">Test 2</b>


In [45]:
# Delete another attribute from a tag
# The tag is printed before and after the deletion so the change is visible
print(tag)
del tag['another-attribute'] # Remove the 'another-attribute' attribute from this tag
print(tag)

<b another-attribute="2">Test 2</b>
<b>Test 2</b>


#### Navigate strings

In [47]:
# Pick the tag to work with: the third 'b' element currently in the soup
bold_tags = soup.find_all('b')
tag = bold_tags[2]

# Show the whole tag, then only the text between its opening and closing tags
print(tag)
print(tag.string)

<b>Test 2</b>
Test 2


In [49]:
# Swap the text between the tags for a new string
current_text = tag.string
current_text.replace_with("This is another string")
print(tag)

<b>This is another string</b>
