# Beautiful Soup Tutorial

- Based on the Vprusso tutorial: https://www.youtube.com/watch?v=87Gx3U0BDlo

- github: https://github.com/vprusso/youtube_tutorials/blob/master/web_scraping_and_automation/beautiful_soup/beautiful_soup_and_requests.py

In [2]:
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class of bs4

In [36]:
# Using requests, we can fetch a webpage.
# A timeout (in seconds) is passed explicitly: requests has no default
# timeout, so without one an unresponsive server would hang this cell forever.
# With it, a stalled request raises requests.exceptions.Timeout instead.

wp1 = requests.get('https://www.google.com', timeout=10)
wp2 = requests.get('https://www.clinicaltrials.gov', timeout=10)
wp3 = requests.get('https://www.whitehouse.gov/briefings-statements', timeout=10)


# Verify access to each webpage: a status code of 200 confirms access.
# List of status codes: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

print(wp1.status_code)
print(wp2.status_code)
print(wp3.status_code)

200
200
200


In [37]:
# Check that each response came from the correct page by printing the
# HTTP headers the server sent back.
# List of header fields: https://en.wikipedia.org/wiki/List_of_HTTP_header_fields

for response in (wp1, wp2):  # add wp3 to this tuple to print its headers too
    print(response.headers)

{'Date': 'Tue, 03 Mar 2020 03:07:32 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2020-03-03-03; expires=Thu, 02-Apr-2020 03:07:32 GMT; path=/; domain=.google.com; Secure, NID=199=ijbeFXY8eUyZt-JLawl_R0GfWK_uOs5nSdKdbkhuRV9Hp9gftEwMAjCt0lpnXsZetXetHx0y_oxkq_QkzU-4GiRw-V3qfUhM9rX3VOb-sYsm3MDN2jIEncfxhvjHdiwHePpbwnHCLfzeqdUC1MKhWc5lg8M42kNfhc4AJUoAB9w; expires=Wed, 02-Sep-2020 03:07:32 GMT; path=/; domain=.google.com; HttpOnly', 'Alt-Svc': 'quic=":443"; ma=2592000; v="46,43",h3-Q050=":443"; ma=2592000,h3-Q049=":443"; ma=2592000,h3-Q048=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'}
{'Date': 'Tue, 03 Mar 2020 03:07:32 GMT', 'Server': 'Apache', 'Strict-Transport-Security':

###  Create BeautifulSoup Object

In [38]:
# Take the content of each fetched page, then build a BeautifulSoup object
# from it (parsed with 'lxml') so the HTML can be searched.

# content of www.google.com, www.clinicaltrials.gov and www.whitehouse.gov
src1, src2, src3 = wp1.content, wp2.content, wp3.content

# one soup object per page, in the same order as above
bs1, bs2, bs3 = [BeautifulSoup(raw, 'lxml') for raw in (src1, src2, src3)]

# Uncomment any of these to dump the parsed page:
#print(bs1)
#print(bs2)
#print(bs3)

### Links on webpage

In [39]:
# Collect every link on each page.
# find_all is a BeautifulSoup method; find_all('a') returns all the 'a' tags,
# and 'a' tags are the links on the page.
# An a-tag is everything from the opening <a ...> through the closing </a>.

bs1links, bs2links, bs3links = (soup.find_all('a') for soup in (bs1, bs2, bs3))

# Uncomment to print all the a-tags (links on the page):
#print(bs1links)
#print(bs2links)
#print(bs3links)
#print('\n')  # prints a blank line to visually separate the output

In [40]:
# Access links of interest from google.com.
# Prints the link information whenever "About" appears in the link's text.
# 'href' is one attribute of the a-tag (there are other kinds); it holds the
# URL the link points to.

for link in bs1links:
    if "About" in link.text:
        print(link)
        # .get() returns None for an a-tag that has no href attribute,
        # whereas link.attrs['href'] would raise KeyError and stop the loop.
        print(link.get('href'))  # prints the href (URL) of the link

<a href="/intl/en/about.html">About Google</a>
/intl/en/about.html


In [41]:
# Access links of interest from clinicaltrials.gov.
# Prints the link information whenever "Studies by Topic" appears in the
# link's text.
# 'href' is one attribute of the a-tag (there are other kinds).
# A matching link looks like:
#     <a href="/ct2/search/browse?brwse=cond_cat">See Studies by Topic</a>
# and its 'href' is /ct2/search/browse?brwse=cond_cat

for link in bs2links:
    if "Studies by Topic" in link.text:
        print(link)
        # .get() returns None for an a-tag that has no href attribute,
        # whereas link.attrs['href'] would raise KeyError and stop the loop.
        print(link.get('href'))  # prints the href (URL) of the link

<a href="/ct2/search/browse?brwse=cond_cat">See Studies by Topic</a>
/ct2/search/browse?brwse=cond_cat
<a href="/ct2/search/browse?brwse=cond_cat">See Studies by Topic</a>
/ct2/search/browse?brwse=cond_cat
<a href="/ct2/search/browse?brwse=cond_cat"><span style="white-space:nowrap;">Studies by Topic</span></a>
/ct2/search/browse?brwse=cond_cat


From here the tutorial stops using google.com and switches to whitehouse.gov.


In [44]:
# Get a list of the URLs that the links of interest on a page point to.

# You need to identify the format of the links: if you right click on the list
#    and click 'inspect', you can find the header type, ie: h2, h3, etc.
# For the whitehouse.gov example, the links of interest are inside header2 (h2)
# tags. Once you specify the header type, you then find the tag of interest.

urlList = []

for h2_tag in bs3.find_all('h2'):  # similar to finding the a-tags above
    # First a-tag inside this h2 that actually has an href attribute.
    # find() returns None when there is no such tag.
    a_tag = h2_tag.find('a', href=True)

    # Not every h2 is guaranteed to wrap a link; skip those that do not
    # rather than crashing on None.
    if a_tag is None:
        continue

    # href is the attribute that gives the URL for each a-tag link
    urlList.append(a_tag['href'])  # append each URL to urlList

print(urlList)


['https://www.whitehouse.gov/briefings-statements/remarks-president-trump-marine-one-departure-85/', 'https://www.whitehouse.gov/briefings-statements/bill-announcement-82/', 'https://www.whitehouse.gov/briefings-statements/vice-president-pence-secretary-azar-add-key-administration-officials-coronavirus-task-force-3/', 'https://www.whitehouse.gov/briefings-statements/readout-vice-presidents-discussion-nations-governors-covid-19-coordination-preparedness/', 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-american-israel-public-affairs-committee-policy-conference/', 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-president-duque-colombia-bilateral-meeting-2/', 'https://www.whitehouse.gov/briefings-statements/presidential-message-read-across-america-day-2020/', 'https://www.whitehouse.gov/briefings-statements/readout-white-house-coronavrius-task-force-meeting/', 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-2

In [51]:
# Same pattern as above, except applied to the clinicaltrials.gov page.
# NOTE(review): this does not collect every link on the page. The loop runs
# once per 'body' tag, and find('a') returns only the first a-tag inside it,
# so urlList ends up holding a single URL (see the output below).
# To gather all the links, iterate over bs2.find_all('a') instead.

urlList = []

for body_tag in bs2.find_all('body'):  # no h2, so using 'body'
    a_tag = body_tag.find('a')  # first a-tag within the body (not within an h2)
    
    # href is the attribute that gives the URL for the a-tag link
    urlList.append(a_tag.attrs['href'])  # append the URL to urlList

print(urlList)


['https://clinicaltrials.gov/ct2/manage-recs/resources#DataElement']
