# Software Coaching for Python
# Week 2: Advanced Web Scraping Using Selenium

Instructor: Kang-Pyo Lee 

Software Installation Instructions: https://docs.google.com/document/d/1q1NnHL_YsRcxUwdO2fjTwbTZAJeow_bxuBMFtSdn_lc/edit?usp=sharing

In [1]:
# One-time setup: uncomment and run the line below to install/upgrade the packages used in this notebook.
# ! pip install --user --upgrade bs4 selenium==3.14.0

In [2]:
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



Target page (SEC EDGAR "Latest Filings"): https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent

In [3]:
# Fetch the EDGAR "latest filings" page with plain `requests`.
# NOTE: the saved output below shows SEC answering with a
# "Request Rate Threshold Exceeded" page rather than the filings list.
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent"
# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
r = requests.get(url, timeout=30)
r.content

b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<title>SEC.gov | Request Rate Threshold Exceeded</title>\n<style>\nhtml {height: 100%}\nbody {height: 100%; margin:0; padding:0;}\n#header {background-color:#003968; color:#fff; padding:15px 20px 10px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px; border-bottom:solid 5px #000;}\n#footer {background-color:#003968; color:#fff; padding:15px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px;}\n#content {max-width:650px;margin:60px auto; padding:0 20px 100px 20px; background-image:url(seal_bw.png);background-repeat:no-repeat;background-position:50% 100%;}\nh1 {font-family:Georgia, Times, serif; font-size:20px;}\nh2 {text-align:center; font-family:Georgia, Times, serif; font-size:20px; width:100%; border-bottom:solid #9

## Automate web scraping using Selenium

In [4]:
# Location of the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at your own download without editing this cell; otherwise
# the path below is used.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\kangplee\\Downloads\\chromedriver.exe",   # for Windows users
    # "/Users/kangplee/Downloads/chromedriver",           # for Mac users
)
driver = webdriver.Chrome(CHROMEDRIVER_PATH)   # opens a Chrome window controlled by Selenium

You should see the Chrome browser automatically pop up. 

In [5]:
# Same EDGAR "latest filings" address that was requested with `requests` earlier;
# this time it will be loaded through the Selenium-driven browser.
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent"

In [6]:
# Navigate the controlled browser window to the page stored in `url`.
driver.get(url)

In [7]:
# Grab the root <html> element from the live browser and hand its markup to
# BeautifulSoup, so the page can be parsed the same way as a `requests` response.
# `find_element(By.TAG_NAME, ...)` is used instead of the legacy
# `find_element_by_tag_name` helper, which newer Selenium releases no longer provide.
element = driver.find_element(By.TAG_NAME, "html")
html = element.get_attribute("outerHTML")
soup = BeautifulSoup(html, "html.parser")

In [8]:
# Display the parsed document to confirm the browser delivered the real page.
soup

<html lang="ENG"><head>
<title>Latest EDGAR Filings</title>
<!-- BEGIN HEADER -->
<script async="" id="www-widgetapi-script" src="https://www.youtube.com/s/player/26b082a8/www-widgetapi.vflset/www-widgetapi.js" type="text/javascript"></script><script async="" src="//www.youtube.com/iframe_api"></script><script async="" src="https://www.google-analytics.com/analytics.js" type="text/javascript"></script><script async="" src="//www.googletagmanager.com/gtm.js?id=GTM-TD3BKV"></script><script async="" src="//www.google-analytics.com/analytics.js"></script><script language="JavaScript" src="/include/sec.js" type="text/javascript"></script><script async="true" data-role="gateway" data-vendor="fs" src="//gateway.foresee.com/sites/sec-gov/production/gateway.min.js" type="text/javascript"></script><link href="/include/others.css" rel="STYLESHEET"/><script src="https://search.usa.gov/javascripts/remote.loader.js" type="text/javascript"></script><script async="" id="_fed_an_ua_tag" src="https://da

In [9]:
# List every <table> element in the page to locate the one that holds the filings.
soup.find_all("table")

[<table bgcolor="white" border="0" cellpadding="0" cellspacing="0" summary="" width="100%">
 <tbody><tr>
 <td rowspan="2" width="95"><a href="/index.htm">
 <img alt="U.S. Securities &amp; Exchange Commission" border="0" height="92" src="/images/bannerSeal.gif" width="95"/></a></td>
 <td colspan="2"><img alt="" height="7" src="/images/pixel.gif" width="1"/></td>
 </tr>
 <tr>
 <td width="26"><img alt="SEC Seal" border="0" height="85" src="/images/bannerSealR.gif" width="26"/></td>
 <td valign="top" width="100%">
 <table border="0" cellpadding="0" cellspacing="0" summary="" width="100%">
 <tbody><tr>
 <td bgcolor="#324395" valign="top" width="171"><img alt="" border="0" height="30" src="/images/bannerFlag.gif" width="171"/></td>
 <td align="right" bgcolor="#324395" class="gray" nowrap="nowrap" valign="middle" width="95%">
 <a href="/index.htm">Home</a> | <a href="/edgar/searchedgar/webusers.htm">EDGAR Search Home</a> |
  <a href="/cgi-bin/browse-edgar?action=getcurrent">Latest Filings</a>

In [10]:
# Count the tables so a specific one can be picked by index.
len(soup.find_all("table"))

8

In [11]:
# Index 6 is the filings table: its header row reads
# Form / Formats / Description / Accepted / Filing Date / File/Film No.
soup.find_all("table")[6]

<table summary="">
<tbody><tr bgcolor="#D6D6D6">
<th align="left" nowrap="nowrap" width="4%">Form</th>
<th align="left" nowrap="nowrap" width="4%">Formats</th>
<th align="left" width="40%">Description</th>
<th align="left" nowrap="nowrap" width="4%">Accepted</th>
<th align="left" nowrap="nowrap" width="4%">Filing Date</th>
<th align="left" nowrap="nowrap" width="4%">File/Film No</th>
</tr>
<tr><td>
</td><td>
</td><td align="left" bgcolor="#E6E6E6" valign="top"><a href="/cgi-bin/browse-edgar?action=getcompany&amp;CIK=0001789879&amp;owner=include&amp;count=40">Posawatz Anthony L (0001789879) (Reporting)</a></td>
</tr>
<tr align="left" nowrap="nowrap" valign="top">
<td nowrap="nowrap">4</td>
<td nowrap="nowrap"><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015-index.htm">[html]</a><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015.txt">[text]</a></td>
<td class="small">Statement of changes in beneficial ownership of securities<br/>Acc

In [12]:
# Break the filings table into its rows (<tr> elements); the first row is the header.
soup.find_all("table")[6].find_all("tr")

[<tr bgcolor="#D6D6D6">
 <th align="left" nowrap="nowrap" width="4%">Form</th>
 <th align="left" nowrap="nowrap" width="4%">Formats</th>
 <th align="left" width="40%">Description</th>
 <th align="left" nowrap="nowrap" width="4%">Accepted</th>
 <th align="left" nowrap="nowrap" width="4%">Filing Date</th>
 <th align="left" nowrap="nowrap" width="4%">File/Film No</th>
 </tr>, <tr><td>
 </td><td>
 </td><td align="left" bgcolor="#E6E6E6" valign="top"><a href="/cgi-bin/browse-edgar?action=getcompany&amp;CIK=0001789879&amp;owner=include&amp;count=40">Posawatz Anthony L (0001789879) (Reporting)</a></td>
 </tr>, <tr align="left" nowrap="nowrap" valign="top">
 <td nowrap="nowrap">4</td>
 <td nowrap="nowrap"><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015-index.htm">[html]</a><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015.txt">[text]</a></td>
 <td class="small">Statement of changes in beneficial ownership of securities<br/>Accession Num

In [13]:
# Slice off row 0 (the <th> header row) so only data rows remain.
soup.find_all("table")[6].find_all("tr")[1:]

[<tr><td>
 </td><td>
 </td><td align="left" bgcolor="#E6E6E6" valign="top"><a href="/cgi-bin/browse-edgar?action=getcompany&amp;CIK=0001789879&amp;owner=include&amp;count=40">Posawatz Anthony L (0001789879) (Reporting)</a></td>
 </tr>, <tr align="left" nowrap="nowrap" valign="top">
 <td nowrap="nowrap">4</td>
 <td nowrap="nowrap"><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015-index.htm">[html]</a><a href="/Archives/edgar/data/1789879/000120332021000015/0001203320-21-000015.txt">[text]</a></td>
 <td class="small">Statement of changes in beneficial ownership of securities<br/>Accession Number: 0001203320-21-000015  Act: 34  Size: 4 KB
 </td>
 <td nowrap="nowrap">2021-10-22<br/>21:58:40</td>
 <td nowrap="nowrap">2021-10-22</td><td align="left" nowrap="nowrap"><a href="/cgi-bin/browse-edgar?action=getcompany&amp;filenum=001-38868&amp;owner=include&amp;count=40">001-38868</a>
 <br/>211342018</td></tr>, <tr><td>
 </td><td>
 </td><td align="left" bgcolor="#E6E6E

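Example of a full company link, built by prefixing the relative `href` from the table with `https://www.sec.gov`: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001789879&owner=include&count=40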

In [14]:
# Print the name and absolute link of every entity in the filings table.
# In the rows shown above, the third cell of an entity row holds the <a> link,
# while the third cell of a filing-detail row is plain description text (no link).
for tr in soup.find_all("table")[6].find_all("tr")[1:]:
    cells = tr.find_all("td")
    if len(cells) < 3:
        # Row does not have a third cell to inspect; skip it instead of raising IndexError.
        continue

    a = cells[2].find("a")

    if a is not None:
        title = a.text
        # The hrefs in the table are site-relative, so prefix the host.
        # Stored under its own name so the notebook-level `url` (the listing page
        # used by `driver.get(url)`) is not overwritten by this loop.
        company_url = "https://www.sec.gov" + a["href"]

        print(title, company_url)

Posawatz Anthony L (0001789879) (Reporting) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001789879&owner=include&count=40
Beam Global (0001398805) (Issuer) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001398805&owner=include&count=40
Smurfit Dermot S. (0001812148) (Reporting) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001812148&owner=include&count=40
GAN Ltd (0001799332) (Issuer) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001799332&owner=include&count=40
Floyd Nancy C (0001522601) (Reporting) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001522601&owner=include&count=40
Beam Global (0001398805) (Issuer) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001398805&owner=include&count=40
SCHAFFER DAVID (0001692929) (Reporting) https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001692929&owner=include&count=40
4D Molecular Therapeutics Inc. (0001650648) (Issuer) http

In [15]:
# Click a company link in the live browser, located by its exact visible text.
# The name below was taken from the listing printed above; this page shows the
# latest filings, so on a re-run substitute a name that is currently listed.
# `find_element(By.LINK_TEXT, ...)` replaces the legacy `find_element_by_link_text`
# helper, which newer Selenium releases no longer provide.
link = driver.find_element(By.LINK_TEXT, "Posawatz Anthony L (0001789879) (Reporting)")
link.click()

In [16]:
# Go one step back in the browser history (like pressing the Back button).
driver.back()

In [17]:
# Go one step forward in the browser history (like pressing the Forward button).
driver.forward()

In [18]:
# End the Selenium session: `quit()` closes every window opened by the driver and
# also stops the background chromedriver process, whereas `close()` only closes
# the current window and can leave that process running.
driver.quit()