In [1]:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import os
from bs4 import BeautifulSoup

In [2]:
# Open a session on a remote Selenium hub, requesting Chrome capabilities.
# The hub address can be overridden with the SELENIUM_REMOTE_URL environment
# variable; when it is unset, the original hard-coded address is used, so the
# notebook behaves exactly as before on the machine it was written on.
SELENIUM_REMOTE_URL = os.environ.get(
    'SELENIUM_REMOTE_URL', 'http://172.17.0.2:4444/wd/hub'
)
driver = webdriver.Remote(
    command_executor=SELENIUM_REMOTE_URL,
    desired_capabilities=DesiredCapabilities.CHROME
)

In [3]:
# Navigate the remote browser to python.org, then keep the page source it
# reports so it can be parsed below.
PYTHON_ORG_URL = 'http://python.org'
driver.get(PYTHON_ORG_URL)
html_doc = driver.page_source

In [4]:
# Build the parse tree from the captured page source using the lxml parser.
soup = BeautifulSoup(markup=html_doc, features='lxml')

In [5]:
# Render the parsed document as an indented string and print it.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!-->
<html class="js no-touch geolocation fontface generatedcontent svg formvalidation placeholder boxsizing no-retina flexslide" dir="ltr" lang="en" style="" xmlns="http://www.w3.org/1999/xhtml">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <link href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js" rel="prefetch"/>
  <meta content="Python.org" name="application-name"/>
  <meta content="The official home of the Python Programming Language" name="msapplication-tooltip"/>
  <meta content="Python.org" name="apple-mobile-web-app-title"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="black" name="apple-mobile-w

### Search for Elements Inside the HTML with BeautifulSoup

In [6]:
# find() returns only the first match -- here, the first <p> tag
soup.find(name='p')

<p><strong>Notice:</strong> While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience. </p>

In [7]:
# find_all() collects every match -- here, every <a> tag.
# Print how many were found, then the matches themselves, one per line.
a_tags = soup.find_all(name='a')
print(len(a_tags), a_tags, sep='\n')

219
[<a href="#content" title="Skip to content">Skip to content</a>, <a aria-hidden="true" class="jump-link" href="#python-network" id="close-python-network">
<span aria-hidden="true" class="icon-arrow-down"><span>▼</span></span> Close
                </a>, <a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>, <a href="/psf-landing/" title="The Python Software Foundation">PSF</a>, <a href="https://docs.python.org" title="Python Documentation">Docs</a>, <a href="https://pypi.python.org/" title="Python Package Index">PyPI</a>, <a href="/jobs/" title="Python Job Board">Jobs</a>, <a href="/community/" title="Python Community">Community</a>, <a aria-hidden="true" class="jump-link" href="#top" id="python-network">
<span aria-hidden="true" class="icon-arrow-up"><span>▲</span></span> The Python Network
                </a>, <a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>, <a class="jump-to

In [8]:
# keep only the <a> tags that carry the CSS class "button"
a_tag = soup.find_all('a', attrs={'class': 'button'})
print(a_tag)

[<a class="button" href="https://www.python.org/ftp/python/3.7.0/python-3.7.0-macosx10.9.pkg">Python 3.7.0</a>, <a class="button" href="https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tar.xz">Python 3.7.0</a>, <a class="button" href="https://www.python.org/ftp/python/3.7.0/python-3.7.0.exe">Python 3.7.0</a>, <a class="button" href="/downloads/operating-systems/">View the full list of downloads</a>, <a class="button" href="http://docs.python.org/3/" style="background-color: #ffd343;">Python 3.x Docs</a>, <a class="button" href="http://docs.python.org/2/">Python 2.x Docs</a>, <a class="button prompt" data-shell-container="#dive-into-python" href="/shell/" id="start-shell">&gt;_
                        <span class="message">Launch Interactive Shell</span>
</a>, <a class="button" href="/users/membership/">Become a Member</a>, <a class="button" href="/psf/donations/">Donate to the PSF</a>]


In [9]:
# keep only the <span> tags whose id attribute is "python-status-indicator"
soup.find_all('span', id='python-status-indicator')

[<span class="python-status-indicator-default" id="python-status-indicator"></span>]

In [10]:
# combine a tag-name filter with a filter on the tag's string content
soup.find_all(name='a', string='Submit an Event')

[<a href="https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event" title="">Submit an Event</a>,
 <a href="https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event" title="">Submit an Event</a>]

### HTML Hierarchical Structure

In [11]:
# Load a small test page and parse it. Note that this rebinds `soup`, so the
# cells below operate on this document rather than on the python.org page.
TEST_PAGE_URL = 'https://hackage.haskell.org/package/HandsomeSoup-0.4.2/src/tests/test.html'
driver.get(TEST_PAGE_URL)
story = driver.page_source
soup = BeautifulSoup(story, features='lxml')

In [12]:
# Print the parsed test page as markup (no re-indentation).
story_markup = str(soup)
print(story_markup)

<html lang="en-US" xmlns="http://www.w3.org/1999/xhtml"><head><title>The Dormouse's story</title></head>
<body>
<h1 class="title"><b>The Dormouse's story</b></h1>
<p class="story">Once upon a time there were three little sisters; and their names were
      <strong>test</strong>
<a class="sister wut" href="http://example.com/elsie" id="link1"><strong>Elsie</strong></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p data-original="test">foo</p>
<div class="curr_lang">
<p>
Inside a div.
</p>
</div>
</body></html>


In [13]:
# Collect every tag nested inside the 'story' paragraph. Called with no
# arguments, find_all() searches recursively, so the <strong> inside the
# first link is listed as well as the link itself (descendants, not only
# direct children).
# find_all() is used in place of the legacy camelCase alias findChildren().
p = soup.find('p', class_='story').find_all()
p

[<strong>test</strong>,
 <a class="sister wut" href="http://example.com/elsie" id="link1"><strong>Elsie</strong></a>,
 <strong>Elsie</strong>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [14]:
# Get the tag that directly encloses the 'story' paragraph (its parent).
# find_parent() is used in place of the legacy camelCase alias findParent().
p = soup.find('p', class_='story').find_parent()
p

<body>
<h1 class="title"><b>The Dormouse's story</b></h1>
<p class="story">Once upon a time there were three little sisters; and their names were
      <strong>test</strong>
<a class="sister wut" href="http://example.com/elsie" id="link1"><strong>Elsie</strong></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p data-original="test">foo</p>
<div class="curr_lang">
<p>
Inside a div.
</p>
</div>
</body>

In [15]:
# Take the first link, then list the tags that follow it at the same level
# of the tree. Only tags are returned; the text between the links is skipped.
# find_next_siblings() is used in place of the legacy camelCase alias
# findNextSiblings().
a = soup.find('a')
sib = a.find_next_siblings()
print(a)
print(sib)

<a class="sister wut" href="http://example.com/elsie" id="link1"><strong>Elsie</strong></a>
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


### Scrape for text data

In [16]:
# text of the first <h1>, with nested markup stripped
soup.find('h1').get_text()

"The Dormouse's story"

In [17]:
# text of the first <p>: the text of all nested tags is concatenated
soup.find('p').get_text()

'Once upon a time there were three little sisters; and their names were\n      test\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

In [18]:
# text of the first <a>
soup.find('a').get_text()

'Elsie'

### Scrape links

In [19]:
# Tag attributes are read with dictionary-style indexing; print the id and
# the target URL of every link on the page.
link_line = 'id: {0}, url: {1}'
a_tags = soup.find_all('a')
for a in a_tags:
    link_id, link_url = a['id'], a['href']
    print(link_line.format(link_id, link_url))

id: link1, url: http://example.com/elsie
id: link2, url: http://example.com/lacie
id: link3, url: http://example.com/tillie


### Scrape a Table

In [20]:
# Parse a local HTML file and print every table cell, row by row.
# The file is opened in a `with` block so the handle is closed
# deterministically once parsing is done, rather than relying on garbage
# collection to close it.
with open('sample.html') as sample_file:
    soup = BeautifulSoup(sample_file, 'lxml')

for tr in soup.find_all('tr'):
    for td in tr.find_all('td'):
        print(td)

<td>Row 1, Column 1</td>
<td>Row 1, Column 2</td>
<td>Row2, Column 1</td>
<td>Row2, Column 2</td>
