In [1]:
import re
import urllib
from urllib import request
from bs4 import BeautifulSoup

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""



In [4]:
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [17]:
print(soup.title)
print(soup.title.name)
print(soup.title.string)
print(soup.title.parent.name)
print(soup.a.get('href'))

<title>The Dormouse's story</title>
title
The Dormouse's story
head
http://example.com/elsie


In [15]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [16]:
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [19]:
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [22]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b
type(tag)

bs4.element.Tag

In [24]:
tag.name

'b'

In [27]:
import lxml.html
broken_html = '<ul class=country><li>Area<li>Population</ul>'
tree = lxml.html.fromstring(broken_html)
fixed_html = lxml.html.tostring(tree, pretty_print=True)
print(fixed_html)

b'<ul class="country">\n<li>Area</li>\n<li>Population</li>\n</ul>\n'


In [29]:
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format',
          'postal_code_regex', 'languages', 'neighbours')

In [30]:
import re
def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
    return results
    

In [33]:
def bs_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[fiield] = soup.find('table').find('tr', id='places_%s__row'%field).find('td', class_='w2p_fw').text
    return results


In [35]:
def lxml_scraper(html):
    tree - lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('table > tr#places_%s__row > td.w2p_fw'%field)[0]/text_content()
    return results

In [36]:
import time 
NUM_ITERATIONS = 1000

In [53]:
wiki = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"
page = request.urlopen(wiki).read().decode('utf-8')

In [54]:
soup = BeautifulSoup(page, 'lxml')

In [56]:
soup.title

<title>List of state and union territory capitals in India - Wikipedia</title>

In [58]:
soup.title.string

'List of state and union territory capitals in India - Wikipedia'

In [59]:
soup.a

<a id="top"></a>

In [60]:
soup.a

<a id="top"></a>

In [62]:
soup.find_all("a")

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Featured_lists" title="This is a featured list. Click here for more information."><img alt="This is a featured list. Click here for more information." data-file-height="438" data-file-width="462" height="19" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/20px-Cscr-featured.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/30px-Cscr-featured.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/40px-Cscr-featured.svg.png 2x" width="20"/></a>,
 <a href="#mw-head">navigation</a>,
 <a href="#p-search">search</a>,
 <a href="/wiki/States_and_union_territories_of_India" title="States and union territories of India">States and union<br/>
 territories of India</a>,
 <a class="image" href="/wiki/File:Flag_of_India.svg"><img alt="Flag of India.svg" data-file-height="900" data-file-width="1350" height="47" src="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg

In [63]:
all_links = soup.find_all("a")
for link in all_links:
    print(link.get("href"))

None
/wiki/Wikipedia:Featured_lists
#mw-head
#p-search
/wiki/States_and_union_territories_of_India
/wiki/File:Flag_of_India.svg
/wiki/List_of_states_and_territories_of_India_by_area
/wiki/List_of_states_and_union_territories_of_India_by_population
/wiki/ISO_3166-2:IN
None
/wiki/List_of_Indian_states_by_Child_Nutrition
/wiki/Indian_states_and_territories_ranking_by_crime_rate
/wiki/Indian_states_ranked_by_economic_freedom
/wiki/Indian_states_ranking_by_households_having_electricity
/wiki/Indian_states_ranking_by_fertility_rate
/wiki/Forest_cover_by_state_in_India
/wiki/List_of_Indian_states_and_union_territories_by_GDP
/wiki/List_of_Indian_states_by_GDP_per_capita
/wiki/List_of_Indian_states_and_territories_by_highest_point
/wiki/Indian_states_ranked_by_HIV_awareness
/wiki/List_of_Indian_states_and_territories_by_Human_Development_Index
/wiki/Indian_states_ranking_by_families_owning_house
/wiki/Indian_states_ranking_by_household_size
/wiki/Indian_states_and_territories_ranked_by_inciden

In [64]:
all_tables = soup.find_all('table')