In [1]:
# Let’s create an example web scraper that scrapes the page located at 
# http://www.pythonscraping.com/pages/warandpeace.html.

from urllib.request import urlopen
from bs4 import BeautifulSoup
# Open the page in a context manager so the HTTP response is closed as soon
# as its body has been read and parsed (the original left it open).
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

In [2]:
# Use find_all to build a Python list of the proper nouns on the page,
# i.e. the elements written as <span class="green">...</span>,
# then print the text inside each one.

nameList = bs.find_all('span', class_='green')
for name in nameList:
    print(name.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [3]:
# Count the strings on the example page whose entire content is exactly
# 'the prince' (i.e. the text sits directly inside a tag).
# `string=` is the current keyword for this search; the older `text=`
# spelling used previously is deprecated.

nameList = bs.find_all(string='the prince')
print(len(nameList))

7


In [4]:
# Look up the first tag whose id attribute equals 'title'.
# find() gives back None when nothing matches, which is what gets printed here.
title = bs.find(attrs={'id': 'title'})
print(title)

None


In [5]:
# To visit only the direct children of a tag (rather than all of its
# descendants), iterate over the tag's .children attribute.

# Parse while the response is open; the context manager then closes it
# (the original never closed the connection).
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')
for child in bs.find('table', {'id': 'giftList'}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [6]:
# Prints all product rows of the product table except the first (title) row.
# .next_siblings starts from the table's first <tr> and yields only what
# follows it; an object cannot be its own sibling, so the title row is omitted.

# Parse while the response is open; the context manager then closes it
# (the original never closed the connection).
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [7]:
# <tr>
    # td
    # td
    # td(3)
        # "$15.00"(4)
    # td(2)
        # <img src="../img/gifts/img1.jpg">(1)

# (1) The image tag where src="../img/gifts/img1.jpg" is selected first.
# (2) You select the parent of that tag (in this case, the td tag).
# (3) You select the previous_sibling of the td tag (in this case, the td tag that contains the dollar value of the product).
# (4) You select the text within that tag, "$15.00".

# Parse while the response is open; the context manager then closes it
# (the original never closed the connection).
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')
print(
    bs.find('img', {'src': '../img/gifts/img1.jpg'})
    .parent.previous_sibling.get_text()
)


$15.00



In [16]:
# Experimenting with RegEx
# There is a list of common regular expression symbols that can be used to find and
# collect almost any type of string.
# See "Table 2-1. Commonly used regular expression symbols", attached in the folder, to learn more.

# A classic example is using these regular expressions symbols to find email addresses
# By concatenating all of the rules, you arrive at this regular expression:

# [A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)
# Using the expression above, we can find any email address that uses the domains .com,.org,.edu, .net

# Let's use an example to find some images paths, scraping the page found at 
# http://www.pythonscraping.com/pages/page3.html

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Parse while the response is open; the context manager then closes it
# (the original never closed the connection).
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')
# The pattern is written as a raw string: '\.' and '\/' are not valid escape
# sequences in an ordinary string literal and produce an invalid-escape
# warning on newer Python versions. The compiled regex itself is unchanged.
images = bs.find_all('img',
    {'src': re.compile(r'\.\.\/img\/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])

# This prints only the relative image paths that match ../img/gifts/img<anything>.jpg


../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [21]:
# The attributes of a tag object can be accessed with:
# myTag.attrs
# This returns a Python dictionary object.
# In this case, the image attributes can be found with the following expression.
# NOTE(review): `image` is the loop variable left over from the earlier
# `for image in images:` cell, so it refers to the last image printed there;
# that cell must have been run first.

image.attrs

{'src': '../img/gifts/img6.jpg'}

In [22]:
# Using the attribute 'src', it is possible to get the location of the image.

image.attrs['src']

'../img/gifts/img6.jpg'

In [27]:
# Lambda expression: basically a function that is passed as a variable into another function.
# BeautifulSoup allows you to pass certain types of functions as parameters into the find_all function.
# Restriction: the function must take a tag object as an argument and return a boolean.

# Example: retrieve all tags that have exactly 2 attributes:

bs.find_all(lambda tag: len(tag.attrs) == 2)

# This returns all the tags with 2 attributes, such as:
# <div class="body" id="content"></div>
# <span style="color:red" class="title"></span>

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

In [28]:
# Lambda functions are flexible enough to stand in for existing
# BeautifulSoup functions.
# The call below retrieves the tags whose text is exactly
# "Or maybe he's only resting?"

bs.find_all(lambda element: element.get_text() == "Or maybe he's only resting?")

[<span class="excitingNote">Or maybe he's only resting?</span>]