In [None]:
import requests

# The URL carries an `oldid` parameter, i.e. it targets one specific revision of the article.
url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently working with an error page.
r.raise_for_status()
html_contents = r.text
# Preview only the beginning of the document; the full page is far too long to read as cell output.
html_contents[:1000]

**Soup, Rich and Green.** And finally, it becomes clear why we’ve been referring to messy HTML pages as a “soup.” The Beautiful Soup library was named after a Lewis Carroll poem bearing the same name from “Alice’s Adventures in Wonderland.” In the tale, the poem is sung by a character called the “Mock Turtle” and goes as follows: “Beautiful Soup, so rich and green,// Waiting in a hot tureen!// Who for such dainties would not stoop?// Soup of the evening, beautiful Soup!”. Just like in the story, Beautiful Soup tries to organize complexity: it helps to parse, structure and organize the oftentimes very messy web by fixing bad HTML and presenting us with an easy-to-work-with Python structure.

#### Installing Beautiful Soup is easy with pip
`pip install -U beautifulsoup4`

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
html_contents = r.text
# No parser is named here; the cell after the parser overview passes 'html.parser' explicitly.
html_soup = BeautifulSoup(html_contents)
html_soup

The Beautiful Soup library itself depends on an HTML parser to perform most of the bulk parsing work. In Python, multiple parsers exist to do so:

* “html.parser”: a built-in Python parser that is decent and requires no extra installation (it ships with Python itself).
* “lxml”: which is very fast but requires an extra installation.
* “html5lib”: which aims to parse web pages in exactly the same way as a web browser does, but is a bit slower.

In [None]:
# Build the soup again, this time naming the parser to use
html_soup = BeautifulSoup(
    html_contents,
    features='html.parser',
)

###### Beautiful Soup’s main task is to take HTML content and transform it into a tree-based representation

Once you’ve created a BeautifulSoup object, there are two methods you’ll be using to fetch data from the page:
* find(name, attrs, recursive, string, **keywords)
* find_all(name, attrs, recursive, string, limit, **keywords)

In [None]:
# Look up the first <h1> element and print it
first_heading = html_soup.find('h1')
print(first_heading)

In [None]:
# Search by attribute only: an empty tag name plus an id filter
logo_element = html_soup.find('', attrs={'id': 'p-logo'})
print(logo_element)

In [None]:
# Collect every <h1> and <h2> element, then print them one by one
headings = html_soup.find_all(['h1', 'h2'])
for heading in headings:
    print(heading)

In [None]:
# Same result as the previous cell, written with the findAll spelling
headings = html_soup.findAll(['h1', 'h2'])
for heading in headings:
    print(heading)

##### Both find and find_all return Tag objects

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser')

In [None]:
# Keep a reference to the first <h1> element for the cells that follow
first_h1 = html_soup.find(name='h1')

In [None]:
# Tag name, child nodes, and the tag rendered as a string
print(first_h1.name, first_h1.contents, str(first_h1), sep='\n')

# Two ways of asking for the text inside the tag
print(first_h1.text, first_h1.get_text(), sep='\n')

# The full attribute mapping
print(first_h1.attrs)

# Three ways of reading the id attribute
print(first_h1.attrs['id'], first_h1['id'], first_h1.get('id'), sep='\n')

In [None]:
# At most five <cite> elements carrying the "citation" class
cites = html_soup.find_all(
    'cite',
    class_='citation',
    limit=5,
)

In [None]:
# Print each citation's text followed by the target of its first link.
for citation in cites:
    print(citation.get_text())
    # Inside of this cite element, find the first a tag
    link = citation.find('a')
    # find() yields None when the citation holds no <a> tag; guard so that
    # the loop prints None instead of raising AttributeError on None.get.
    print(link.get('href') if link is not None else None)
    print()

In [None]:
# `tag` was never defined in this notebook, so the cell raised NameError on a
# fresh kernel. Build a tiny self-contained document to demonstrate on.
demo_html = (
    '<div><table><thead><tr><th>Title</th></tr></thead></table></div>'
    '<h1>Episodes</h1>'
)
tag = BeautifulSoup(demo_html, 'html.parser')

# Both are the same in Beautiful Soup: chained find() calls vs. attribute access
tag.find('div').find('table').find('thead').find('tr')
tag.div.table.thead.tr

# Both are the same in Beautiful Soup: find_all() vs. calling the tag directly
tag.find_all('h1')
tag('h1')

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser')

# We'll use a list to store our episode list
episodes = []
ep_tables = html_soup.find_all('table', class_='wikiepisodetable')

for table in ep_tables:
    # Fetch the rows once and reuse them for both the header and the data rows
    rows = table.find_all('tr')
    if not rows:
        # A table without any row has neither headers nor data to record
        continue
    # The header cells of the first row determine the field names
    headers = [header.text for header in rows[0].find_all('th')]
    # Then go through all the rows except the first one
    for row in rows[1:]:
        # Get the column cells, the first one being inside a th-tag
        values = [col.text for col in row.find_all(['th', 'td'])]
        if values:
            # zip() pairs names with values and stops at the shorter of the two,
            # so a row with more cells than headers cannot raise IndexError
            episode_dict = dict(zip(headers, values))
            episodes.append(episode_dict)

for episode in episodes:
    print(episode)

In [None]:
import re
# A compiled regular expression is passed to find() in place of a tag name.
# NOTE(review): '^h' is not limited to heading tags — names such as "html" or
# "head" also start with "h" — confirm that this broad match is intended.
html_soup.find(re.compile('^h'))

In [None]:
# CSS selector "a": every <a> tag
all_anchor_tags = html_soup.select('a')
all_anchor_tags

In [None]:
# CSS selector "#info": the element whose id is "info"
info_matches = html_soup.select('#info')
info_matches

In [None]:
# Find <div> tags with both classa and classb CSS classes.
# The selector has to be a string; the unquoted form raised NameError on `div`.
html_soup.select('div.classa.classb')

In [None]:
# <a> tags whose href attribute starts with http://example.com/
example_links = html_soup.select('a[href^="http://example.com/"]')
example_links

In [None]:
# Find <li> tags which are children of <ul> tags with class lst.
# The selector has to be a string; the unquoted form raised NameError on `ul`.
html_soup.select('ul.lst > li')

In [51]:
# Citation links of the Game of Thrones Wikipedia page: select the anchors
# that carry an href inside the references list, then print each target.
citation_links = html_soup.select('ol.references cite a[href]')
for anchor in citation_links:
    print(anchor.get('href'))

http://tv.ign.com/articles/116/1160215p1.html
/wiki/IGN
https://web.archive.org/web/20120817073932/http://tv.ign.com/articles/116/1160215p1.html
http://www.variety.com/article/VR1117957532.html?categoryid=14&cs=1
/wiki/Variety_(magazine)
https://web.archive.org/web/20120516224747/http://www.variety.com/article/VR1117957532?refCatId=14
http://www.emmys.com/shows/game-thrones
/wiki/Emmy_Award
https://web.archive.org/web/20120401123724/http://travel.usatoday.com/destinations/story/2012-04-01/Where-the-HBO-hit-Game-of-Thrones-was-filmed/53876876/1
/wiki/USA_Today
http://travel.usatoday.com/destinations/story/2012-04-01/Where-the-HBO-hit-Game-of-Thrones-was-filmed/53876876/1
https://web.archive.org/web/20131016062544/http://blog.zap2it.com/frominsidethebox/2013/01/game-of-thrones-casts-a-bear-and-shoots-in-los-angeles-for-major-season-3-scene.html
/wiki/Zap2it
http://blog.zap2it.com/frominsidethebox/2013/01/game-of-thrones-casts-a-bear-and-shoots-in-los-angeles-for-major-season-3-scene.html

In [56]:
# This will not work:
# cite a[href][rel=nofollow]

# Instead, select on href only and check the rel attribute in Python:
tags = [
    anchor
    for anchor in html_soup.select('cite a[href]')
    if 'nofollow' in anchor.get('rel', [])
]

In [55]:
# Display the links kept by the rel="nofollow" filter
tags

[<a class="external text" href="http://tv.ign.com/articles/116/1160215p1.html" rel="nofollow">"Game of Thrones: "Winter is Coming" Review"</a>,
 <a class="external text" href="https://web.archive.org/web/20120817073932/http://tv.ign.com/articles/116/1160215p1.html" rel="nofollow">Archived</a>,
 <a class="external text" href="http://www.variety.com/article/VR1117957532.html?categoryid=14&amp;cs=1" rel="nofollow">"HBO turns <i>Fire</i> into fantasy series"</a>,
 <a class="external text" href="https://web.archive.org/web/20120516224747/http://www.variety.com/article/VR1117957532?refCatId=14" rel="nofollow">Archived</a>,
 <a class="external text" href="http://www.emmys.com/shows/game-thrones" rel="nofollow">"Game of Thrones"</a>,
 <a class="external text" href="https://web.archive.org/web/20120401123724/http://travel.usatoday.com/destinations/story/2012-04-01/Where-the-HBO-hit-Game-of-Thrones-was-filmed/53876876/1" rel="nofollow">"Where HBO's hit 'Game of Thrones' was filmed"</a>,
 <a clas

In [57]:
# This will not work:
# cite a[href][rel=nofollow]:not([href*="archive.org"])

# Instead, select on href only and apply both conditions in Python:
tags = [
    anchor
    for anchor in html_soup.select('cite a[href]')
    if 'nofollow' in anchor.get('rel', [])
    and 'archive.org' not in anchor.get('href')
]

In [58]:
# Display the nofollow links whose href does not contain "archive.org"
tags

[<a class="external text" href="http://tv.ign.com/articles/116/1160215p1.html" rel="nofollow">"Game of Thrones: "Winter is Coming" Review"</a>,
 <a class="external text" href="http://www.variety.com/article/VR1117957532.html?categoryid=14&amp;cs=1" rel="nofollow">"HBO turns <i>Fire</i> into fantasy series"</a>,
 <a class="external text" href="http://www.emmys.com/shows/game-thrones" rel="nofollow">"Game of Thrones"</a>,
 <a class="external text" href="http://travel.usatoday.com/destinations/story/2012-04-01/Where-the-HBO-hit-Game-of-Thrones-was-filmed/53876876/1" rel="nofollow">the original</a>,
 <a class="external text" href="http://blog.zap2it.com/frominsidethebox/2013/01/game-of-thrones-casts-a-bear-and-shoots-in-los-angeles-for-major-season-3-scene.html" rel="nofollow">the original</a>,
 <a class="external text" href="https://www.theguardian.com/tv-and-radio/2014/jul/06/game-of-thrones-fifth-series-10000-spaniards-extras-spain" rel="nofollow">"Game of Thrones fifth series: more tha

In [60]:
# A small sample document used by the quick-guide cells
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the sample and print it with one element per line
soup = BeautifulSoup(html_doc, features='html.parser')
pretty_markup = soup.prettify()
print(pretty_markup)

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


### Beautiful Soup Quick Guide

In [64]:
# The <title> tag: the tag itself, its name, its string, and its parent's name
title_tag = soup.title
print(title_tag)
print(title_tag.name)
print(title_tag.string)
print(title_tag.parent.name)

# Attribute access gives the first <p> and the first <a>
first_paragraph = soup.p
print(first_paragraph)
first_anchor = soup.a
print(first_anchor)

# Every <a> tag, then a lookup by id
every_anchor = soup.find_all('a')
print(every_anchor)
third_link = soup.find(id="link3")
print(third_link)

<title>The Dormouse's story</title>
title
The Dormouse's story
head
<p class="title"><b>The Dormouse's story</b></p>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [67]:
# A common task: list the URL of every <a> tag on the page
anchors = soup.find_all('a')
for anchor in anchors:
    print(anchor.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
