In [None]:
import requests
url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
r = requests.get(url)
html_contents = r.text
html_contents

Soup, Rich and Green A nd finally, it becomes clear why we’ve been referring to messy HTML pages as a “soup.” The Beautiful Soup library was named after a Lewis Carroll poem bearing the same name from “Alice’s Adventures in Wonderland.” In the tale, the poem is sung by a character called the “Mock Turtle” and goes as follows: “Beautiful Soup, so rich and green,// Waiting in a hot tureen!// Who for such dainties would not stoop?// Soup of the evening, beautiful Soup!”. Just like in the story, Beautiful Soup tries to organize complexity: it helps to parse, structure and organize the oftentimes very messy web by fixing bad HTML and presenting us with an easy-to-work-with Python structure.

#### installing Beautiful Soup is easy with pip
pip install -U beautifulsoup4

In [5]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
r = requests.get(url)
html_contents = r.text
html_soup = BeautifulSoup(html_contents)
html_soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of Game of Thrones episodes - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Game_of_Thrones_episodes","wgTitle":"List of Game of Thrones episodes","wgCurRevisionId":917364183,"wgRevisionId":802553687,"wgArticleId":31120069,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: deprecated parameters","Articles containing potentially dated statements from August 2017","All articles containing potentially dated statements","Official website not in Wikidata","Featured lists","Game of Thrones episodes","Lists of American drama television series episodes","Lists of fantasy television series episodes"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSepar

The Beautiful Soup library itself depends on an HTML parser to perform most of the bulk parsing work. In Python, multiple parsers exist to do so:

* “html.parser”: a built-in Python parser that is decent and requires no extra installation. (Python Default Parser)
* “lxml”: which is very fast but requires an extra installation.
* “html5lib”: which aims to parse web page in exactly the same way as a web browser does, but is a bit slower.

In [6]:
# Explicitly defining parser
html_soup = BeautifulSoup(html_contents, 'html.parser')

###### Beautiful Soup’s main task is to take HTML content and transform it into a tree-based representation

Once you’ve created a BeautifulSoup object, there are two methods you’ll be using to fetch data from the page:
* find(name, attrs, recursive, string, **keywords)
* find_all(name, attrs, recursive, string, limit, **keywords)

In [7]:
print(html_soup.find('h1'))

<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>


In [8]:
print(html_soup.find('', {'id': 'p-logo'}))

<div id="p-logo" role="banner"><a class="mw-wiki-logo" href="/wiki/Main_Page" title="Visit the main page"></a></div>


In [9]:
for found in html_soup.find_all(['h1', 'h2']):
    print(found)

<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>
<h2>Contents</h2>
<h2><span class="mw-headline" id="Series_overview">Series overview</span></h2>
<h2><span class="mw-headline" id="Episodes">Episodes</span></h2>
<h2><span class="mw-headline" id="Home_media_releases">Home media releases</span></h2>
<h2><span class="mw-headline" id="Ratings">Ratings</span></h2>
<h2><span class="mw-headline" id="References">References</span></h2>
<h2><span class="mw-headline" id="External_links">External links</span></h2>
<h2>Navigation menu</h2>


In [10]:
#Same as above
for found in html_soup.findAll(['h1', 'h2']):
    print(found)

<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>
<h2>Contents</h2>
<h2><span class="mw-headline" id="Series_overview">Series overview</span></h2>
<h2><span class="mw-headline" id="Episodes">Episodes</span></h2>
<h2><span class="mw-headline" id="Home_media_releases">Home media releases</span></h2>
<h2><span class="mw-headline" id="Ratings">Ratings</span></h2>
<h2><span class="mw-headline" id="References">References</span></h2>
<h2><span class="mw-headline" id="External_links">External links</span></h2>
<h2>Navigation menu</h2>


##### Both find and find_all return Tag objects

In [11]:
import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
r = requests.get(url)
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser')

In [12]:
# Find the first h1 tag
first_h1 = html_soup.find('h1')

In [21]:
print(first_h1.name)
print(first_h1.contents)
print(str(first_h1))

print(first_h1.text)
print(first_h1.get_text())

print(first_h1.attrs)

print(first_h1.attrs['id'])
print(first_h1['id'])
print(first_h1.get('id'))

h1
['List of ', <i>Game of Thrones</i>, ' episodes']
<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>
List of Game of Thrones episodes
List of Game of Thrones episodes
{'id': 'firstHeading', 'class': ['firstHeading'], 'lang': 'en'}
firstHeading
firstHeading
firstHeading


In [22]:
cites = html_soup.find_all('cite', class_='citation', limit=5)

In [23]:
for citation in cites:
    print(citation.get_text())
    # Inside of this cite element, find the first a tag
    link = citation.find('a')
    print(link.get('href'))
    print()

Fowler, Matt (April 8, 2011). "Game of Thrones: "Winter is Coming" Review". IGN. Archived from the original on August 17, 2012. Retrieved September 22, 2016.
http://tv.ign.com/articles/116/1160215p1.html

Fleming, Michael (January 16, 2007). "HBO turns Fire into fantasy series". Variety. Archived from the original on May 16, 2012. Retrieved September 3, 2016.
http://www.variety.com/article/VR1117957532.html?categoryid=14&cs=1

"Game of Thrones". Emmys.com. Retrieved September 17, 2016.
http://www.emmys.com/shows/game-thrones

Roberts, Josh (April 1, 2012). "Where HBO's hit 'Game of Thrones' was filmed". USA Today. Archived from the original on April 1, 2012. Retrieved March 8, 2013.
https://web.archive.org/web/20120401123724/http://travel.usatoday.com/destinations/story/2012-04-01/Where-the-HBO-hit-Game-of-Thrones-was-filmed/53876876/1

Schwartz, Terri (January 28, 2013). "'Game of Thrones' casts a bear and shoots in Los Angeles for major Season 3 scene". Zap2it. Archived from the orig

In [None]:
#Both are same in BeatifulSoup
tag.find('div').find('table').find('thead').find('tr')
tag.div.table.thead.tr

#Both are same in BeatifulSoup
tag.find_all('h1')
tag('h1')

In [29]:
import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/w/index.php/?title=List_of_Game_of_Thrones_episodes&oldid=802553687'
r = requests.get(url)
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser')
# We'll use a list to store our episode list
episodes = []
ep_tables = html_soup.find_all('table', class_='wikitable plainrowheaders wikiepisodetable')

for table in ep_tables:
    headers = []
    rows = table.find_all('tr')
    # Start by fetching the header cells from the first row to determine the field names
    for header in table.find('tr').find_all('th'):
        headers.append(header.text)
        # Then go through all the rows except the first one
        for row in table.find_all('tr')[1:]:
            values = []
            # And get the column cells, the first one being inside a th-tag
            for col in row.find_all(['th','td']):
                values.append(col.text)
                if values:
                    episode_dict = {headers[i]: values[i] for i in range(len(values))}
                    episodes.append(episode_dict)

# Show the results
for episode in episodes:
    print(episode)

['1']
['1', '1']


IndexError: list index out of range