# 6.01 HTML - WebScraping - Beautiful Soup
## Basic Beautiful Soup functions

In [1]:
from bs4 import BeautifulSoup

In [7]:
html_doc = """
<!DOCTYPE html>
<html><head><title>The Dormouse's story title tag</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a 
<a class="object" href="http://example.com/well" id="link4">well</a>.</p>

<p class="story">...</p>
</body>
</html>
"""

In [8]:
html_doc

'\n<!DOCTYPE html>\n<html><head><title>The Dormouse\'s story title tag</title></head>\n<body>\n<p class="title"><b>The Dormouse\'s story</b></p>\n\n<p class="story">Once upon a time there were three little sisters; and their names were\n<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,\n<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and\n<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;\nand they lived at the bottom of a \n<a class="object" href="http://example.com/well" id="link4">well</a>.</p>\n\n<p class="story">...</p>\n</body>\n</html>\n'

In [9]:
# parse the element
soup = BeautifulSoup(html_doc, 'html.parser')

In [10]:
# Pretty-print the parsed document: one tag per line, indented by nesting depth.
# The indentation is not always perfect (note the unindented text line in the output below).
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story title tag
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a
   <a class="object" href="http://example.com/well" id="link4">
    well
   </a>
   .
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [11]:
# basic tree navigation
soup.title

<title>The Dormouse's story title tag</title>

In [12]:
soup.title.get_text()

"The Dormouse's story title tag"

In [13]:
# extract content
print(soup.get_text())



The Dormouse's story title tag

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a 
well.
...





In [14]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [15]:
soup.p.get_text()

"The Dormouse's story"

In [16]:
soup.p["class"]

['title']

In [17]:
# get all elements of a tag
soup.select("p")

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a 
 <a class="object" href="http://example.com/well" id="link4">well</a>.</p>,
 <p class="story">...</p>]

In [18]:
soup.select("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
 <a class="object" href="http://example.com/well" id="link4">well</a>]

In [22]:
soup.select("a")[0]["href"]

'http://example.com/elsie'

In [25]:
# get links:
for link in soup.find_all('a'):
    print(link['href'])

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
http://example.com/well


In [33]:
soup.select(".sister")  # CSS class selector; find_all does not parse CSS selectors - its equivalent is find_all(class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [34]:
#getting second story paragraph
soup.select("p.story")[1].get_text()

'...'

## Activity 2

In [48]:
# Creating the soup
html_doc2 = """<!DOCTYPE html>
<html>
<head> Geography</head>
<body>

<div class="city">
  <h2>London</h2>
  <p>London is the most popular tourist destination in the world.</p>
</div>

<div class="city">
  <h2>Paris</h2>
  <p>Paris was originally a Roman City called Lutetia.</p>
</div>

<div class="country">
  <h2>Spain</h2>
  <p>Spain produces 43,8% of all the world's Olive Oil.</p>
</div>

</body>
</html>"""

In [49]:
soup = BeautifulSoup(html_doc2, 'html.parser')

In [50]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  Geography
 </head>
 <body>
  <div class="city">
   <h2>
    London
   </h2>
   <p>
    London is the most popular tourist destination in the world.
   </p>
  </div>
  <div class="city">
   <h2>
    Paris
   </h2>
   <p>
    Paris was originally a Roman City called Lutetia.
   </p>
  </div>
  <div class="country">
   <h2>
    Spain
   </h2>
   <p>
    Spain produces 43,8% of all the world's Olive Oil.
   </p>
  </div>
 </body>
</html>


In [51]:
#1
for sp in soup.select('p'):
    print(sp.text)

London is the most popular tourist destination in the world.
Paris was originally a Roman City called Lutetia.
Spain produces 43,8% of all the world's Olive Oil.


In [52]:
soup.select('h2')

[<h2>London</h2>, <h2>Paris</h2>, <h2>Spain</h2>]

In [53]:
#2
for link in soup.find_all('h2'):
    print(link.text)

London
Paris
Spain


In [59]:
#3
for txt in soup.select('div.city'):
    print(txt.get_text())


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



In [None]:
#4
# NOTE(review): this cell is identical to #3 and was never executed (it has no
# execution count); presumably #4 was meant to use a different selector -
# confirm against the activity statement.
for txt in soup.select('div.city'):
    print(txt.get_text())

## Requests

In [60]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [None]:
#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(1) > td.titleColumn > a

In [61]:
# 2. find url and store it in a variable
url = "https://www.imdb.com/chart/top"

In [62]:
# 3. download html with a get request
# requests has no default timeout: without one this cell can hang forever
# if the server accepts the connection but never answers
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [63]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [64]:
# 4.2. check that the html code looks like it should
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Top 250 Movies - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/chart/top" rel="canonical"/>
<meta content="http://www.imdb.com/chart/top" property="og:url">
<script>
  

In [65]:
# 5. retrieve/extract the desired info (here, you'll paste the "Selector" you copied before to get the element that belongs to the top movie)
soup.select("#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(1) > td.titleColumn")
#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(1) > td.titleColumn > a

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>]

In [66]:
soup.select("td.titleColumn") # all the info about all the movies

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">De 12 gezworenen</a>
 <span class="secondaryIn

In [67]:
soup.select("td.titleColumn a") # all elements containing movie titles

[<a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>,
 <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>,
 <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>,
 <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>,
 <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">De 12 gezworenen</a>,
 <a href="/title/tt0108052/" title="Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes">Schindler's List</a>,
 <a href="/title/tt0167260/" title="Peter Jackson (dir.), Elijah Wood, Viggo Mortensen">The Lord of the Rings: The Return of the King</a>,
 <a href="/title/tt0110912/" title="Quentin Tarantino (dir.), John Travolta, Uma Thurman">Pulp Fiction</a>,
 <a href="/title/tt0060196/" title="Sergio Leone (dir.), Clint Eastwood, Eli

In [69]:
# .get_text() extracts the text content of a selected tag.
# To process every movie we will need a loop over the result set; here we
# only look at the first match.
# (The bare `soup.select(...)[0]` expression that used to precede this line
# was removed: only the last expression of a cell is displayed, so it ran a
# full-document query for nothing.)
soup.select("td.titleColumn a")[0].get_text()

'The Shawshank Redemption'

In [72]:
# the director and main stars are in the same tag, but as a value of the attribute "title"
# we can access attributes as key-value pairs of dictionaries: using ["key"] to get the value:
soup.select("td.titleColumn a")[0]["title"]
#instead of ["title"] we could use .get("title"): choose whatever you prefer

'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'

In [75]:
# the years are inside a 'span' tag with the 'secondaryInfo' class
# we also specify the parent tag and its class, which is the same we used before
# the years are inside parentheses, but we'll take care of that later
soup.select("td.titleColumn span.secondaryInfo")[0].get_text()

'(1994)'

## Making beautiful soups into beautiful tables

In [76]:
# initialize empty lists
title = []
dir_stars = []
year = []

# Run each CSS query exactly once. Calling soup.select() inside the loop
# re-scans the whole document on every iteration (three scans per movie).
title_links = soup.select("td.titleColumn a")
year_spans = soup.select("td.titleColumn span.secondaryInfo")

# define the number of iterations by checking how many elements are in the
# retrieved result set
# (this is equivalent but more robust than just explicitly defining 250 iterations)
num_iter = len(title_links)

# titles and years are paired by position, so every title needs a year span
if len(year_spans) < num_iter:
    raise IndexError(
        "found fewer year spans than title links; the page layout may have changed"
    )

# iterate through the result sets and retrieve all the data
for link, span in zip(title_links, year_spans):
    title.append(link.get_text())
    dir_stars.append(link["title"])  # director and main stars live in the "title" attribute
    year.append(span.get_text())     # still wrapped in parentheses, e.g. "(1994)"

print(title)
print(dir_stars)
print(year)



['The Shawshank Redemption', 'The Godfather', 'The Godfather: Part II', 'The Dark Knight', 'De 12 gezworenen', "Schindler's List", 'The Lord of the Rings: The Return of the King', 'Pulp Fiction', 'The Good, the Bad and the Ugly', 'The Lord of the Rings: The Fellowship of the Ring', 'Fight Club', 'Forrest Gump', 'Inception', 'The Lord of the Rings: The Two Towers', 'Star Wars: Episode V - The Empire Strikes Back', 'The Matrix', 'Goodfellas', "One Flew Over the Cuckoo's Nest", 'Shichinin no samurai', 'Seven', 'The Silence of the Lambs', 'Cidade de Deus', "It's a Wonderful Life", 'Het leven is mooi', 'Saving Private Ryan', 'Star Wars: Episode IV - A New Hope', 'Interstellar', 'Spider-Man: No Way Home', 'Spirited Away', 'The Green Mile', 'Gisaengchung', 'Léon', 'Seppuku', 'The Pianist', 'Terminator 2: Judgment Day', 'Terug naar de toekomst', 'The Usual Suspects', 'Psycho', 'The Lion King', 'Modern Times', 'Hotaru no haka', 'American History X', 'Whiplash', 'Gladiator', 'The Departed', 'Cit

In [77]:
# build the table from the scraped lists: one column per list, in this order
movies = pd.DataFrame(dict(title=title, dir_stars=dir_stars, year=year))

In [78]:
movies.head()

Unnamed: 0,title,dir_stars,year
0,The Shawshank Redemption,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",(1994)
1,The Godfather,"Francis Ford Coppola (dir.), Marlon Brando, Al...",(1972)
2,The Godfather: Part II,"Francis Ford Coppola (dir.), Al Pacino, Robert...",(1974)
3,The Dark Knight,"Christopher Nolan (dir.), Christian Bale, Heat...",(2008)
4,De 12 gezworenen,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",(1957)


In [79]:
director = []
star_1 = []
star_2 = []

# Each entry looks like "Frank Darabont (dir.), Tim Robbins, Morgan Freeman":
# the director first (tagged with "(dir.)"), then the main stars.
for movie in dir_stars:
    # strip() removes the space that follows each comma, so the star names
    # are stored without leading whitespace
    crew = [name.strip() for name in movie.split(",")]
    # pad with None so an entry listing fewer than two stars does not raise IndexError
    crew += [None] * (3 - len(crew))
    director.append(crew[0].replace(" (dir.)", ""))
    star_1.append(crew[1])
    star_2.append(crew[2])

In [80]:
# remove the parentheses around each scraped year, e.g. '(1994)' -> '1994'
# (the values stay strings)
year_c = [raw_year.replace('(', '').replace(')', '') for raw_year in year]

In [81]:
# rebuild the table from the cleaned lists: one column per list, in this order
movies = pd.DataFrame(
    dict(
        title=title,
        director=director,
        star_1=star_1,
        star_2=star_2,
        year=year_c,
    )
)

In [82]:
movies.head()

Unnamed: 0,title,director,star_1,star_2,year
0,The Shawshank Redemption,Frank Darabont,Tim Robbins,Morgan Freeman,1994
1,The Godfather,Francis Ford Coppola,Marlon Brando,Al Pacino,1972
2,The Godfather: Part II,Francis Ford Coppola,Al Pacino,Robert De Niro,1974
3,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger,2008
4,De 12 gezworenen,Sidney Lumet,Henry Fonda,Lee J. Cobb,1957


In [83]:
len(movies)

250