# Scraping tutorial

- https://www.dataquest.io/blog/web-scraping-tutorial-python/
- https://www.dataquest.io/blog/web-scraping-beautifulsoup/

In [1]:
import requests

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [2]:
page.status_code

200

In [3]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
print(soup.prettify())


<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [7]:
soup_children = list(soup.children)

In [12]:
soup_children[2]

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [13]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [14]:
html = soup_children[2]

In [18]:
html_children = list(html.children)

In [22]:
body = html_children[3]

In [23]:
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [24]:
p = list(body.children)[1]

In [25]:
p.get_text()

'Here is some simple content for this page.'

In [26]:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [27]:
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [28]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [29]:
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [31]:
page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')

In [55]:
seven_day = soup.find(id='seven-day-forecast')

In [56]:
forecast_items = seven_day.find_all(class_="tombstone-container")

In [58]:
tonight = forecast_items[1]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Overnight
  <br>
   <br/>
  </br>
 </p>
 <p>
  <img alt="Overnight: Rain.  Low around 54. Breezy, with a south wind around 22 mph, with gusts as high as 29 mph.  Chance of precipitation is 90%. New precipitation amounts of less than a tenth of an inch possible. " class="forecast-icon" src="newimages/medium/nra90.png" title="Overnight: Rain.  Low around 54. Breezy, with a south wind around 22 mph, with gusts as high as 29 mph.  Chance of precipitation is 90%. New precipitation amounts of less than a tenth of an inch possible. "/>
 </p>
 <p class="short-desc">
  Rain and
  <br>
   Breezy
  </br>
 </p>
 <p class="temp temp-low">
  Low: 54 °F
 </p>
</div>


In [59]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()

print(period)
print(short_desc)
print(temp)

Overnight
Rain andBreezy
Low: 54 °F


In [60]:
img = tonight.find("img")
desc = img['title']

print(desc)

Overnight: Rain.  Low around 54. Breezy, with a south wind around 22 mph, with gusts as high as 29 mph.  Chance of precipitation is 90%. New precipitation amounts of less than a tenth of an inch possible. 


In [67]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods = periods[1:]

In [68]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

print(short_descs)
print(temps)
print(descs)


['Click HERE for Details', 'Rain andBreezy', 'Heavy Rainand Breezy', 'Heavy Rainand Windy', 'ShowersLikely', 'ChanceShowers', 'Slight ChanceShowers thenMostly Sunny', 'Mostly Clear', 'Mostly Sunny']
['Low: 54 °F', 'High: 58 °F', 'Low: 53 °F', 'High: 60 °F', 'Low: 50 °F', 'High: 57 °F', 'Low: 45 °F', 'High: 57 °F']
['', 'Overnight: Rain.  Low around 54. Breezy, with a south wind around 22 mph, with gusts as high as 29 mph.  Chance of precipitation is 90%. New precipitation amounts of less than a tenth of an inch possible. ', 'Tuesday: Rain. The rain could be heavy at times.  High near 58. Breezy, with a south wind 23 to 26 mph, with gusts as high as 34 mph.  Chance of precipitation is 100%. New precipitation amounts between three quarters and one inch possible. ', 'Tuesday Night: Rain before 10pm, then rain and possibly a thunderstorm between 10pm and 4am, then a chance of showers and thunderstorms after 4am. Some of the storms could produce heavy rainfall.  Low around 53. Windy, with a

In [69]:
import pandas as pd
weather = pd.DataFrame({
        "period": periods, 
        "short_desc": short_descs, 
        "temp": temps, 
        "desc":descs
    })
weather

ValueError: arrays must all be same length

# IMDB

In [70]:
from requests import get

url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

response = get(url)
print(response.text[:500])



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle",


In [73]:
html_soup = BeautifulSoup(response.text, 'html.parser')

In [74]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


In [75]:
first_movie = movie_containers[0]
first_movie

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt3315342/?ref_=adv_li_i"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/?ref_=adv_li_tt">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>
<p class="text-muted ">
<span class="certificate">12</span>
<span class="ghost">|</span>
<span class="runtime">137 min</span>
<span class="ghost">|<

In [80]:
first_name = first_movie.h3.a.text
first_name

'Logan'

In [82]:
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
first_year

'(2017)'

In [83]:
first_imdb = float(first_movie.strong.text)
first_imdb

8.1

In [84]:
first_mscore = first_movie.find('span', class_ = 'metascore favorable')

first_mscore = int(first_mscore.text)
print(first_mscore)

77
