# HTML Parsing with Beautiful Soup
1. Fundamentals
2. Traversing
3. Finding and other utilities

In [1]:
from bs4 import BeautifulSoup

url = 'https://news.ycombinator.com/'

In [2]:
import requests

html_doc = requests.get(url).content

In [3]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [4]:
# soup

In [5]:
print(soup.prettify())

<html lang="en" op="news">
 <head>
  <meta content="origin" name="referrer"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link href="news.css?SaGQkWwFeG80WRjWeYkp" rel="stylesheet" type="text/css"/>
  <link href="favicon.ico" rel="shortcut icon"/>
  <link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
  <title>
   Hacker News
  </title>
 </head>
 <body>
  <center>
   <table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
    <tr>
     <td bgcolor="#ff6600">
      <table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%">
       <tr>
        <td style="width:18px;padding-right:4px">
         <a href="https://news.ycombinator.com">
          <img height="18" src="y18.gif" style="border:1px white solid;" width="18"/>
         </a>
        </td>
        <td style="line-height:12pt; height:10px;">
         <span class="pagetop">
          <b class="hnname">
           <a href

Let's investigate.

In [6]:
title = soup.title

Introspect and navigate the DOM.

In [7]:
title.text

'Hacker News'

In [8]:
soup.span

<span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit">submit</a> </span>

Get class

In [9]:
soup.span["class"]

['pagetop']

Let's get all the links.

In [10]:
soup.find_all('a')

[<a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a>,
 <a href="news">Hacker News</a>,
 <a href="newest">new</a>,
 <a href="front">past</a>,
 <a href="newcomments">comments</a>,
 <a href="ask">ask</a>,
 <a href="show">show</a>,
 <a href="jobs">jobs</a>,
 <a href="submit">submit</a>,
 <a href="login?goto=news">login</a>,
 <a href="vote?id=31139610&amp;how=up&amp;goto=news" id="up_31139610"><div class="votearrow" title="upvote"></div></a>,
 <a class="titlelink" href="https://atp.fm/205-chris-lattner-interview-transcript#gc">Chris Lattner on garbage collection vs. Automatic Reference Counting (2017)</a>,
 <a href="from?site=atp.fm"><span class="sitestr">atp.fm</span></a>,
 <a class="hnuser" href="user?id=Austin_Conlon">Austin_Conlon</a>,
 <a href="item?id=31139610">5 hours ago</a>,
 <a href="hide?id=31139610&amp;goto=news">hide</a>,
 <a href="item?id=31139610">64 comments</a>,
 <a href="vote?id=31137324&amp;how=up&amp;goto

## Tags
- name
- attrs
- modifiable


In [11]:
anchor = soup.find_all('a')[3]

In [12]:
anchor

<a href="front">past</a>

In [13]:
anchor.name

'a'

In [14]:
anchor.attrs

{'href': 'front'}

In [15]:
anchor.get_text()

'past'

## Traversing
Let's navigate more formally with...

- `.next_element` and `.previous_element`
- `.next_siblings` and `.previous_siblings`

In [16]:
[y for y in soup.table.children]

['\n',
 <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a></td>
 <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
 <a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
 <a href="login?goto=news">login</a>
 </span></td>
 </tr></table></td></tr>,
 '\n',
 <tr id="pagespace" style="height:10px" title=""></tr>,
 <tr><td><table border="0" cellpadding="0" cellspacing="0" class="itemlist">
 <tr class="athing" id="31139610">
 <td align="right" class="title" valign="top"><span class="rank"

## Finding by `id` and `class_`

In [17]:
soup.find_all("span", class_="score")

[<span class="score" id="score_31139610">123 points</span>,
 <span class="score" id="score_31137324">273 points</span>,
 <span class="score" id="score_31141042">17 points</span>,
 <span class="score" id="score_31134534">624 points</span>,
 <span class="score" id="score_31134074">395 points</span>,
 <span class="score" id="score_31138214">101 points</span>,
 <span class="score" id="score_31095369">75 points</span>,
 <span class="score" id="score_31128267">176 points</span>,
 <span class="score" id="score_31094373">149 points</span>,
 <span class="score" id="score_31140389">86 points</span>,
 <span class="score" id="score_31139829">49 points</span>,
 <span class="score" id="score_31128571">11 points</span>,
 <span class="score" id="score_31136285">100 points</span>,
 <span class="score" id="score_31137646">74 points</span>,
 <span class="score" id="score_31140324">17 points</span>,
 <span class="score" id="score_31138228">71 points</span>,
 <span class="score" id="score_31100438">42 poin

In [18]:
soup.find_all(id="26907176")

[]

## `get_text()`

In [19]:
url = 'https://www.macys.com/shop/mens-clothing/mens-underwear?id=57&cm_sp=c2_1111US_catsplash_men-_-row1-_-image_underwear-and-socks&edge=hybrid'

In [20]:
html_doc = requests.get(url).content

In [21]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [22]:
print(soup.prettify())

<html>
 <head>
  <title>
   Access Denied
  </title>
 </head>
 <body>
  <h1>
   Access Denied
  </h1>
  You don't have permission to access the requested URL on this server.
  <p>
   Reference: 18.877a4217.1650775466.10f872c0
  </p>
 </body>
</html>


# Scraping Techniques
- Chrome Inspector
- https://curl.trillworks.com/

1. Go to Chrome inspector/developer tools
2. Copy request as cURL
3. Put into the trillworks 
4. Copy and paste here and clean up
5. Try the request again

In [23]:
import os
import requests

headers = {
    'authority': 'www.macys.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-US,en;q=0.9',
    'if-none-match': '"19fb13-oO1RHLn6nYAnERIvlcGEn7tVeLQ"',
}

params = (
    ('id', '57'),
    ('cm_sp', 'c2_1111US_catsplash_men-_-row1-_-image_underwear-and-socks'),
    ('edge', 'hybrid'),
)

response = requests.get('https://www.macys.com/shop/mens-clothing/mens-underwear', headers=headers, params=params)

In [24]:
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Mens Underwear - Boxers, Briefs &amp; Jockstraps - Macy's
  </title>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="telephone=no" name="format-detection">
   <meta content="Shop the Latest Collection of Underwear for Men Online at Macys.com. FREE SHIPPING AVAILABLE!" name="description"/>
   <meta content="width=device-width, initial-scale=1" name="viewport"/>
   <meta content="Mens Underwear - Boxers, Briefs &amp; Jockstraps - Macy's" property="og:title" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
    <meta content="website" property="og:type" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
     <meta content="https://www.macys.com/img/nav/co_macysLogo3.gif" property="og:image" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
      <meta content="https:/

## Mini-Lab
1. Get all products and prices and save to CSV.
2. Create a Bottle-powered page where it shows you a random product with image, name, and price and link to the Macy's page for that product. 
3. Bonus: allow that Bottle-powered page to filter on price min and mx.

In [25]:
productNames = soup.find_all("div", {"class": "productBrand"})

In [26]:
productPrices = soup.find_all("div", {"class": "priceInfo"})

In [27]:
len(productNames)

66

In [28]:
len(productPrices)

66

In [29]:
import csv 

with open('prices.csv',  mode='w') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for i in range(0,len(productNames)):
        writer.writerow([productNames[i].text.strip(), productPrices[i].text.strip()])