In [1]:
import requests
import bs4

## News Articles

In [2]:
# make the http request and turn the response into a beautiful soup object
response = requests.get('https://web-scraping-demo.zgulde.net/news')
html = response.text
soup = bs4.BeautifulSoup(html)

1. Find the container for the information we want `article_container`
1. Within the container, identify the entities that we want to produce a list
1. Process each individual entity (identify the pieces that we want and extract them)

In [3]:
# soup

In [4]:
response.content, 'html.parser'

(b'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>News Example Page</title>\n    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" />\n</head>\n<body class="mx-auto max-w-screen-lg pb-32">\n    \n<h1 class="my-5 text-4xl text-center">News!</h1>\n<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">\n    <p>\n        <i class="bi bi-exclamation-circle text-xl"></i>\n        All data on this page is strictly for demonstration purposes and fake.\n    </p>\n</div>\n<div class="grid gap-y-12">\n    \n    <div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">\n        <img src="/static/placeholder.png" />\n        

In [5]:
article_container = soup.select('.grid.gap-y-12')

In [6]:
article_container

[<div class="grid gap-y-12">
 <div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
 <h2 class="text-2xl text-green-900">live though thing</h2>
 <div class="grid grid-cols-2 italic">
 <p> 2013-11-20 </p>
 <p class="text-right">By Darlene Landry </p>
 </div>
 <p>Property two operation yeah sport economic low. Positive treat first open person.
 Part bring need present question order. Choose size traditional.</p>
 </div>
 </div>
 <div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
 <h2 class="text-2xl text-green-900">skin position probably</h2>
 <div class="grid grid-cols-2 italic">
 <p> 2010-05-03 </p>
 <p class="text-right">By Stacey Bennett </p>
 </div>
 <p>Apply heavy along our. Particular safe society seven artist 

In [8]:
articles = article_container[0].select('.grid.grid-cols-4.gap-x-4.border')

In [9]:
article = articles[0]
# get a pretty printed representation of our element
print(article.prettify())

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
  <h2 class="text-2xl text-green-900">
   live though thing
  </h2>
  <div class="grid grid-cols-2 italic">
   <p>
    2013-11-20
   </p>
   <p class="text-right">
    By Darlene Landry
   </p>
  </div>
  <p>
   Property two operation yeah sport economic low. Positive treat first open person.
Part bring need present question order. Choose size traditional.
  </p>
 </div>
</div>



`.select` vs `.find` or `.find_all`

- `.select` always gives back a list, sometimes the list is empty, sometimes it has a single element, sometimes it has multiple elements
- `.find` and `.find_all` accept a *tag name*
- `.find` returns a single element
- `.find_all` returns a list of elements
- With `.select` multiple class names each have a `.` in front of them
- With `.find` or `.find_all` you can use a `class_` keyword argument, but the classes must match exactly

In [13]:
article.select('div', class_='grid grid-cols-2 italic')[0].find_all('p')

[<p> 2013-11-20 </p>,
 <p class="text-right">By Darlene Landry </p>,
 <p>Property two operation yeah sport economic low. Positive treat first open person.
 Part bring need present question order. Choose size traditional.</p>]

In [15]:
def process_article(article):
    title = article.find('h2').text
    date_and_byline_div = article.select('.grid.grid-cols-2.italic')[0]
    date_p, by_p = date_and_byline_div.find_all('p')
    summary = article.find_all('p')[-1].text
    
    return {
        "title": title,
        "date": date_p.text,
        "by": by_p.text,
        "summary": summary
    }

In [16]:
process_article(articles[3])

{'title': 'according even difference',
 'date': ' 1971-04-17 ',
 'by': 'By Penny Thornton ',
 'summary': 'Spring true mean social realize environmental. Anything employee list glass. Road couple both.\nProvide fall recently yourself employee every very. Medical federal certainly key answer statement safe.'}

In [17]:
import pandas as pd


pd.DataFrame([process_article(article) for article in articles])

Unnamed: 0,title,date,by,summary
0,live though thing,2013-11-20,By Darlene Landry,Property two operation yeah sport economic low...
1,skin position probably,2010-05-03,By Stacey Bennett,Apply heavy along our. Particular safe society...
2,buy or phone,2015-07-03,By Mark Wilson,Operation very growth glass most phone arrive....
3,according even difference,1971-04-17,By Penny Thornton,Spring true mean social realize environmental....
4,event he issue,1986-07-16,By Danielle Lyons,Think site court player. Population take case ...
5,any particularly child,1987-10-30,By Theresa Newman,Available watch I. Others land with program. C...
6,suggest lead purpose,1985-06-23,By David Ponce,Money focus never day always. Air summer word ...
7,large American much,1977-09-24,By Erin Hill,Free usually him tax. Turn foreign son whom wh...
8,something read game,2016-04-06,By Amy Anderson,Attorney lose son cold movement stock. So work...
9,either ready social,2013-03-19,By Joseph Sanchez,Final full gas what expert center. Four appear...


## People

In [19]:
response = requests.get('https://web-scraping-demo.zgulde.net/people')
soup = bs4.BeautifulSoup(response.text)
# soup

In [20]:
people_div = soup.select('#people')

In [24]:
people_div[0]

<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Mary Barnes</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Advanced didactic intranet"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">jsanchez@hotmail.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">294-429-0836x07485</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                88789 Scott Fort <br/>
                Steinbury, MO 04880
            </p>
</div>
</div>
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-

In [26]:
people = people_div[0].select('.person')

In [28]:
len(people)

10

In [27]:
person = people[0]
print(person.prettify())

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">
  Mary Barnes
 </h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
  "Advanced didactic intranet"
 </p>
 <div class="grid grid-cols-9">
  <i class="bi bi-envelope-fill text-purple-800">
  </i>
  <p class="email col-span-8">
   jsanchez@hotmail.com
  </p>
  <i class="bi bi-telephone-fill text-purple-800">
  </i>
  <p class="phone col-span-8">
   294-429-0836x07485
  </p>
 </div>
 <div class="address grid grid-cols-9">
  <i class="bi bi-geo-fill text-purple-800">
  </i>
  <p class="col-span-8">
   88789 Scott Fort
   <br/>
   Steinbury, MO 04880
  </p>
 </div>
</div>



In [29]:
def process_person(person):
    return {
        'name': person.find(class_='name').text,
        'quote': person.find(class_='quote').text.strip(),
        'email': person.find(class_='email').text,
        'phone': person.find(class_='phone').text,
        'address': person.find(class_='address').text.strip(),
    }

In [36]:
people[4].find(class_='name').text

'Jessica Orr MD'

In [31]:
process_person(people[4])

{'name': 'Jessica Orr MD',
 'quote': '"Streamlined responsive alliance"',
 'email': 'jharris@perez.com',
 'phone': '(437)501-9370x075',
 'address': '274 Jones Streets Apt. 700 \n                Thomasside, CO 38826'}

In [37]:
pd.DataFrame([process_person(person) for person in people])

Unnamed: 0,name,quote,email,phone,address
0,Mary Barnes,"""Advanced didactic intranet""",jsanchez@hotmail.com,294-429-0836x07485,"88789 Scott Fort \n Steinbury, ..."
1,Katherine Martinez,"""Phased multimedia process improvement""",williamssamuel@hotmail.com,0650348648,5547 Hill Ranch \n Brucechester...
2,Michelle Hernandez,"""Operative motivating knowledgebase""",stephaniejohnson@jimenez.org,892.843.4697,914 David Run \n East Roberthav...
3,Joseph Brewer,"""Reverse-engineered actuating info-mediaries""",tracie28@white-johnson.net,313.129.9742x6557,200 Darrell Flat \n North Linds...
4,Jessica Orr MD,"""Streamlined responsive alliance""",jharris@perez.com,(437)501-9370x075,274 Jones Streets Apt. 700 \n T...
5,Amy Williams,"""Digitized foreground migration""",deborah54@price.net,8660424955,"529 Rose Road \n Brookeside, ME..."
6,Robin Khan,"""Customer-focused systemic open system""",paulfrye@jones.com,001-601-651-3085x36490,"47772 Lowe Trail \n Luisland, U..."
7,Michael Harmon,"""Multi-tiered multi-state matrices""",charlotte48@aguilar.com,971.646.6132x78097,7825 Hayes Estates Suite 725 \n ...
8,Ryan Riley,"""Intuitive regional knowledgebase""",murraypatricia@yahoo.com,319.796.0603x5230,926 Ariel Springs Apt. 867 \n S...
9,Amanda Jones,"""Intuitive 3rdgeneration structure""",zsandoval@mitchell.biz,+1-551-689-6612x4494,443 Hannah Station \n Munoztown...
