# Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">reality data century</h2>
<div class="grid grid-cols-2 italic">
<p> 1982-05-18 </p>
<p class="text-right">By John Fitzpatrick DVM </p>
</div>
<p>Now impact million few simply. Appear citizen station production. Politics board matter area later themselves people.
Energy any personal difference majority entire teach. Process hundred in cause. Cell cell performance walk their rate worker.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    output = {}
    output['headline'] = article.find('h2').text
    output['date'], output['byline'], output['description'] = [p.text for p in article.find_all('p')]
    return output

In [8]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,reality data century,1982-05-18,By John Fitzpatrick DVM,Now impact million few simply. Appear citizen ...
1,deep series success,1970-05-16,By Chase Johnson,Election world property moment focus close. Gu...
2,benefit week into,2004-05-21,By Dr. Terry Solis,Voice try include force argue mother across.\n...
3,agency stop represent,1986-10-03,By John Herrera,Share team particularly throughout when. Reali...
4,energy economic race,2018-06-25,By Karen Robinson,College public practice hot. Two age measure s...
5,machine school office,1992-04-20,By Joshua Harrison,Woman talk yard want travel life fly. Performa...
6,oil should visit,2004-07-14,By Antonio Newton,Over admit song per or who. Everybody senior f...
7,compare sing company,1992-05-29,By Amy Lewis,Team population heavy hotel. Network would foc...
8,time range customer,1975-04-05,By Edward Johnson,Class put management collection mind left simp...
9,individual training onto,1986-01-27,By Rachel Morgan,Tend join situation allow show notice. Task te...


mini exercise

In [10]:
# access the website
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [11]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Example People Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr


In [12]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text)
people = soup.select('div.person')
people[0]

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Claudia Oliver</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Front-line dedicated conglomeration"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">tranrachel@contreras.org</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">(891)667-8448</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                7156 Laura Vista <br/>
                New Randyfurt, NE 35017
            </p>
</div>
</div>

In [23]:
# create a function to parse the people article
def parse_people_article(person):
    output = {}
    output['name'] = person.find('h2').text
    output['quote'] = person.select('.quote')[0].text.strip()
    output['email_address'] = person.select_one('.email').text
    output['phone_number'] = person.select_one('.phone').text
    output['address'] = person.select_one('.address').text.strip()
    return output

In [24]:
# create a dataframe from the list of people
pd.DataFrame([parse_people_article(person) for person in people])

Unnamed: 0,name,quote,email_address,phone_number,address
0,Claudia Oliver,"""Front-line dedicated conglomeration""",tranrachel@contreras.org,(891)667-8448,7156 Laura Vista \n New Randyfu...
1,Kelsey Kaiser,"""Multi-channeled well-modulated archive""",raustin@hotmail.com,939-229-3582x57502,4790 Jo Trail \n East Mariaport...
2,David Miller,"""Front-line 4thgeneration algorithm""",umanning@vasquez.com,779.982.2138x13450,34147 Cummings Corners Apt. 228 \n ...
3,Kimberly Rodriguez,"""Decentralized bottom-line analyzer""",timothycox@stanley.com,(717)765-2982,90625 Miller Run Apt. 996 \n Na...
4,Benjamin Daniel,"""Reactive demand-driven project""",zarmstrong@hotmail.com,+1-605-633-2738,95925 Frank Harbors Apt. 859 \n ...
5,Brandon Smith,"""Object-based radical hierarchy""",lstewart@bell.com,+1-367-761-6604x68246,0916 Harris Port Suite 730 \n S...
6,Danielle Armstrong,"""Up-sized attitude-oriented application""",kristibarber@hotmail.com,001-563-419-0100x329,1631 Emily Manors \n Fernandezv...
7,Kiara Bird,"""Fundamental optimal function""",angelpoole@norman.info,536-255-0589,"47977 Haney Ramp \n New John, N..."
8,Joshua Jenkins,"""Multi-lateral mission-critical toolset""",jacksonchristopher@donaldson-brown.org,672-188-1507x680,"160 Hannah Wall \n New Chase, I..."
9,Robert Griffin,"""Managed high-level database""",john41@fry.com,(249)949-8343,8080 Ashley Squares Apt. 142 \n ...
