In [1]:
### stdin e stdout

In [1]:
def get_domain(email_address: str) -> str:
    return email_address.lower().split('@')[-1]

assert get_domain('joelgus@gmail.com') == 'gmail.com'

In [2]:
!python -m pip install beautifulsoup4 requests html5lib

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Collecting requests
  Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting html5lib
  Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Collecting soupsieve>1.2
  Using cached soupsieve-2.5-py3-none-any.whl (36 kB)
Collecting charset-normalizer<4,>=2
  Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl (99 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.6-py3-none-any.whl (61 kB)
Collecting urllib3<3,>=1.21.1
  Using cached urllib3-2.2.1-py3-none-any.whl (121 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2024.2.2-py3-none-any.whl (163 kB)
Collecting webencodings
  Using cached webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: webencodings, urllib3, soupsieve, idna, html5lib, charset-normalizer, certifi, requests, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 certifi-2024.2.2 charset-normalizer-3.3.2 html5lib-1.1 


[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from bs4 import BeautifulSoup

import requests

url = 'https://github.com/joelgrus/data/blob/master/getting-data.html'

html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

In [6]:
# geralmente trabalhamos com objetos tag:
# por exemplo: encontrando a tag p

first_paragraph = soup.find('p')
first_paragraph

<p>We read every piece of feedback, and take your input very seriously.</p>

In [10]:
# obtenha textos com a propriedade .text

first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

first_paragraph_text, first_paragraph_words

('We read every piece of feedback, and take your input very seriously.',
 ['We',
  'read',
  'every',
  'piece',
  'of',
  'feedback,',
  'and',
  'take',
  'your',
  'input',
  'very',
  'seriously.'])

In [10]:
# para extrair atributos de uma tag, trate-a como dict

# first_paragraph_id = soup.p['id']         # gera um KeyError se não houver nenhum id
first_paragraph_id2 = soup.p.get('id')      # retorna None se não tiver nenhum id
first_paragraph_id2

In [9]:
# obtendo multiplas tags ao mesmo tempo

all_paragraphs = soup.find_all('p')         # ou apenas soup('p')
all_paragraphs
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
paragraphs_with_ids

[]

In [13]:
# encontrando tags com uma classe específica

important_paragraphs = soup('p', {'class': 'important'})
important_paragraphs

important_paragraphs2 = soup('p', 'important')
important_paragraphs2

important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]
important_paragraphs3

[]

In [14]:
# encontar todas as <span> dentro de <div> 
# Aviso: retornará o mesmo <span> várias vezes se ele estiver em vários <div>

spans_inside_div = [span
                    for div in soup('div')          # para cada <div> na página
                    for span in div('span')]        # encontre cada <span> dentro dele

spans_inside_div

[<span class="progress-pjax-loader Progress position-fixed width-full" data-view-component="true">
     <span class="Progress-item progress-pjax-loader-bar left-0 top-0 color-bg-accent-emphasis" data-view-component="true" style="width: 0%;"></span>
 </span>,
 <span class="Progress-item progress-pjax-loader-bar left-0 top-0 color-bg-accent-emphasis" data-view-component="true" style="width: 0%;"></span>,
 <span class="d-none">Toggle navigation</span>,
 <span class="Button-content">
     <span class="Button-label"><div class="HeaderMenu-toggle-bar rounded my-1"></div>
             <div class="HeaderMenu-toggle-bar rounded my-1"></div>
             <div class="HeaderMenu-toggle-bar rounded my-1"></div></span>
   </span>,
 <span class="Button-label"><div class="HeaderMenu-toggle-bar rounded my-1"></div>
             <div class="HeaderMenu-toggle-bar rounded my-1"></div>
             <div class="HeaderMenu-toggle-bar rounded my-1"></div></span>,
 <span class="d-block h4 color-fg-default my-1

### Exemplo: Explorando o Congresso

In [8]:
# coletando todos os urls com links da página

url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, 'html5lib')

all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))

967


In [10]:
import re

# Deve começar com https:// ou http://
# Deve terminar com .house.gov ou .house.gov/
regex = r"https?://.*\.house\.gov/?$"

assert re.match(regex, "https://joel.house.gov")
assert not re.match(regex, "https://joel.house.gov/biography")

# filtrando a list
good_urls = [url for url in all_urls if re.match(regex, url)] 
print(len(good_urls))

good_urls = list(set(good_urls))
len(good_urls)



872


436

In [11]:
# obtendo os comunicados de imprensa

html = requests.get("https://jayapal.house.gov").text
soup = BeautifulSoup(html, 'html5lib')

# use um conjunto por que os links podem aparecer várias vezes
links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
print(links)

{'https://jayapal.house.gov/category/press-releases/', 'https://jayapal.house.gov/category/news/'}


In [13]:
from typing import Dict, Set

press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}

    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://sykes.house.gov: {'/media/press-releases'}
https://meeks.house.gov: {'/media/press-releases', 'https://democrats-foreignaffairs.house.gov/press-releases'}
https://johnrose.house.gov/: {'/media/press-releases'}
https://emmer.house.gov/: {'/press-releases'}
https://trentkelly.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://smucker.house.gov/: {'/media/press-releases'}
https://fernandez.house.gov: {'/media/press-releases'}
https://wilson.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://crane.house.gov: {'/media/press-releases'}
https://baird.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://casten.house.gov: {'https://casten.house.gov/media/press-releases'}
https://good.house.gov: {'/media/press-releases'}
https://ritchietorres.house.gov: set()
https://robinkelly.house.gov/: {'/media-center/press-releases'}
https://carey.house.gov: {'/media/press-releases'}
https://schrier.house.gov: {'/media/press-releases'}
https://clarke.hou

In [14]:
# identificando os congresistas cujo comunicado cita a palavra "dados"

def paragraph_mentions(text: str, keyword: str) -> bool:
    """Retorna True se um <p> no texto mencionar {keyword}"""
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]

    return any(keyword.lower() in paragraph.lower() for paragraph in paragraphs)

# teste rápido
text = """<body><h1>Facebook</h1><p>Twitter</p></body>"""
assert paragraph_mentions(text, 'twitter')
assert not paragraph_mentions(text, 'facebook')

In [15]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text

        if paragraph_mentions(text, 'data'):
            print(f'{house_url}')
            break

https://delbene.house.gov
https://tenney.house.gov/
https://fallon.house.gov
https://balint.house.gov
https://castor.house.gov/
https://phillips.house.gov/
https://dustyjohnson.house.gov/
https://grothman.house.gov
https://sarajacobs.house.gov
https://beyer.house.gov
https://biggs.house.gov
https://pallone.house.gov
https://danbishop.house.gov
https://degette.house.gov
https://luttrell.house.gov
https://mchenry.house.gov
https://laurellee.house.gov
https://schakowsky.house.gov
https://cartwright.house.gov
https://tokuda.house.gov
https://kean.house.gov


### Usando Apis

In [None]:
import requests, json

github_user = 'joelgrus'
endpoint = f"https://api.github.com/users/{github_user}/repos"

repos = json.loads(requests.get(endpoint).text)