In [2]:
# This notebook demonstrates simple data extraction using
# a) the requests library (plain HTTP requests)
# b) the Beautiful Soup library (HTML parsing)


import requests

# First example: the input is a URL (metadata) and the output is raw text.
book_url = "https://www.gutenberg.org/cache/epub/53101/pg53101.txt"
# Number of leading characters of the book kept for the preview below.
PREVIEW_CHARS = 500
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(book_url, timeout=30)

if response.status_code == 200:
    # Keep only the first PREVIEW_CHARS characters of the decoded body.
    text_content = response.text[:PREVIEW_CHARS]
    print("Raw string")
    print(repr(text_content))  # escaped form of the preview (obs: \ufeff = Byte Order Mark)
    print("\n\n")
    print("Formatted string")
    print(text_content)  # the same preview, rendered as plain text
else:
    # Report the HTTP status instead of failing silently.
    print(response.status_code)
    

Raw string
'\ufeffThe Project Gutenberg eBook of A Mao e A Luva\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this ebook or online\r\nat www.gutenberg.org. If you are not located in the United States,\r\nyou will have to check the laws of the country where you are located\r\nbefore using this eBoo'



Formatted string
﻿The Project Gutenberg eBook of A Mao e A Luva
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the count

In [4]:
# Second example: the input is a URL and the output is an
# HTML-marked document.

news_url = "https://www.diariodebarrelas.com.br/cidade-de-barrelas/"
# Browser-like User-Agent sent with the request.
# NOTE(review): presumably the site rejects the default client User-Agent — confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(news_url, allow_redirects=True, headers=headers, timeout=30)

if response.status_code == 200:
    # html_content is assigned only on success; code that reads it later
    # will raise NameError if this request did not return 200.
    html_content = response.text
    print("\n\n")
    print("Raw HTML text content")
    print(html_content) # print the HTML text, markup included
else:
    # Show the HTTP status code of the failed request.
    print(response.status_code)





Raw HTML text content
<!doctype html>
<html lang="pt-BR">
<head>

	<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-11022702-1"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-11022702-1');
</script>
	
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<link rel="profile" href="https://gmpg.org/xfn/11" />
	<title>A Cidade de Barrelas &#8211; Diário de Barrelas</title>
<meta name='robots' content='max-image-preview:large' />
	<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
	<link rel='dns-prefetch' href='//pagead2.googlesyndication.com' />
<link rel='dns-prefetch' href='//www.googletagmanager.com' />
<link rel="alternate" type="application/rss+xml" title="Feed para Diário de Barrelas &raquo;" href="https:/

In [6]:
# Locate one specific <div> inside the downloaded page

from bs4 import BeautifulSoup

# Parse the HTML text held in html_content with the 'html.parser' backend
soup = BeautifulSoup(html_content, features='html.parser')

# Look up a <div> tagged with the CSS class 'entry-content'
specific_content = soup.find(name='div', class_='entry-content')
# Single print call: header line, then the matched element on the next line
print("\n\n Specific content extraction", specific_content, sep="\n")



 Specific content extraction
<div class="entry-content">
<p>O<strong> município de Barrelas</strong> foi fundado em 2 de janeiro de 1856 quando o <strong>padre Joaquim das Costas</strong> estabeleceu o povoado de <strong>São Jorge das Barrelas</strong> com seu grupo de missionários na bela região do Planalto Paulista.</p>
<p>A atividade inicial de Barrelas foi a <strong>pecuária</strong> e a cultura do <strong>milho</strong> e do <strong>algodão</strong>, mas a cidade viveu seu apogeu na época do <strong>expansionismo cafeeiro</strong> paulista no início do século 20.</p>
<p>Barrelas é uma cidade que hoje tem <strong>4.453 habitantes</strong> e é conhecida por sua tradição de hospitalidade e amizade. A cidade é famosa por seus festivais de música ao ar livre e por sua gastronomia caseira da vovó, especialmente por suas sobremesas típicas.</p>
<p><strong>Curiosidades</strong></p>
<p>Em 1968, Barrelas foi cenário do filme <strong>“Città dei Bambini”</strong> (“A Cidade dos Meninos”) co