In [8]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = "https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_popula%C3%A7%C3%A3o"

# Send a GET request to the page
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# The first table contains the list of federal units by population
uf_table = soup.find_all('table', {'class': 'wikitable'})[0]

# The second table (assuming there's another relevant table right after) contains region populations
region_table = soup.find_all('table', {'class': 'wikitable'})[1]

# For the UF table, we'll extract the 'Unidade Federativa' and 'População Censo 2022'
uf_data = []
for row in uf_table.find_all('tr')[1:]:  # skip the header row
    cols = row.find_all('td')
    if cols:
        uf_name = cols[0].text.strip()
        population = cols[2].text.strip()  # Assuming the population is in the third column
        uf_data.append({'Unidade Federativa': uf_name, 'População Censo 2022': population})

# For the region table, extract 'Região' and 'População'
region_data = []
for row in region_table.find_all('tr')[1:]:  # skip the header row
    cols = row.find_all('td')
    if cols:
        region_name = cols[1].text.strip()
        population = cols[2].text.strip()  # Assuming the population is in the second column
        region_data.append({'Região': region_name, 'População': population})

In [9]:
import pandas as pd

# Assuming uf_data and region_data are already populated as shown previously

# Convert uf_data into a DataFrame
uf_df = pd.DataFrame(uf_data)

# Convert region_data into a DataFrame
region_df = pd.DataFrame(region_data)

# Display the DataFrames (you can also use other methods to save or manipulate them)
print("Unidades Federativas Data:")
print(uf_df)
print("\nRegião Data:")
print(region_df)

Unidades Federativas Data:
   Unidade Federativa População Censo 2022
0                   1           44 411 238
1                   2           20 538 718
2                   3           16 054 524
3                   4           14 141 626
4                   5           11 444 380
5                   6           10 882 965
6                   7            9 058 931
7                   8            8 794 957
8                   9            8 121 025
9                  10            7 610 361
10                 11            7 056 495
11                 12            6 775 805
12                 13            3 974 687
13                 14            3 941 613
14                 15            3 833 712
15                 16            3 658 649
16                 17            3 302 729
17                 18            3 271 199
18                 19            3 127 683
19                 20            2 817 381
20                 21            2 757 013
21                 22      

In [10]:
# Convert the uf_df DataFrame to a CSV file
uf_df.to_csv('csv/unidades_federativas_2022.csv', index=False)

# Convert the region_df DataFrame to a CSV file
region_df.to_csv('csv/regioes_2022.csv', index=False)