**UNIVERSIDADE DE SÃO PAULO (USP)**

**_Author_**: Carlos Filipe de Castro Lemos

**_Academic Study_**: Web Scraping

# References

* https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr/
* https://medium.com/data-hackers/web-scraping-com-python-para-pregui%C3%A7osos-unindo-beautifulsoup-e-selenium-parte-1-9677fc5e2385
* http://devfuria.com.br/php/como-funcionam-os-metodos-get-e-post/
* https://www.adamsmith.haus/python/docs/bs4.BeautifulSoup
* https://medium.com/machina-sapiens/raspagem-de-dados-com-python-e-beautifulsoup-1b1b7019774c

# Packages

In [103]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Pandas Web Scraping

In [104]:
# Page that holds the population-by-country table
url = 'https://www.worldometers.info/world-population/population-by-country/'

# pandas downloads the page and parses every <table> it finds;
# cells containing 'N.A.' are read as missing values
tables = pd.read_html(io=url, na_values='N.A.')

# The result is a plain Python list of DataFrames
type(tables)

list

In [105]:
# Report how many tables were parsed, then display the first one
print("#tables: ", len(tables))
tables[0]

#tables:  1


Unnamed: 0,#,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,India,1428627663,0.81 %,11454490,481,2973190,-486136,2.0,28.0,36 %,17.76 %
1,2,China,1425671352,-0.02 %,-215985,152,9388211,-310220,1.2,39.0,65 %,17.72 %
2,3,United States,339996563,0.50 %,1706706,37,9147420,999700,1.7,38.0,83 %,4.23 %
3,4,Indonesia,277534122,0.74 %,2032783,153,1811570,-49997,2.1,30.0,59 %,3.45 %
4,5,Pakistan,240485658,1.98 %,4660796,312,770880,-165988,3.3,21.0,35 %,2.99 %
...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,Montserrat,4386,-0.09 %,-4,44,100,0,1.6,44.0,11 %,0.00 %
230,231,Falkland Islands,3791,0.29 %,11,0,12170,0,1.6,40.0,62 %,0.00 %
231,232,Niue,1935,0.05 %,1,7,260,0,2.4,36.0,41 %,0.00 %
232,233,Tokelau,1893,1.18 %,22,189,10,0,2.6,27.0,0 %,0.00 %


In [106]:
# Column labels of the first parsed table
tables[0].columns

Index(['#', 'Country (or dependency)', 'Population  (2023)', 'Yearly  Change',
       'Net  Change', 'Density  (P/Km²)', 'Land Area  (Km²)',
       'Migrants  (net)', 'Fert.  Rate', 'Med.  Age', 'Urban  Pop %',
       'World  Share'],
      dtype='object')

In [107]:
# Destination of the scraped table
path = './webscrapped/pop-country.csv'

# to_csv() does not create missing parent folders, so make sure the target
# directory exists; otherwise the cell fails on a fresh checkout.
Path(path).parent.mkdir(parents=True, exist_ok=True)

tables[0].to_csv(path)

# Beautiful Soup Web Scraping

In [108]:
# Fetch the page with an HTTP GET request
url = 'https://www.worldometers.info/world-population/population-by-country/'

# Without a timeout, requests may wait forever on an unresponsive server.
# 30 seconds is an arbitrary but generous upper bound for a single page.
req = requests.get(url, timeout=30)

# <Response [200]> means success; any 4xx/5xx code means the request failed
print('----------------------------GET----------------------------------')
print(req)

# Stop here on an HTTP error instead of parsing an error page in later cells
req.raise_for_status()

----------------------------GET----------------------------------
<Response [200]>


In [109]:
# Build the BeautifulSoup tree from the raw response bytes,
# using Python's built-in HTML parser
html = bs(req.content, features='html.parser')

# Show the parsed document, indented one tag per line
print('----------------------------HTML---------------------------------')
print(html.prettify())


----------------------------HTML---------------------------------
<!DOCTYPE html>
<!--[if IE 8]> <html lang="en" class="ie8"> <![endif]-->
<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->
<!--[if !IE]><!-->
<html lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Population by Country (2023) - Worldometer
  </title>
  <meta content="List of countries and dependencies in the world ranked by population, from the most populated. Growth rate, median age, fertility rate, area, density, population density, urbanization, urban population, share of world population." name="description"/>
  <!-- Favicon -->
  <link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/favicon/apple-icon-60x60.png" rel="apple-touch-icon" si

In [110]:
# Collect every <table> element of the parsed page
tables = html.find_all('table')

print('\n\n----------------------------TABLES-------------------------------')
print('Type: ', type(tables))  # find_all returns a list-like bs4 ResultSet
print('Size: ', len(tables))   # number of tables found
print(tables[0].prettify())    # indented HTML of the first table



----------------------------TABLES-------------------------------
Type:  <class 'bs4.element.ResultSet'>
Size:  1
<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%">
 <thead>
  <tr>
   <th>
    #
   </th>
   <th>
    Country (or dependency)
   </th>
   <th>
    Population
    <br/>
    (2023)
   </th>
   <th>
    Yearly
    <br/>
    Change
   </th>
   <th>
    Net
    <br/>
    Change
   </th>
   <th>
    Density
    <br/>
    (P/Km²)
   </th>
   <th>
    Land Area
    <br/>
    (Km²)
   </th>
   <th>
    Migrants
    <br/>
    (net)
   </th>
   <th>
    Fert.
    <br/>
    Rate
   </th>
   <th>
    Med.
    <br/>
    Age
   </th>
   <th>
    Urban
    <br/>
    Pop %
   </th>
   <th>
    World
    <br/>
    Share
   </th>
  </tr>
 </thead>
 <tbody>
  <tr>
   <td>
    1
   </td>
   <td style="font-weight: bold; font-size:15px; text-align:left">
    <a href="/world-population/india-population/">
     India
    </a>
   </td>
   <td style="font

In [111]:
# pandas can parse the HTML of the table we just located.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions (it expects a URL, a path
# or a file-like object).
table = pd.read_html(StringIO(tables[0].prettify()))

# That's it! We have a DataFrame.
table[0].columns

Index(['#', 'Country (or dependency)', 'Population  (2023)', 'Yearly  Change',
       'Net  Change', 'Density  (P/Km²)', 'Land Area  (Km²)',
       'Migrants  (net)', 'Fert.  Rate', 'Med.  Age', 'Urban  Pop %',
       'World  Share'],
      dtype='object')

In [112]:
# Now let's walk the HTML ourselves: grab every <tr> row of the first table
all_lines = tables[0].find_all('tr')

# One empty accumulator list per table column (12 columns)
(number, country, pop2023, yearly, net, density,
 land, migrants, fert, med, urban, share) = ([] for _ in range(12))

In [113]:
# Print the concatenated text of each row, one row per output line
for line in all_lines:
    print(line.get_text())

 # Country (or dependency) Population (2023) Yearly Change Net Change Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age Urban Pop % World Share 
 1 India 1,428,627,663 0.81 % 11,454,490 481 2,973,190 -486,136 2.0 28 36 % 17.76 % 
 2 China 1,425,671,352 -0.02 % -215,985 152 9,388,211 -310,220 1.2 39 65 % 17.72 % 
 3 United States 339,996,563 0.50 % 1,706,706 37 9,147,420 999,700 1.7 38 83 % 4.23 % 
 4 Indonesia 277,534,122 0.74 % 2,032,783 153 1,811,570 -49,997 2.1 30 59 % 3.45 % 
 5 Pakistan 240,485,658 1.98 % 4,660,796 312 770,880 -165,988 3.3 21 35 % 2.99 % 
 6 Nigeria 223,804,632 2.41 % 5,263,420 246 910,770 -59,996 5.1 17 54 % 2.78 % 
 7 Brazil 216,422,446 0.52 % 1,108,948 26 8,358,140 6,000 1.6 34 88 % 2.69 % 
 8 Bangladesh 172,954,319 1.03 % 1,767,947 1,329 130,170 -309,977 1.9 27 41 % 2.15 % 
 9 Russia 144,444,359 -0.19 % -268,955 9 16,376,870 -136,414 1.5 39 75 % 1.80 % 
 10 Mexico 128,455,567 0.75 % 951,442 66 1,943,950 -50,239 1.8 30 88 % 1.60 % 
 11 Ethiopia

In [114]:
# Rebuild the table by hand from the <td> cells of every row.
#
# Everything is rebuilt from scratch each time this cell runs, so executing it
# twice no longer appends a second copy of every row to the column lists.
column_names = [
    '#',
    'Country (or dependency)',
    'Population  (2023)',
    'Yearly  Change',
    'Net  Change',
    'Density  (P/Km²)',
    'Land Area  (Km²)',
    'Migrants  (net)',
    'Fert.  Rate',
    'Med.  Age',
    'Urban  Pop %',
    'World  Share',  # two spaces, consistent with the other headers
]

rows = []
for line in all_lines:
    children = line.find_all('td')

    # The header row holds <th> cells only, so it has no <td> and is skipped
    if len(children) > 0:
        rows.append([children[i].text for i in range(len(column_names))])

# Transpose the rows into one list per column
if rows:
    columns = [list(values) for values in zip(*rows)]
else:
    columns = [[] for _ in column_names]

# Keep the per-column lists available under their original names
(number, country, pop2023, yearly, net, density,
 land, migrants, fert, med, urban, share) = columns

# Make a DataFrame
df = pd.DataFrame(dict(zip(column_names, columns)))

# Show the Table
df

Unnamed: 0,#,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,India,1428627663,0.81 %,11454490,481,2973190,-486136,2.0,28,36 %,17.76 %
1,2,China,1425671352,-0.02 %,-215985,152,9388211,-310220,1.2,39,65 %,17.72 %
2,3,United States,339996563,0.50 %,1706706,37,9147420,999700,1.7,38,83 %,4.23 %
3,4,Indonesia,277534122,0.74 %,2032783,153,1811570,-49997,2.1,30,59 %,3.45 %
4,5,Pakistan,240485658,1.98 %,4660796,312,770880,-165988,3.3,21,35 %,2.99 %
...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,Montserrat,4386,-0.09 %,-4,44,100,0,1.6,44,11 %,0.00 %
230,231,Falkland Islands,3791,0.29 %,11,0,12170,0,1.6,40,62 %,0.00 %
231,232,Niue,1935,0.05 %,1,7,260,0,2.4,36,41 %,0.00 %
232,233,Tokelau,1893,1.18 %,22,189,10,0,2.6,27,0 %,0.00 %
