In [1]:
#import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Source page: the Worldometers "population by country" listing that the cells below download and parse
URL = 'https://www.worldometers.info/world-population/population-by-country/'

In [3]:
# Download the page and parse it. The table of interest is taken to be the
# last <thead> in the document.
# The timeout (seconds) keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() reports HTTP errors (4xx/5xx) right here
# instead of letting an error page fail later with an opaque IndexError on
# the `[-1]` lookup.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
html_thead = soup.find_all('thead')[-1]
html_th = list(html_thead.find_all('tr'))

# One list of whitespace-stripped header texts per <tr> of the header;
# later cells use headings[0] as the column names.
headings = [
    [cell.text.strip() for cell in tr.find_all('th')]
    for tr in html_th
]

headings

[['#',
  'Country (or dependency)',
  'Population (2020)',
  'Yearly Change',
  'Net Change',
  'Density (P/Km²)',
  'Land Area (Km²)',
  'Migrants (net)',
  'Fert. Rate',
  'Med. Age',
  'Urban Pop %',
  'World Share']]

In [4]:
# Take the last <tbody> of the parsed page and collect the raw text of every
# <th>/<td> cell, one list per table row.
html_tbody = soup.find_all('tbody')[-1]
html_text = list(html_tbody.find_all('tr'))

content = []
for tr in html_text:
    cells = tr.find_all(['th', 'td'])
    content.append([cell.text for cell in cells])

# `content` is now a list of rows, each row a list of cell strings.

In [5]:
# Build the frame from the scraped rows, using the first (only) header row as column names.
data = pd.DataFrame(content, columns=headings[0])
data.head()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %


In [6]:
# List the scraped column names; they contain spaces, punctuation and units, and are renamed in the next cell.
data.columns

Index(['#', 'Country (or dependency)', 'Population (2020)', 'Yearly Change',
       'Net Change', 'Density (P/Km²)', 'Land Area (Km²)', 'Migrants (net)',
       'Fert. Rate', 'Med. Age', 'Urban Pop %', 'World Share'],
      dtype='object')

In [7]:
# The scraped headers contain spaces, punctuation and units, so map each one to
# a short identifier-style name; this allows attribute access such as data.Population.
column_renames = {
    "#": "Rank",
    "Country (or dependency)": "Country",
    "Population (2020)": "Population",
    "Yearly Change": "YearlyChange",
    "Net Change": "NetChange",
    "Density (P/Km²)": "Density",
    "Land Area (Km²)": "LandArea",
    "Migrants (net)": "Migrants",
    "Fert. Rate": "FertilityRate",
    "Med. Age": "MedAge",
    "Urban Pop %": "UrbanPopPer",
    "World Share": "WorldShare",
}
data = data.rename(columns=column_renames)

In [8]:
# Preview the first rows to confirm the new column names took effect.
data.head()

Unnamed: 0,Rank,Country,Population,YearlyChange,NetChange,Density,LandArea,Migrants,FertilityRate,MedAge,UrbanPopPer,WorldShare
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %


In [9]:
# Inspect the last 10 rows; in the output shown below these rows contain 'N.A.' markers and blank Migrants cells.
data.tail(10)

Unnamed: 0,Rank,Country,Population,YearlyChange,NetChange,Density,LandArea,Migrants,FertilityRate,MedAge,UrbanPopPer,WorldShare
225,226,Wallis & Futuna,11239,-1.69 %,-193,80,140,,N.A.,N.A.,0 %,0.00 %
226,227,Nauru,10824,0.63 %,68,541,20,,N.A.,N.A.,N.A.,0.00 %
227,228,Saint Barthelemy,9877,0.30 %,30,470,21,,N.A.,N.A.,0 %,0.00 %
228,229,Saint Helena,6077,0.30 %,18,16,390,,N.A.,N.A.,27 %,0.00 %
229,230,Saint Pierre & Miquelon,5794,-0.48 %,-28,25,230,,N.A.,N.A.,100 %,0.00 %
230,231,Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
231,232,Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
232,233,Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
233,234,Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %
234,235,Holy See,801,0.25 %,2,2003,0,,N.A.,N.A.,N.A.,0.00 %


In [26]:
# Clean the scraped text columns so they can later be converted to numbers.
# Each step is a literal substring replacement done with the vectorised
# `.str.replace(..., regex=False)`:
#   * regex=False keeps ',' '%' and the dots in 'N.A.' literal, whatever the
#     default of the installed pandas version is;
#   * unlike `.apply(lambda x: x.replace(...))`, a missing cell (None/NaN)
#     is passed through instead of raising AttributeError.

# remove thousands separators
for col in ['Population', 'NetChange', 'Density', 'LandArea', 'Migrants']:
    data[col] = data[col].str.replace(',', '', regex=False)

# replace the "not available" marker with zero
# NOTE(review): after this step an unknown fertility rate / median age / urban
# share cannot be told apart from a genuine 0, which will skew any statistics;
# consider using NaN instead if the downstream cells can handle it.
for col in ['FertilityRate', 'MedAge', 'UrbanPopPer']:
    data[col] = data[col].str.replace('N.A.', '0', regex=False)

# remove the % sign (any whitespace in front of it is left untouched)
for col in ['YearlyChange', 'UrbanPopPer', 'WorldShare']:
    data[col] = data[col].str.replace('%', '', regex=False)

In [27]:
# Preview the first rows after cleaning: separators and % signs should be gone.
data.head()

Unnamed: 0,Rank,Country,Population,YearlyChange,NetChange,Density,LandArea,Migrants,FertilityRate,MedAge,UrbanPopPer,WorldShare
0,1,China,1439323776,0.39,5540090,153,9388211,-348399,1.7,38,61,18.47
1,2,India,1380004385,0.99,13586631,464,2973190,-532687,2.2,28,35,17.7
2,3,United States,331002651,0.59,1937734,36,9147420,954806,1.8,38,83,4.25
3,4,Indonesia,273523615,1.07,2898047,151,1811570,-98955,2.3,30,56,3.51
4,5,Pakistan,220892340,2.0,4327022,287,770880,-233379,3.6,23,35,2.83


In [28]:
# Count missing values per column.
# NOTE(review): the cleaning cell above replaced every 'N.A.' with '0', so those
# originally-missing entries cannot show up here. Blank cells (e.g. Migrants in
# the tail() output) are presumably empty strings rather than NaN and would not
# be counted either — confirm before trusting an all-zero result.
data.isna().sum()

Rank             0
Country          0
Population       0
YearlyChange     0
NetChange        0
Density          0
LandArea         0
Migrants         0
FertilityRate    0
MedAge           0
UrbanPopPer      0
WorldShare       0
dtype: int64