# Import Libraries
Importing libraries are necessary for any project.
We're going to import commonly used Webscrapping and Data Analysis libraries. Make sure all libraries are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis
import numpy as np

# URL
Web scraping starts from the URL of the page that holds the data.

In [2]:
# link of the web page to scrape
URL = 'https://www.worldometers.info/world-population/population-by-country/'

# get web data; the timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(URL, timeout=30)
# fail fast on HTTP errors (4xx/5xx) instead of parsing an error page as if it were data
page.raise_for_status()

# parse web data
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# find the table
# our targeted table is the last one on the page

# getting the table body
html_tbody = soup.find_all('tbody')[-1]

# getting all the rows in table body
html_text = html_tbody.find_all('tr')

urls = []
countries = []

# loop through table body
for tr in html_text:
    # getting all th, td cells of the row
    th = tr.find_all(['th','td'])
    # the second cell is read as the country name and is expected to contain
    # the link to that country's page; rows that are too short are skipped
    if len(th) < 2:
        continue
    link = th[1].find('a')
    # skip rows without a usable link instead of failing on None['href']
    if link is None or not link.get('href'):
        continue
    country = th[1].get_text()
    url = link['href']
    # urls and countries are appended together so they stay index-aligned
    urls.append(url)
    countries.append(country)

In [4]:
# function to scrape values from each url
def url_scrraper(url,country):
    """Scrape the first table of a country page into the shared result lists.

    The function returns nothing. It appends to two module-level lists that
    must exist before it is called:

    * ``headings`` - receives one list of column names per header row.
    * ``content``  - receives one list of cell values per body row.

    Parameters
    ----------
    url : str
        Absolute URL of the country page to download.
    country : str
        Country name inserted as the second value of every scraped row.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no ``<thead>`` or ``<tbody>`` element.
    """
    # Rows and headers are padded until they have this many columns.
    expected_columns = 14

    # the timeout keeps one stalled request from blocking the whole scraping loop
    page = requests.get(url, timeout=30)
    # stop on HTTP errors rather than parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # getting the table head because it may contain headings (column names)
    html_thead = soup.find_all('thead')[0]

    # loop through the rows of the table head
    for tr in html_thead.find_all('tr'):
        # storing all th values in the row and removing white space
        row = [cell.text.strip() for cell in tr.find_all(['th'])]
        row.insert(1, 'Country')
        # append headings
        headings.append(row)
        # some pages lack three columns; add their names so every header has the same layout
        if len(headings[0]) != expected_columns:
            headings[0].insert(5, 'Migrants (net)')
            headings[0].insert(6, 'Median Age')
            headings[0].insert(7, 'Fertility Rate')

    # getting the table body
    html_tbody = soup.find_all('tbody')[0]

    # loop through the rows of the table body
    for tr in html_tbody.find_all('tr'):
        # storing all th, td values in the row and removing white space
        row = [cell.text.strip() for cell in tr.find_all(['th','td'])]
        row.insert(1, country)
        # fill the three missing columns with NaN so the row lines up with the padded header
        if len(row) != expected_columns:
            row.insert(5, np.nan)
            row.insert(6, np.nan)
            row.insert(7, np.nan)
        # append content
        content.append(row)

In [5]:
base_url = 'https://www.worldometers.info'
content = []

# urls and countries are index-aligned, so walk them in lockstep
for href, country in zip(urls, countries):
    # headings is reset for every country, so after the loop it holds only
    # the header row(s) scraped from the last country's page
    headings = []
    # hrefs are normally plain strings; take the first entry if a list was stored
    if isinstance(href, list):
        url = base_url + href[0]
    else:
        url = base_url + href
    url_scrraper(url, country)

In [6]:
# build the DataFrame from the scraped rows, using the scraped header row as column labels
data = pd.DataFrame(content, columns=headings[0])

# Data Analysis

## Let's first look at the data

In [7]:
# display the frame; pandas shortens the rendered output of long frames
data

Unnamed: 0,Year,Country,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,Holy SeeGlobal Rank
0,2020,China,1439323776,0.39 %,5540090,-348399,38.4,1.69,153,60.8 %,875075919,18.47 %,7794798739,1
1,2019,China,1433783686,0.43 %,6135900,-348399,37.0,1.65,153,59.7 %,856409297,18.59 %,7713468100,1
2,2018,China,1427647786,0.47 %,6625995,-348399,37.0,1.65,152,58.6 %,837022095,18.71 %,7631091040,1
3,2017,China,1421021791,0.49 %,6972440,-348399,37.0,1.65,151,57.5 %,816957613,18.83 %,7547858925,1
4,2016,China,1414049351,0.51 %,7201481,-348399,37.0,1.65,151,56.3 %,796289491,18.94 %,7464022049,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4190,1975,Holy See,728,2.48 %,17,,,,1820,N.A.,N.A.,0 %,4079480606,235
4191,1970,Holy See,644,-5.49 %,-42,,,,1610,N.A.,N.A.,0 %,3700437046,235
4192,1965,Holy See,854,-1.18 %,-10,,,,2135,N.A.,N.A.,0 %,3339583597,235
4193,1960,Holy See,906,-0.04 %,0,,,,2265,N.A.,N.A.,0 %,3034949748,235


## Look at Example Records

In [8]:
# preview the first rows of the data (head() returns 5 rows by default)
data.head()

Unnamed: 0,Year,Country,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,Holy SeeGlobal Rank
0,2020,China,1439323776,0.39 %,5540090,-348399,38.4,1.69,153,60.8 %,875075919,18.47 %,7794798739,1
1,2019,China,1433783686,0.43 %,6135900,-348399,37.0,1.65,153,59.7 %,856409297,18.59 %,7713468100,1
2,2018,China,1427647786,0.47 %,6625995,-348399,37.0,1.65,152,58.6 %,837022095,18.71 %,7631091040,1
3,2017,China,1421021791,0.49 %,6972440,-348399,37.0,1.65,151,57.5 %,816957613,18.83 %,7547858925,1
4,2016,China,1414049351,0.51 %,7201481,-348399,37.0,1.65,151,56.3 %,796289491,18.94 %,7464022049,1


## Descriptive Statistics

In [9]:
# Generate descriptive statistics of the data.
# Every column is still stored as object (string) dtype at this point, so describe()
# reports count, unique, top and freq rather than mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Year,Country,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,Holy SeeGlobal Rank
count,4195,4195,4195,4195,4195,3600,3600.0,3600.0,4195,4195,4195,4195,4195,4195
unique,18,235,4192,740,3956,2253,311.0,643.0,723,973,3985,291,18,235
top,2005,Madagascar,801,0.73 %,0,0,17.6,1.72,4,N.A.,N.A.,0.00 %,3700437046,39
freq,235,18,2,24,6,152,48.0,25.0,87,113,113,546,235,18


## Summary of data-type, columns, non-null values, memory usage.

In [10]:
# print a concise summary: index range, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4195 entries, 0 to 4194
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Year                          4195 non-null   object
 1   Country                       4195 non-null   object
 2   Population                    4195 non-null   object
 3   Yearly %  Change              4195 non-null   object
 4   Yearly Change                 4195 non-null   object
 5   Migrants (net)                3600 non-null   object
 6   Median Age                    3600 non-null   object
 7   Fertility Rate                3600 non-null   object
 8   Density (P/Km²)               4195 non-null   object
 9   Urban Pop %                   4195 non-null   object
 10  Urban Population              4195 non-null   object
 11  Country's Share of World Pop  4195 non-null   object
 12  World Population              4195 non-null   object
 13  Holy SeeGlobal Ran

## Column labels

In [11]:
# get the column labels of the data; note the exact spellings
# (e.g. the double space in 'Yearly %  Change'), which matter when renaming
data.columns

Index(['Year', 'Country', 'Population', 'Yearly %  Change', 'Yearly Change',
       'Migrants (net)', 'Median Age', 'Fertility Rate', 'Density (P/Km²)',
       'Urban Pop %', 'Urban Population', 'Country's Share of World Pop',
       'World Population', 'Holy SeeGlobal Rank'],
      dtype='object')

# Data Cleaning

## Rename Column Name

Rename the `Holy SeeGlobal Rank` column to `Country Global Rank` and `Country's Share of World Pop` to `Country's Share of World Pop %`, and normalize the double space in `Yearly %  Change` to `Yearly % Change`.

In [12]:
# rename column name
# The scraped rank header carries a country-name prefix ('Holy SeeGlobal Rank' in this
# run, presumably from the last page scraped). Match on the suffix so the rename still
# works if a different country ends up supplying the header.
rank_columns = {col: 'Country Global Rank'
                for col in data.columns
                if col.endswith('Global Rank')}
data = data.rename(columns={**rank_columns,
                            'Country\'s Share of World Pop':'Country\'s Share of World Pop %',
                            'Yearly %  Change':'Yearly % Change'})

## Remove unwanted symbols (such as `%` and thousands-separator commas)

In [13]:
# remove % from columns
# regex=False makes the match literal on every pandas version; strip() drops the
# space that preceded the sign (e.g. '0.39 %' -> '0.39')
percent_columns = ["Yearly % Change", 'Urban Pop %', 'Country\'s Share of World Pop %']
for col in percent_columns:
    data[col] = data[col].str.replace('%', '', regex=False).str.strip()

# remove ',' from columns
comma_columns = ['Population', 'Yearly Change', 'Migrants (net)',
                 'Urban Population', 'World Population', 'Density (P/Km²)']
for col in comma_columns:
    data[col] = data[col].str.replace(',', '', regex=False)

# replace N.A. values to np.nan values
for col in ['Urban Pop %', 'Urban Population']:
    data[col] = data[col].replace('N.A.', np.nan)

# Save Data into CSV

In [14]:
# save the data to a CSV file; index=False leaves the row index out of the file
data.to_csv('PopulationByCountryHistory.csv', index=False)