# Import Libraries
Every project starts by importing the libraries it depends on.
We're going to import commonly used web-scraping and data-analysis libraries. Make sure all of them are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis

# URL
Web scraping starts from the URL of the page whose data we want to extract.

In [2]:
# page that lists every country together with a link to its detail page
URL = 'https://www.worldometers.info/world-population/population-by-country/'

# fetch the page; the timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() fails fast on an HTTP error instead of
# letting an error page be parsed as if it held the country table
page = requests.get(URL, timeout=30)
page.raise_for_status()

# parse the HTML so the tables can be searched by tag
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# collect, for every country row, the country name and the link to its page
# the rows are read from the last table body on the page
html_tbody = soup.find_all('tbody')[-1]

urls = []
countries = []

# loop through the rows of the table body
for tr in html_tbody.find_all('tr'):
    # all header/data cells of the row
    cells = tr.find_all(['th', 'td'])
    # the second cell holds the country name wrapped in a link; skip rows that
    # have no such cell or no link instead of raising, so urls and countries
    # always stay the same length and index-aligned
    link = cells[1].find('a') if len(cells) > 1 else None
    if link is None or not link.get('href'):
        continue
    urls.append(link['href'])
    # remove surrounding white space from the country name
    countries.append(cells[1].get_text().strip())

In [4]:
# function to scrape values from each url
def url_scrraper(url, country, timeout=30):
    """Scrape the last table of a page into the shared result lists.

    Downloads ``url`` and reads the last ``<thead>`` and the last ``<tbody>``
    found in the document. Every row gets an extra second value -- the literal
    ``'Country'`` for header rows, ``country`` for body rows -- and only rows
    that then hold exactly four values are kept.

    Nothing is returned. Results are written to two module-level lists, which
    must already exist when the function is called:

    * ``headings`` -- replaced by the most recent four-value header row.
    * ``content``  -- extended with every four-value body row.

    A page without a ``<thead>`` or ``<tbody>`` contributes nothing instead of
    raising ``IndexError``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    country : str
        Value inserted as the second entry of every body row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    page = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx rather than parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # the table head may contain the headings (column names)
    theads = soup.find_all('thead')
    if theads:
        for tr in theads[-1].find_all('tr'):
            # text of every header cell, white space removed
            row = [cell.text.strip() for cell in tr.find_all('th')]
            row.insert(1, 'Country')
            if len(row) == 4:
                # keep a single header row: overwrite whatever was stored
                # (slice assignment mutates the shared list in place)
                headings[:] = [row]

    tbodies = soup.find_all('tbody')
    if not tbodies:
        return

    # loop through the rows of the last table body
    for tr in tbodies[-1].find_all('tr'):
        # text of every header/data cell, white space removed
        row = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        row.insert(1, country)
        # rows of any other width are ignored
        if len(row) == 4:
            content.append(row)

In [5]:
base_url = 'https://www.worldometers.info'
content, headings = [], []

# visit every country page in listing order; url_scrraper appends its rows to
# content and stores the header row in headings
for position, country_name in enumerate(countries):
    href = urls[position]
    # an href stored as a list contributes its first element
    path = href[0] if type(href) == list else href
    url_scrraper(base_url + path, country_name)

In [6]:
# build the DataFrame: one row per scraped record, columns named by the header row
data = pd.DataFrame(data=list(content), columns=headings[0])

# Data Analysis

## Look at Example Records

In [7]:
# preview the first rows to confirm the scrape produced the expected columns
data.head()

Unnamed: 0,#,Country,CITY NAME,POPULATION
0,1,China,Shanghai,22315474
1,2,China,Beijing,11716620
2,3,China,Tianjin,11090314
3,4,China,Guangzhou,11071424
4,5,China,Shenzhen,10358381


## Descriptive Statistics

In [8]:
# summarise every column. All four columns are still strings (object dtype) at
# this point, so the summary shows count, unique, top and freq rather than
# numeric statistics such as mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,#,Country,CITY NAME,POPULATION
count,6906,6906,6906,6906
unique,70,198,6789,6577
top,1,Madagascar,Victoria,30000
freq,198,70,4,14


## Summary of data-type, columns, non-null values, memory usage.

In [9]:
# print the index range, column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6906 entries, 0 to 6905
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           6906 non-null   object
 1   Country     6906 non-null   object
 2   CITY NAME   6906 non-null   object
 3   POPULATION  6906 non-null   object
dtypes: object(4)
memory usage: 215.9+ KB


## Column labels

In [10]:
# list the column labels of the data
data.columns

Index(['#', 'Country', 'CITY NAME', 'POPULATION'], dtype='object')

# Data Cleaning

## Rename Column Name

In [11]:
# give the upper-case scraped headers title-case names
data = data.rename(
    {'CITY NAME': 'City Name', 'POPULATION': 'Population'},
    axis='columns',
)

## Remove unwanted symbols (thousands-separator commas) and drop the `#` column

In [12]:
# remove the thousands-separator commas from the population strings and drop
# the '#' row-number column.
# - regex=False treats the comma as a literal on every pandas version (the
#   default for this argument differs between pandas 1.x and 2.x)
# - errors='ignore' lets the cell be re-run after '#' has already been dropped
# Note: Population is still stored as text after this step.
data = (
    data
    .assign(Population=data['Population'].str.replace(',', '', regex=False))
    .drop(columns='#', errors='ignore')
)

# Save Data into CSV

In [13]:
# write the table to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file
data.to_csv('World Major Cities Population.csv', index=False)