# Import Library

In [1]:
#import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# URL
URL = 'https://www.worldometers.info/world-population/world-population-by-year/'

In [3]:
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
html_thead = soup.find_all('thead')[-1]
html_th = [tr for tr in html_thead.find_all('tr')]

headings = []

for tr in html_th:
    th = tr.find_all(['th'])
    row = [i.text.strip() for i in th]
    headings.append(row)
    
headings

[['Year',
  'World Population',
  'YearlyChange',
  'NetChange',
  'Density(P/Km²)',
  'UrbanPop',
  'UrbanPop %']]

In [4]:
html_tbody = soup.find_all('tbody')[-1]
html_text = [tr for tr in html_tbody.find_all('tr')]

content = []

for tr in html_text:
    th = tr.find_all(['th','td'])
    #td = tr.find_all()
    row = [i.text for i in th]
    content.append(row)

# content

In [5]:
data = pd.DataFrame(content[:], columns=headings[0])
data.head()

Unnamed: 0,Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %
0,2020,7794798739,1.05 %,81330639,52,4378993944,56 %
1,2019,7713468100,1.08 %,82377060,52,4299438618,56 %
2,2018,7631091040,1.10 %,83232115,51,4219817318,55 %
3,2017,7547858925,1.12 %,83836876,51,4140188594,55 %
4,2016,7464022049,1.14 %,84224910,50,4060652683,54 %


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Year              94 non-null     object
 1   World Population  94 non-null     object
 2   YearlyChange      94 non-null     object
 3   NetChange         94 non-null     object
 4   Density(P/Km²)    94 non-null     object
 5   UrbanPop          94 non-null     object
 6   UrbanPop %        94 non-null     object
dtypes: object(7)
memory usage: 5.3+ KB


In [7]:
data.columns

Index(['Year', 'World Population', 'YearlyChange', 'NetChange',
       'Density(P/Km²)', 'UrbanPop', 'UrbanPop %'],
      dtype='object')

In [8]:
data = data.rename(columns={"World Population": "WorldPopulation"})
data = data.rename(columns={"UrbanPop %": "UrbanPopPer"})


In [9]:
data.head()

Unnamed: 0,Year,WorldPopulation,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPopPer
0,2020,7794798739,1.05 %,81330639,52,4378993944,56 %
1,2019,7713468100,1.08 %,82377060,52,4299438618,56 %
2,2018,7631091040,1.10 %,83232115,51,4219817318,55 %
3,2017,7547858925,1.12 %,83836876,51,4140188594,55 %
4,2016,7464022049,1.14 %,84224910,50,4060652683,54 %


In [10]:
data.tail(10)

Unnamed: 0,Year,WorldPopulation,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPopPer
84,700,210000000,,,,,
85,600,200000000,,,,,
86,200,190000000,,,,,
87,-200,150000000,,,,,
88,-500,100000000,,,,,
89,-1000,50000000,,,,,
90,-2000,27000000,,,,,
91,-3000,14000000,,,,,
92,-4000,7000000,,,,,
93,-5000,5000000,,,,,


# Data Cleaning
As we can see that from index 70 to 93 we have only population data. Thus I am removing all rows from 70 to 93.

In [11]:
population = data.loc[0:69]

In [12]:
population.head()

Unnamed: 0,Year,WorldPopulation,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPopPer
0,2020,7794798739,1.05 %,81330639,52,4378993944,56 %
1,2019,7713468100,1.08 %,82377060,52,4299438618,56 %
2,2018,7631091040,1.10 %,83232115,51,4219817318,55 %
3,2017,7547858925,1.12 %,83836876,51,4140188594,55 %
4,2016,7464022049,1.14 %,84224910,50,4060652683,54 %


Now we have data with out null value. However, our data needs to be cleand as data have comma(,) and percentage(%) sign in it. Lets remove all this signs.

In [13]:
# replace thousand format to integer
population['WorldPopulation'] = data.WorldPopulation.apply(lambda x: x.replace(',',''))
population['UrbanPop'] = data.UrbanPop.apply(lambda x: x.replace(',',''))
population['NetChange'] = data.NetChange.apply(lambda x: x.replace(',',''))

# replace % sign
population['YearlyChange'] = data.YearlyChange.apply(lambda x: x.replace('%',''))
population['UrbanPopPer'] = data.UrbanPopPer.apply(lambda x: x.replace('%',''))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population['WorldPopulation'] = data.WorldPopulation.apply(lambda x: x.replace(',',''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population['UrbanPop'] = data.UrbanPop.apply(lambda x: x.replace(',',''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population['NetChange'] = data.NetChange.ap

In [14]:
population.head()

Unnamed: 0,Year,WorldPopulation,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPopPer
0,2020,7794798739,1.05,81330639,52,4378993944,56
1,2019,7713468100,1.08,82377060,52,4299438618,56
2,2018,7631091040,1.1,83232115,51,4219817318,55
3,2017,7547858925,1.12,83836876,51,4140188594,55
4,2016,7464022049,1.14,84224910,50,4060652683,54


In [15]:
data.to_csv('WorldPopulation.csv', index=False)