# Population, surface area and density

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Options and Settings

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.autolayout'] = True
plt.rcParams['font.size'] = 12
path = os.getcwd()                                         # get current working directory
warnings.simplefilter('ignore')

# Import Data

In [3]:
# Location of the CSV, relative to the notebook's working directory.
filepath = os.path.join('datasets', 'Population, Surface Area and Density.csv')

# Read the file with an explicit encoding instead of relying on the default.
# At least one region name (Saint Barthelemy, spelled with an accented "e")
# came through with a replacement character when no encoding was given, which
# points at bytes that are not valid UTF-8 being replaced silently.
# Strict UTF-8 is tried first, so a valid UTF-8 file loads exactly as before;
# the single-byte fallbacks only apply when UTF-8 decoding fails.
# NOTE(review): cp1252 / latin-1 are assumptions about how the file was
# exported - confirm against the data source.
for encoding in ('utf-8', 'cp1252', 'latin-1'):
    try:
        df = pd.read_csv(filepath, encoding=encoding)
        break
    except UnicodeDecodeError:
        # Wrong guess; move on to the next candidate (latin-1 accepts any byte).
        continue

# Head and Tail

In [4]:
# Display the raw frame; pandas truncates the middle rows, so the first and last rows are shown.
df

Unnamed: 0,Region/Country/Area,Unnamed: 1,Year,Series,Value,Footnotes,Source,Unnamed: 7
0,1,"Total, all countries or areas",2010,Population mid-year estimates (millions),6956.82,,"United Nations Population Division, New York, ...",
1,1,"Total, all countries or areas",2010,Population mid-year estimates for males (milli...,3507.70,,"United Nations Population Division, New York, ...",
2,1,"Total, all countries or areas",2010,Population mid-year estimates for females (mil...,3449.12,,"United Nations Population Division, New York, ...",
3,1,"Total, all countries or areas",2010,Sex ratio (males per 100 females),101.70,,"United Nations Population Division, New York, ...",
4,1,"Total, all countries or areas",2010,Population aged 0 to 14 years old (percentage),27.00,,"United Nations Population Division, New York, ...",
...,...,...,...,...,...,...,...,...
7255,716,Zimbabwe,2021,Population mid-year estimates for females (mil...,7.89,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7256,716,Zimbabwe,2021,Sex ratio (males per 100 females),91.40,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7257,716,Zimbabwe,2021,Population aged 0 to 14 years old (percentage),41.30,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7258,716,Zimbabwe,2021,Population aged 60+ years old (percentage),4.70,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",


In [5]:
# (rows, columns) of the raw frame.
df.shape

(7260, 8)

In [6]:
# Per-column dtype and non-null count - shows which columns are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7260 entries, 0 to 7259
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Region/Country/Area  7260 non-null   int64  
 1   Unnamed: 1           7260 non-null   object 
 2   Year                 7260 non-null   int64  
 3   Series               7260 non-null   object 
 4   Value                7260 non-null   float64
 5   Footnotes            2282 non-null   object 
 6   Source               7260 non-null   object 
 7   Unnamed: 7           114 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 453.9+ KB


In [7]:
# Give the two identifier columns meaningful names: the integer column the CSV
# labels 'Region/Country/Area' becomes 'Code', and the unlabeled text column
# next to it ('Unnamed: 1') becomes 'RegionCountryArea'.
# rename() returns a new frame; rebinding `df` replaces the in-place mutation
# so the cell does not silently alter a frame that an earlier cell displayed.
df = df.rename(columns={
    'Region/Country/Area': 'Code',
    'Unnamed: 1': 'RegionCountryArea',
})

df

Unnamed: 0,Code,RegionCountryArea,Year,Series,Value,Footnotes,Source,Unnamed: 7
0,1,"Total, all countries or areas",2010,Population mid-year estimates (millions),6956.82,,"United Nations Population Division, New York, ...",
1,1,"Total, all countries or areas",2010,Population mid-year estimates for males (milli...,3507.70,,"United Nations Population Division, New York, ...",
2,1,"Total, all countries or areas",2010,Population mid-year estimates for females (mil...,3449.12,,"United Nations Population Division, New York, ...",
3,1,"Total, all countries or areas",2010,Sex ratio (males per 100 females),101.70,,"United Nations Population Division, New York, ...",
4,1,"Total, all countries or areas",2010,Population aged 0 to 14 years old (percentage),27.00,,"United Nations Population Division, New York, ...",
...,...,...,...,...,...,...,...,...
7255,716,Zimbabwe,2021,Population mid-year estimates for females (mil...,7.89,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7256,716,Zimbabwe,2021,Sex ratio (males per 100 females),91.40,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7257,716,Zimbabwe,2021,Population aged 0 to 14 years old (percentage),41.30,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",
7258,716,Zimbabwe,2021,Population aged 60+ years old (percentage),4.70,Projected estimate (medium fertility variant).,"United Nations Population Division, New York, ...",


In [8]:
# Columns carried forward for the analysis; the footnote, source and trailing
# unnamed columns are left behind.
analysis_columns = ['Code', 'RegionCountryArea', 'Year', 'Series', 'Value']

df1 = df[analysis_columns]
df1

Unnamed: 0,Code,RegionCountryArea,Year,Series,Value
0,1,"Total, all countries or areas",2010,Population mid-year estimates (millions),6956.82
1,1,"Total, all countries or areas",2010,Population mid-year estimates for males (milli...,3507.70
2,1,"Total, all countries or areas",2010,Population mid-year estimates for females (mil...,3449.12
3,1,"Total, all countries or areas",2010,Sex ratio (males per 100 females),101.70
4,1,"Total, all countries or areas",2010,Population aged 0 to 14 years old (percentage),27.00
...,...,...,...,...,...
7255,716,Zimbabwe,2021,Population mid-year estimates for females (mil...,7.89
7256,716,Zimbabwe,2021,Sex ratio (males per 100 females),91.40
7257,716,Zimbabwe,2021,Population aged 0 to 14 years old (percentage),41.30
7258,716,Zimbabwe,2021,Population aged 60+ years old (percentage),4.70


In [9]:
# (rows, columns) after dropping the unused columns.
df1.shape

(7260, 5)

In [10]:
# Per-column dtype and non-null count of the trimmed frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7260 entries, 0 to 7259
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Code               7260 non-null   int64  
 1   RegionCountryArea  7260 non-null   object 
 2   Year               7260 non-null   int64  
 3   Series             7260 non-null   object 
 4   Value              7260 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 283.7+ KB


In [11]:
# Summary of the text (object-dtype) columns only: count, number of unique values, most frequent value and its frequency.
df1.describe(include='object')

Unnamed: 0,RegionCountryArea,Series
count,7260,7260
unique,266,8
top,"Total, all countries or areas",Population mid-year estimates (millions)
freq,30,1058


In [12]:
# Number of rows per region/country/area name, most frequent first.
df1['RegionCountryArea'].value_counts()

Total, all countries or areas    30
Guinea                           30
Maldives                         30
Mali                             30
Malta                            30
                                 ..
American Samoa                   10
Dominica                         10
Saint Martin (French part)        6
Bonaire, St. Eustatius & Saba     6
Saint Barth�lemy                  6
Name: RegionCountryArea, Length: 266, dtype: int64

In [13]:
# Number of rows per indicator (Series label), most frequent first.
df1['Series'].value_counts()

Population mid-year estimates (millions)                1058
Population density                                      1058
Population mid-year estimates for males (millions)       925
Population mid-year estimates for females (millions)     925
Sex ratio (males per 100 females)                        925
Population aged 0 to 14 years old (percentage)           925
Population aged 60+ years old (percentage)               925
Surface area (thousand km2)                              519
Name: Series, dtype: int64

In [14]:
# Number of rows per numeric area code, most frequent first.
df1['Code'].value_counts()

1      30
324    30
462    30
466    30
470    30
       ..
16     10
212    10
663     6
535     6
652     6
Name: Code, Length: 266, dtype: int64

In [15]:
# Number of rows per reporting year, most frequent first.
df1['Year'].value_counts()

2015    1951
2019    1893
2010    1692
2021    1692
2017      32
Name: Year, dtype: int64

In [16]:
# Descriptive statistics of Value across all rows.
# NOTE(review): Value mixes several indicators with different units (millions, percentages, ratios, ...), so these pooled statistics are only a rough sanity check.
df1['Value'].describe()

count      7260.000000
mean        276.482474
std        2762.778277
min           0.000000
25%           5.295000
50%          22.800000
75%          96.000000
max      136162.000000
Name: Value, dtype: float64

# Missing Values Exploration

In [17]:
# Count of missing values in each column.
df1.isna().sum()

Code                 0
RegionCountryArea    0
Year                 0
Series               0
Value                0
dtype: int64

# Grouping and Aggregation

Filter By Region

In [18]:
# Names of the aggregate (non-country) rows to keep, grouped by continent.
# The list contains both continents and their sub-regions (e.g. 'Africa' and
# 'Northern Africa'), so the selected rows presumably overlap - avoid summing
# values across the whole selection.
# NOTE(review): the Oceania group lists 'Melanesia' but not Micronesia or
# Polynesia - confirm whether that omission is intentional.
regions = [
    # Africa
    'Africa', 'Northern Africa', 'Sub-Saharan Africa', 'Eastern Africa',
    'Middle Africa', 'Southern Africa', 'Western Africa',
    # Americas
    'Americas', 'Northern America', 'Latin America & the Caribbean',
    'Caribbean', 'Central America', 'South America',
    # Asia
    'Asia', 'Central Asia', 'Eastern Asia', 'South-central Asia',
    'South-eastern Asia', 'Southern Asia', 'Western Asia',
    # Europe
    'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe',
    # Oceania
    'Oceania', 'Australia and New Zealand', 'Melanesia',
]

# Keep only the rows whose name is one of the regions above. The explicit
# .copy() makes the result an independent frame, so later modifications of it
# are unambiguous and do not raise pandas' chained-assignment warning.
df_reg_filt = df1[df1['RegionCountryArea'].isin(regions)].copy()
df_reg_filt

Unnamed: 0,Code,RegionCountryArea,Year,Series,Value
30,2,Africa,2010,Population mid-year estimates (millions),1039.30
31,2,Africa,2010,Population mid-year estimates for males (milli...,518.25
32,2,Africa,2010,Population mid-year estimates for females (mil...,521.05
33,2,Africa,2010,Sex ratio (males per 100 females),99.50
34,2,Africa,2010,Population aged 0 to 14 years old (percentage),41.50
...,...,...,...,...,...
865,54,Melanesia,2021,Population mid-year estimates for females (mil...,5.55
866,54,Melanesia,2021,Sex ratio (males per 100 females),104.00
867,54,Melanesia,2021,Population aged 0 to 14 years old (percentage),34.40
868,54,Melanesia,2021,Population aged 60+ years old (percentage),6.60


In [19]:
# Every remaining row is a region, so shorten the column name accordingly.
# rename() returns a new frame; rebinding the name avoids an in-place change on
# a frame that was produced by boolean filtering, and re-running the cell stays
# harmless because rename() ignores labels that are no longer present.
df_reg_filt = df_reg_filt.rename(columns={'RegionCountryArea': 'Region'})

df_reg_filt

Unnamed: 0,Code,Region,Year,Series,Value
30,2,Africa,2010,Population mid-year estimates (millions),1039.30
31,2,Africa,2010,Population mid-year estimates for males (milli...,518.25
32,2,Africa,2010,Population mid-year estimates for females (mil...,521.05
33,2,Africa,2010,Sex ratio (males per 100 females),99.50
34,2,Africa,2010,Population aged 0 to 14 years old (percentage),41.50
...,...,...,...,...,...
865,54,Melanesia,2021,Population mid-year estimates for females (mil...,5.55
866,54,Melanesia,2021,Sex ratio (males per 100 females),104.00
867,54,Melanesia,2021,Population aged 0 to 14 years old (percentage),34.40
868,54,Melanesia,2021,Population aged 60+ years old (percentage),6.60


Filter Population in Millions

In [20]:
# Series labels whose values are population counts expressed in millions
# (total, males, females).
millions = [
    'Population mid-year estimates (millions)',
    'Population mid-year estimates for males (millions)',
    'Population mid-year estimates for females (millions)',
]

# Regional rows that belong to one of those series.
df_reg_filt_mill = df_reg_filt.loc[df_reg_filt['Series'].isin(millions)]
df_reg_filt_mill

Unnamed: 0,Code,Region,Year,Series,Value
30,2,Africa,2010,Population mid-year estimates (millions),1039.30
31,2,Africa,2010,Population mid-year estimates for males (milli...,518.25
32,2,Africa,2010,Population mid-year estimates for females (mil...,521.05
37,2,Africa,2015,Population mid-year estimates (millions),1182.44
38,2,Africa,2015,Population mid-year estimates for males (milli...,590.28
...,...,...,...,...,...
856,54,Melanesia,2019,Population mid-year estimates for males (milli...,5.57
857,54,Melanesia,2019,Population mid-year estimates for females (mil...,5.35
863,54,Melanesia,2021,Population mid-year estimates (millions),11.33
864,54,Melanesia,2021,Population mid-year estimates for males (milli...,5.78


Filter Sex ratio

In [21]:
# Series label for the sex-ratio indicator (males per 100 females).
sex_ratio = [
    'Sex ratio (males per 100 females)',
]

# Regional rows for that indicator only.
df_reg_filt_sr = df_reg_filt.loc[df_reg_filt['Series'].isin(sex_ratio)]
df_reg_filt_sr

Unnamed: 0,Code,Region,Year,Series,Value
33,2,Africa,2010,Sex ratio (males per 100 females),99.5
40,2,Africa,2015,Sex ratio (males per 100 females),99.7
48,2,Africa,2019,Sex ratio (males per 100 females),99.8
56,2,Africa,2021,Sex ratio (males per 100 females),99.9
63,15,Northern Africa,2010,Sex ratio (males per 100 females),100.8
...,...,...,...,...,...
836,53,Australia and New Zealand,2021,Sex ratio (males per 100 females),98.8
843,54,Melanesia,2010,Sex ratio (males per 100 females),103.6
850,54,Melanesia,2015,Sex ratio (males per 100 females),103.9
858,54,Melanesia,2019,Sex ratio (males per 100 females),104.0


Filter Population in Percentage

In [22]:
# Series labels for the two age-share indicators, both expressed as percentages.
pop_pct = [
    'Population aged 0 to 14 years old (percentage)',
    'Population aged 60+ years old (percentage)',
]

# Regional rows that belong to either age-share series.
df_reg_filt_pct = df_reg_filt.loc[df_reg_filt['Series'].isin(pop_pct)]
df_reg_filt_pct

Unnamed: 0,Code,Region,Year,Series,Value
34,2,Africa,2010,Population aged 0 to 14 years old (percentage),41.5
35,2,Africa,2010,Population aged 60+ years old (percentage),5.1
41,2,Africa,2015,Population aged 0 to 14 years old (percentage),41.1
42,2,Africa,2015,Population aged 60+ years old (percentage),5.3
49,2,Africa,2019,Population aged 0 to 14 years old (percentage),40.6
...,...,...,...,...,...
852,54,Melanesia,2015,Population aged 60+ years old (percentage),5.9
859,54,Melanesia,2019,Population aged 0 to 14 years old (percentage),35.0
860,54,Melanesia,2019,Population aged 60+ years old (percentage),6.4
867,54,Melanesia,2021,Population aged 0 to 14 years old (percentage),34.4


Filter Population Density

In [23]:
# Series label for the population-density indicator.
pop_dens = [
    'Population density',
]

# Regional rows for that indicator only.
df_reg_filt_popdens = df_reg_filt.loc[df_reg_filt['Series'].isin(pop_dens)]
df_reg_filt_popdens

Unnamed: 0,Code,Region,Year,Series,Value
36,2,Africa,2010,Population density,35.1
43,2,Africa,2015,Population density,39.9
51,2,Africa,2019,Population density,44.1
59,2,Africa,2021,Population density,46.3
66,15,Northern Africa,2010,Population density,26.1
...,...,...,...,...,...
839,53,Australia and New Zealand,2021,Population density,3.9
846,54,Melanesia,2010,Population density,17.3
853,54,Melanesia,2015,Population density,19.1
861,54,Melanesia,2019,Population density,20.6


Filter Surface area in thousand km2

In [24]:
# Series label for the surface-area indicator (thousand km2).
surf_area = [
    'Surface area (thousand km2)',
]

# Regional rows for that indicator only; preview the first five.
df_reg_filt_surfarea = df_reg_filt.loc[df_reg_filt['Series'].isin(surf_area)]
df_reg_filt_surfarea.head()

Unnamed: 0,Code,Region,Year,Series,Value
44,2,Africa,2015,Surface area (thousand km2),30311.0
52,2,Africa,2019,Surface area (thousand km2),29648.0
74,15,Northern Africa,2015,Surface area (thousand km2),7880.0
82,15,Northern Africa,2019,Surface area (thousand km2),7769.0
104,202,Sub-Saharan Africa,2015,Surface area (thousand km2),22431.0
