# World Bank - Population Growth dataset

## ETL - Part 1

Load & cleanse the source dataset. As we want to present country-based values only, exclude any rows whose code is not a true ISO 3166 alpha-3 country code (e.g. regional aggregates such as EMU='Euro area', or economic groupings such as OED='OECD members'). Write out the extracted dataset to a new file.

In [27]:
# Dependencies
import pandas as pd

In [28]:
# Read the World Bank 'population growth' export into a dataframe.
# skiprows=4 skips the first four lines of the file so that parsing starts at the header row
# (presumably a metadata preamble in the export - confirm against the raw file).
pop_growth_df = pd.read_csv(
    filepath_or_buffer='source_data/API_SP.POP.GROW_DS2_en_csv_v2_3404396.csv',
    skiprows=4,
)
pop_growth_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,Aruba,ABW,Population growth (annual %),SP.POP.GROW,,2.179059,1.548572,1.389337,1.215721,1.032841,...,0.637959,0.590062,0.537296,0.494795,0.45197,0.134255,-0.045045,-0.086392,-0.157953,
1,Africa Eastern and Southern,AFE,Population growth (annual %),SP.POP.GROW,,2.66018,2.732633,2.753248,2.806915,2.840787,...,2.802586,2.728159,2.655672,2.688371,2.691134,2.678184,2.607472,2.543757,2.531587,
2,Afghanistan,AFG,Population growth (annual %),SP.POP.GROW,,1.925952,2.014879,2.078997,2.139651,2.216007,...,3.121341,2.581549,2.866492,2.885208,2.908529,3.134747,2.851358,2.534498,2.665628,
3,Africa Western and Central,AFW,Population growth (annual %),SP.POP.GROW,,2.115789,2.145723,2.190827,2.21136,2.242567,...,2.723317,2.713059,2.706266,2.669239,2.633982,2.615646,2.573377,2.539799,2.540864,
4,Angola,AGO,Population growth (annual %),SP.POP.GROW,,1.558355,1.460738,1.410425,1.301745,1.111041,...,3.617678,3.586211,3.550987,3.464457,3.395278,3.268348,3.16603,3.096753,3.030996,


In [29]:
# Record the row count straight after loading; it is used later to verify the filtering step
initial_num = pop_growth_df.shape[0]
print(initial_num)

266


In [30]:
# Drop unwanted columns
#
# The data file includes a comma at the end of each row, which pandas interprets as an extra, empty
# 'Unnamed: N' column. A trailing comma on each line of a CSV file is not part of the normal CSV-format
# definition (refer to https://www.rfc-editor.org/rfc/rfc4180), so this column can simply be dropped.
#
# The column is matched by its 'Unnamed:' prefix rather than by a hard-coded label, because N depends on
# how many year columns the file contains. This also makes the cell safe to re-run: once the column is
# gone the list below is empty and drop() leaves the dataframe unchanged instead of raising a KeyError.
#
unnamed_cols = [col for col in pop_growth_df.columns if str(col).startswith("Unnamed:")]
pop_growth_df = pop_growth_df.drop(columns=unnamed_cols)
pop_growth_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Aruba,ABW,Population growth (annual %),SP.POP.GROW,,2.179059,1.548572,1.389337,1.215721,1.032841,...,0.691615,0.637959,0.590062,0.537296,0.494795,0.451970,0.134255,-0.045045,-0.086392,-0.157953
1,Africa Eastern and Southern,AFE,Population growth (annual %),SP.POP.GROW,,2.660180,2.732633,2.753248,2.806915,2.840787,...,2.774990,2.802586,2.728159,2.655672,2.688371,2.691134,2.678184,2.607472,2.543757,2.531587
2,Afghanistan,AFG,Population growth (annual %),SP.POP.GROW,,1.925952,2.014879,2.078997,2.139651,2.216007,...,3.657576,3.121341,2.581549,2.866492,2.885208,2.908529,3.134747,2.851358,2.534498,2.665628
3,Africa Western and Central,AFW,Population growth (annual %),SP.POP.GROW,,2.115789,2.145723,2.190827,2.211360,2.242567,...,2.750731,2.723317,2.713059,2.706266,2.669239,2.633982,2.615646,2.573377,2.539799,2.540864
4,Angola,AGO,Population growth (annual %),SP.POP.GROW,,1.558355,1.460738,1.410425,1.301745,1.111041,...,3.684429,3.617678,3.586211,3.550987,3.464457,3.395278,3.268348,3.166030,3.096753,3.030996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,Population growth (annual %),SP.POP.GROW,,2.400980,2.378001,2.336873,2.280566,2.192574,...,-0.294474,-1.364932,-0.596734,0.753585,0.339011,-0.457730,0.070131,-0.229016,-1.010215,-0.664615
262,"Yemen, Rep.",YEM,Population growth (annual %),SP.POP.GROW,,1.862737,1.872291,1.839467,1.920052,2.046198,...,2.811072,2.712955,2.621537,2.564321,2.486360,2.426208,2.310447,2.137790,2.144628,2.210656
263,South Africa,ZAF,Population growth (annual %),SP.POP.GROW,,2.799492,2.978651,3.033440,3.061378,3.091501,...,1.576294,2.074017,0.972004,0.387278,1.225530,1.295074,1.223179,0.998920,0.841058,0.865465
264,Zambia,ZMB,Population growth (annual %),SP.POP.GROW,,3.156056,3.178563,3.196632,3.194441,3.201590,...,3.247118,3.191896,3.147407,3.113595,3.061888,3.007618,2.933818,2.840806,2.758032,2.720528


In [31]:
# NOTE regarding 'null' (missing) data values
#
# (1) The '1960' column is blank for every row of this 'population growth' data file, because the year
#     1960 is the reference point for comparison. It is therefore left out of the check below.
# (2) For some countries, data is not available for every other year up to 2023. The expression at the
#     end of this cell lists every row that has at least one missing value outside the '1960' column.
#
# NOTE that the NZL and SYC rows each have a single missing year value (other than 1960), so they are
# incomplete compared with the others.
#
# If any of these cases remain after cross-filtering against valid international country codes (see
# below), they will be addressed with the help of a data validation library in a subsequent ETL stage.
pop_growth_df.loc[pop_growth_df.drop(columns="1960").isna().any(axis="columns")]

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
110,Not classified,INX,Population growth (annual %),SP.POP.GROW,,,,,,,...,,,,,,,,,,
180,New Zealand,NZL,Population growth (annual %),SP.POP.GROW,,1.99944,2.542112,1.986583,2.094972,1.649506,...,1.661012,2.036034,2.246032,2.088723,1.79124,1.591159,2.204789,0.413665,0.115364,2.048368
196,West Bank and Gaza,PSE,Population growth (annual %),SP.POP.GROW,,,,,,,...,2.344077,2.29048,2.246105,1.988686,2.533011,2.511782,2.48655,2.457039,2.425538,2.393265
226,Seychelles,SYC,Population growth (annual %),SP.POP.GROW,,2.811425,2.652834,2.542225,2.505104,2.51127,...,1.555396,2.229795,1.337635,1.224034,0.954292,0.887925,0.853708,0.805183,,-0.087627


In [32]:
# List the column labels that remain now that the trailing 'Unnamed' column has been dropped
pop_growth_df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023'],
      dtype='object')

In [33]:
# Number of columns remaining in the dataframe
pop_growth_df.shape[1]

68

In [34]:
# Read the official ISO 3166 country-code reference table into a dataframe
country_codes_df = pd.read_csv(filepath_or_buffer='source_data/CountryCodes_ISO3166.csv')
country_codes_df.head()

Unnamed: 0,English short name,French short name,Alpha-2 code,Alpha-3 code,Numeric
0,Afghanistan,Afghanistan (l'),AF,AFG,4
1,Albania,Albanie (l'),AL,ALB,8
2,Algeria,Algérie (l'),DZ,DZA,12
3,American Samoa,Samoa américaines (les),AS,ASM,16
4,Andorra,Andorre (l'),AD,AND,20


In [35]:
# Find the 3-letter codes in the dataset that are NOT official ISO3166 country codes
#
# This is an "anti-join": keep every row of pop_growth_df whose 'Country Code' has no matching
# 'Alpha-3 code' in country_codes_df. The approach follows:
# https://www.statology.org/pandas-anti-join/
#
# Step 1 - outer join; indicator=True adds a '_merge' column recording which side each row came from
outer = pop_growth_df.merge(
    country_codes_df,
    how='outer',
    left_on='Country Code',
    right_on='Alpha-3 code',
    indicator=True,
)

# Step 2 - keep the rows present only in pop_growth_df, then discard the helper column
anti_join = outer.loc[outer['_merge'] == 'left_only'].drop(columns='_merge')

# Show the unmatched names and codes
anti_join.loc[:, ["Country Name", "Country Code"]]


Unnamed: 0,Country Name,Country Code
1,Africa Eastern and Southern,AFE
3,Africa Western and Central,AFW
9,Arab World,ARB
44,Central Europe and the Baltics,CEB
46,Channel Islands,CHI
58,Caribbean small states,CSS
71,East Asia & Pacific (excluding high income),EAP
72,Early-demographic dividend,EAR
73,East Asia & Pacific,EAS
74,Europe & Central Asia (excluding high income),ECA


In [36]:
# Collect the codes flagged by the anti-join; they serve as the exclusion list for the filtering step
non_country_codes = anti_join.loc[:, "Country Code"]
num_non_country_codes = non_country_codes.shape[0]
print(num_non_country_codes)

51


In [37]:
# Keep only the rows whose 'Country Code' is not in the exclusion list, so that the population growth
# dataset is left with ISO3166 country codes only
pop_growth_df = pop_growth_df.loc[
    lambda frame: ~frame["Country Code"].isin(non_country_codes)
]
pop_growth_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Aruba,ABW,Population growth (annual %),SP.POP.GROW,,2.179059,1.548572,1.389337,1.215721,1.032841,...,0.691615,0.637959,0.590062,0.537296,0.494795,0.45197,0.134255,-0.045045,-0.086392,-0.157953
2,Afghanistan,AFG,Population growth (annual %),SP.POP.GROW,,1.925952,2.014879,2.078997,2.139651,2.216007,...,3.657576,3.121341,2.581549,2.866492,2.885208,2.908529,3.134747,2.851358,2.534498,2.665628
4,Angola,AGO,Population growth (annual %),SP.POP.GROW,,1.558355,1.460738,1.410425,1.301745,1.111041,...,3.684429,3.617678,3.586211,3.550987,3.464457,3.395278,3.268348,3.16603,3.096753,3.030996
5,Albania,ALB,Population growth (annual %),SP.POP.GROW,,3.120855,3.056731,2.953749,2.880686,2.754021,...,-0.207047,-0.291206,-0.15988,-0.091972,-0.246732,-0.426007,-0.574207,-0.926918,-1.21579,-1.148418
6,Andorra,AND,Population growth (annual %),SP.POP.GROW,,7.868139,7.521207,7.223198,6.941512,6.653122,...,0.355275,0.174378,1.100603,1.772183,1.580147,1.757491,1.761891,1.702288,0.994607,0.330182


In [38]:
# Check number of data records after filtering
filtered_num = len(pop_growth_df)
print(filtered_num)

# Each row removed by the filter should correspond to one non-country code, so the row count must have
# fallen by exactly that number.
post_filter_check_OK = filtered_num == (initial_num - num_non_country_codes)
print('Filter check passed: ', post_filter_check_OK)

# Stop the notebook here if the check fails, so that a wrongly filtered dataset is never written out by
# the cell that follows.
assert post_filter_check_OK, (
    f"Expected {initial_num - num_non_country_codes} rows after filtering, found {filtered_num}"
)

215
Filter check passed:  True


In [39]:
# Persist the cleansed 'population growth' dataset as a UTF-8 CSV file, leaving out the dataframe index
pop_growth_df.to_csv(
    path_or_buf='./data/ETL_POP_GROW.csv',
    index=False,
    encoding='utf8',
)