# World Bank - Population Growth dataset

## ETL - Part 5 - Prepare datasets for loading into database

Consolidate the datasets from the prior ETL steps, ready for loading into the database.
Write out the cleansed datasets into new files corresponding to each database table.

In [39]:
import pandas as pd

### Finalise Population Growth dataset for the first database table

In [40]:
# Read the two cleansed CSV files written by the previous ETL steps:
# the per-country metadata and the population growth figures.
population_growth_metadata_df = pd.read_csv(
    filepath_or_buffer="data/Cleansed_Metadata_Country_POP_GROW.csv"
)

population_growth_data_df = pd.read_csv(
    filepath_or_buffer="data/Cleansed_POP_GROW.csv"
)

In [41]:
# Consolidate the Population Growth data & metadata into a single dataset.
# The join is an inner join on 'Country Code' (pandas' default, made explicit
# here), so only country codes present in both frames are kept.
# validate='one_to_one' makes pandas raise a MergeError if either frame repeats
# a 'Country Code', rather than silently emitting duplicated rows that would
# then be written out for the database load.
population_growth_df = population_growth_metadata_df.merge(
    population_growth_data_df,
    on='Country Code',
    how='inner',
    validate='one_to_one',
)
population_growth_df.head()

Unnamed: 0,Country Code,Region,IncomeGroup,Country Name,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,Latin America & Caribbean,High income,Aruba,(Not Specified),2.17905904113205,1.54857174393805,1.3893370634193,1.21572057526871,1.03284091886278,...,0.691615,0.637959,0.590062,0.537296,0.494795,0.45197,0.134255,-0.045045,-0.0863922826549927,-0.157953
1,AFG,South Asia,Low income,Afghanistan,(Not Specified),1.92595161110872,2.01487886339461,2.07899662655224,2.13965084372647,2.21600692082533,...,3.657576,3.121341,2.581549,2.866492,2.885208,2.908529,3.134747,2.851358,2.53449831666733,2.665628
2,AGO,Sub-Saharan Africa,Lower middle income,Angola,(Not Specified),1.5583550408936,1.46073837045336,1.41042530862807,1.3017451764171,1.11104064143605,...,3.684429,3.617678,3.586211,3.550987,3.464457,3.395278,3.268348,3.16603,3.09675267067326,3.030996
3,ALB,Europe & Central Asia,Upper middle income,Albania,(Not Specified),3.12085537059054,3.05673050279088,2.95374876199632,2.88068642624326,2.75402123804227,...,-0.207047,-0.291206,-0.15988,-0.091972,-0.246732,-0.426007,-0.574207,-0.926918,-1.21579032012532,-1.148418
4,AND,Europe & Central Asia,High income,Andorra,(Not Specified),7.86813924493059,7.52120721100516,7.22319755892096,6.94151152967971,6.65312152542815,...,0.355275,0.174378,1.100603,1.772183,1.580147,1.757491,1.761891,1.702288,0.994607149162366,0.330182


In [42]:
# Use 'CountryCode' (no space) as the key column name so it matches the
# column name used by the second database table.
population_growth_df = population_growth_df.rename(
    {'Country Code': 'CountryCode'},
    axis='columns',
)

In [43]:
# Save the consolidated Population Growth dataset as a CSV for the database import.
# index=False leaves the DataFrame's row index out of the file.
population_growth_df.to_csv(
    path_or_buf='./data_for_db/PopulationGrowth.csv',
    index=False,
    encoding='utf8',
)

### Add Continent / Country information for the second database table

In [44]:
# Read the countries-by-continent CSV from the source data folder
countries_by_continent_df = pd.read_csv(
    filepath_or_buffer="source_data/countries-by-continent-2024.csv"
)

In [45]:
# Remove the 'Country' column: country names are already carried by the main
# population growth dataset (as 'Country Name').
# errors='ignore' means no exception is raised when the column is absent,
# e.g. if this cell is re-run after the column has already been dropped.
countries_by_continent_df = countries_by_continent_df.drop(
    columns=["Country"],
    errors="ignore",
)
countries_by_continent_df.head()

Unnamed: 0,CountryCode,Continent
0,AFG,Asia
1,ALB,Europe
2,DZA,Africa
3,ASM,Oceania
4,AND,Europe


In [46]:
# Save the Country/Continent dataset as a CSV for the database import.
# index=False leaves the DataFrame's row index out of the file.
countries_by_continent_df.to_csv(
    path_or_buf='./data_for_db/CountryContinent.csv',
    index=False,
    encoding='utf8',
)