In [1]:
import pandas as pd
import numpy as np

from constants import POPULATION_DIMENSION_CSV

In [2]:
df = pd.read_csv('../raw_data/population.csv')
df.replace("..", np.nan, inplace=True)

In [3]:
columns = df.columns.tolist()
# fix column years format
for i in range(4, len(columns)):
	columns[i] = columns[i][:4]
df.columns = columns
df.head(3)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,Canada,CAN,82.6,82.7,82.9,83.0,83.3,83.5,83.6,83.8,83.9,83.9,84.0,84.0,84.0,84.2,84.2,
1,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,United States,USA,80.1,80.3,80.6,80.6,80.9,81.0,81.1,81.2,81.2,81.3,81.2,81.1,81.1,81.2,81.4,
2,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,Mexico,MEX,77.999,78.035,78.039,78.026,78.004,77.979,77.952,77.922,77.89,77.86,77.837,77.826,77.827,77.844,77.879,


In [4]:
# Replace missing value with the mean value for each country and attribute 
df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T
df.head(3)

  df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T


Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,Canada,CAN,82.6,82.7,82.9,83.0,83.3,83.5,83.6,83.8,83.9,83.9,84.0,84.0,84.0,84.2,84.2,83.573333
1,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,United States,USA,80.1,80.3,80.6,80.6,80.9,81.0,81.1,81.2,81.2,81.3,81.2,81.1,81.1,81.2,81.4,80.953333
2,"Life expectancy at birth, female (years)",SP.DYN.LE00.FE.IN,Mexico,MEX,77.999,78.035,78.039,78.026,78.004,77.979,77.952,77.922,77.89,77.86,77.837,77.826,77.827,77.844,77.879,77.927933


In [5]:
attributes_dic = {
    "life_expectancy_male": "SP.DYN.LE00.MA.IN",
    "life_expectancy_female": "SP.DYN.LE00.FE.IN",
    "birth_rate": "SP.DYN.CBRT.IN",
    "death_rate": "SP.DYN.CDRT.IN",
    "growth_rate": "SP.POP.GROW",
    "fertility_rate": "SP.DYN.TFRT.IN",
    "total_population": "SP.POP.TOTL",
    "urban_population": "SP.URB.TOTL",
    "rural_population": "SP.RUR.TOTL",
    "population_ages_0_to_14": "SP.POP.0014.TO",
    "population_ages_15_to_64": "SP.POP.1564.TO",
    "population_ages_65_and_above": "SP.POP.65UP.TO",
}

In [6]:
def transpose_attributes(dataframe, attribute):
	dataframe.drop(columns=["Series Name", "Series Code", "Country Name"], inplace=True)
	columns = dataframe.columns.tolist()
	return dataframe.melt(id_vars="Country Code", value_vars=columns[1:], var_name="Year", value_name=attribute)

In [7]:
# Transpose Attributes
attributes_df_dic = {}
for key, value in attributes_dic.items(): 
	filt = (df["Series Code"] == value)
	attribute_df = df.loc[filt].copy()
	attributes_df_dic[key] = transpose_attributes(attribute_df, key)

In [8]:
# Merge attributes together
attributes_lst_dic = list(attributes_df_dic.values())
population_df = attributes_lst_dic[0] 
for i in range(1, len(attributes_lst_dic)):
	population_df = pd.merge(population_df, attributes_lst_dic[i], how="left", left_on=["Country Code", "Year"], right_on=["Country Code", "Year"])
population_df.rename_axis("population_key", inplace=True)
population_df.head(18)

Unnamed: 0_level_0,Country Code,Year,life_expectancy_male,life_expectancy_female,birth_rate,death_rate,growth_rate,fertility_rate,total_population,urban_population,rural_population,population_ages_0_to_14,population_ages_15_to_64,population_ages_65_and_above
population_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,CAN,2005,77.9,82.6,10.6,7.1,0.944467,1.5434,32243753.0,25834340.0,6409413.0,5699388.0,22314773.0,4229591.0
1,USA,2005,75.0,80.1,14.0,8.3,0.921713,2.057,295516599.0,236200507.0,59316092.0,61707218.0,197426230.0,36383151.0
2,MEX,2005,72.575,77.999,21.741,4.792,1.415817,2.495,106005199.0,80890447.0,25114752.0,33749096.0,66293339.0,5962764.0
3,IND,2005,63.689,65.357,24.087,8.074,1.579709,2.972,1147609924.0,335503761.0,812106163.0,375719593.0,717517454.0,54372877.0
4,IDN,2005,65.579,69.107,21.765,7.203,1.336305,2.514,226289468.0,103961907.0,122327561.0,67618156.0,147807686.0,10863626.0
5,IRN,2005,70.438,73.502,17.401,5.041,1.169424,1.825,69762345.0,47130045.0,22632300.0,18761902.0,47677758.0,3322685.0
6,EGY,2005,67.138,71.825,24.849,6.371,1.805717,3.019,75523576.0,32495529.0,43028047.0,25560699.0,46265950.0,3696926.0
7,KEN,2005,52.819,56.667,38.366,10.544,2.739246,4.843,36624897.0,7938446.0,28686451.0,16124833.0,19737943.0,762121.0
8,NGA,2005,47.504,49.028,42.537,16.267,2.585689,5.985,138865014.0,54260116.0,84604898.0,60644266.0,74412505.0,3808243.0
9,CAN,2006,78.1,82.7,10.9,7.0,1.010335,1.5862,32571174.0,26126316.0,6444858.0,5667703.0,22581706.0,4321765.0


In [9]:
# Output Population Dimension
population_df.to_csv(POPULATION_DIMENSION_CSV)