In [1]:
import pandas as pd
import numpy as np
import os

# Print every float inside a NumPy array with exactly four decimal places.
np.set_printoptions(formatter={"float": "{:0.4f}".format})

In [2]:
# Country-level measures. The indicator code -> name lookup (wb_i) is taken
# before the name columns are dropped from the working frame.
wb_country_raw = pd.read_csv(os.path.join("staging_data", "wb_data.country.csv"))
wb_i = wb_country_raw.loc[:, ["Indicator Code", "Indicator Name"]].drop_duplicates()
wb_data_c = wb_country_raw.drop(["Country Name", "Indicator Name"], axis="columns")

# Region-level measures, identified by code only.
wb_data_r = pd.read_csv(os.path.join("staging_data", "wb_data.region.csv")).drop(
    ["Region Name", "Indicator Name"], axis="columns"
)

# Country and region reference tables; "Region" is renamed so both share "Region Name".
wb_c = pd.read_csv(os.path.join("staging_data", "wb_countries.csv")).rename(
    columns={"Region": "Region Name"}
)
wb_r = pd.read_csv(os.path.join("staging_data", "wb_regions.csv"))

# Per-indicator imputation strategy (the "Type" column is read by impute_value).
imp_strategy = pd.read_csv(
    os.path.join("staging_data", "indicator_imputation_strategy.csv")
).drop(["Indicator Name"], axis="columns")

In [3]:
# Preview the first rows of the country-level data after the name columns were dropped.
wb_data_c.head()

Unnamed: 0,Country Code,Indicator Code,Year,Measure
0,AFG,EG.ELC.ACCS.ZS,2000,4.4
1,AFG,AG.LND.IRIG.AG.ZS,2000,
2,AFG,AG.LND.AGRI.ZS,2000,57.945817
3,AFG,AG.LND.AGRI.K2,2000,377940.0
4,AFG,NV.AGR.TOTL.ZS,2000,


In [4]:
# Column dtypes and non-null counts, used to see which columns contain missing values.
wb_data_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200508 entries, 0 to 200507
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Code    200508 non-null  object 
 1   Indicator Code  200508 non-null  object 
 2   Year            200508 non-null  int64  
 3   Measure         137303 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


In [5]:
# Share of rows whose Measure is missing, before any imputation.
# The value is computed outside the f-string: reusing double quotes inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
# Series.sum() also avoids iterating the column with the built-in sum().
missing_pct = wb_data_c["Measure"].isna().sum() * 100 / wb_data_c.shape[0]
print(f"Percentage of Missing data in Measure - {missing_pct:.2f}%")

Percentage of Missing data in Measure - 31.52%


If a Measure value is missing, we fill it using the following rules, applied in order of precedence:

- Use the mean (or median, depending on the indicator's imputation strategy) of the Measure across all years for that country and indicator (only if data is available for 16 or more years between 2000 and 2020).
- Use the Measure for that indicator and year from the corresponding region estimate (only if the value exists in the region estimates).
- Use the mean of the Measure for that indicator and year across all countries belonging to the same region.
- Use the mean (or median, depending on the indicator's imputation strategy) of the Measure across all years for the corresponding region and indicator (only if data is available for 16 or more years between 2000 and 2020 in that region).

In [6]:
# Impute on a copy so the raw frame stays intact; impute_value reads wb_data_c
# again when it averages the observed values of a region's countries.
result_df = wb_data_c.copy()

In [7]:
# Country code -> region code lookup, built by joining the country table to the
# region table on their shared "Region Name" column (left join keeps every country).
country_region_map = pd.merge(wb_c, wb_r, how="left", on="Region Name").loc[
    :, ["Country Code", "Region Code"]
]
country_region_map

Unnamed: 0,Country Code,Region Code
0,ABW,LCN
1,AFG,SAS
2,AGO,SSF
3,ALB,ECS
4,AND,ECS
...,...,...
212,XKX,ECS
213,YEM,MEA
214,ZAF,SSF
215,ZMB,SSF


In [8]:
# Step 1: per (country, indicator) summary of Measure - number of non-null
# values plus their mean and median. The 16+ filter is applied in the next cell.
country_means = (
    wb_data_c.groupby(["Country Code", "Indicator Code"])["Measure"]
    .agg(Count="count", Mean="mean", Median="median")
    .reset_index()
)
country_means

Unnamed: 0,Country Code,Indicator Code,Count,Mean,Median
0,ABW,AG.CON.FERT.ZS,11,9.017727,2.485000
1,ABW,AG.LND.AGRI.K2,21,20.000000,20.000000
2,ABW,AG.LND.AGRI.ZS,21,11.111111,11.111111
3,ABW,AG.LND.ARBL.HA.PC,21,0.020201,0.019932
4,ABW,AG.LND.ARBL.ZS,21,11.111111,11.111111
...,...,...,...,...,...
9543,ZWE,SN.ITK.SVFI.ZS,6,34.283333,34.850000
9544,ZWE,SP.DYN.IMRT.IN,21,47.580952,50.500000
9545,ZWE,SP.POP.GROW,21,1.384441,1.253650
9546,ZWE,SP.RUR.TOTL.ZS,21,66.772143,66.804000


In [9]:
# Keep only country/indicator pairs with at least 16 non-null measures.
has_enough_country_data = country_means["Count"].ge(16)
valid_country_means = country_means.loc[has_enough_country_data].copy()
valid_country_means

Unnamed: 0,Country Code,Indicator Code,Count,Mean,Median
1,ABW,AG.LND.AGRI.K2,21,20.000000,20.000000
2,ABW,AG.LND.AGRI.ZS,21,11.111111,11.111111
3,ABW,AG.LND.ARBL.HA.PC,21,0.020201,0.019932
4,ABW,AG.LND.ARBL.ZS,21,11.111111,11.111111
12,ABW,EG.ELC.ACCS.ZS,21,99.290476,100.000000
...,...,...,...,...,...
9541,ZWE,SN.ITK.DEFC.ZS,20,31.325000,30.800000
9544,ZWE,SP.DYN.IMRT.IN,21,47.580952,50.500000
9545,ZWE,SP.POP.GROW,21,1.384441,1.253650
9546,ZWE,SP.RUR.TOTL.ZS,21,66.772143,66.804000


In [10]:
# Step 2: country-level rows with each country's region code attached
# (left join, so rows without a mapped region are kept).
region_lookup = country_region_map[["Country Code", "Region Code"]]
region_mapping = pd.merge(result_df, region_lookup, how="left", on="Country Code")
region_mapping

Unnamed: 0,Country Code,Indicator Code,Year,Measure,Region Code
0,AFG,EG.ELC.ACCS.ZS,2000,4.400000,SAS
1,AFG,AG.LND.IRIG.AG.ZS,2000,,SAS
2,AFG,AG.LND.AGRI.ZS,2000,57.945817,SAS
3,AFG,AG.LND.AGRI.K2,2000,377940.000000,SAS
4,AFG,NV.AGR.TOTL.ZS,2000,,SAS
...,...,...,...,...,...
200503,ZWE,ER.H2O.INTR.PC,2020,782.403403,SSF
200504,ZWE,SP.RUR.TOTL.ZS,2020,67.758000,SSF
200505,ZWE,SE.PRM.NENR,2020,,SSF
200506,ZWE,SL.UEM.TOTL.ZS,2020,8.621000,SSF


In [11]:
# Independent copy of the region estimates, restricted to the lookup columns.
region_year_values = wb_data_r.loc[
    :, ["Region Code", "Indicator Code", "Year", "Measure"]
].copy()

In [12]:
# Step 3: Add region information to main dataframe for regional calculations
# Attach each row's region code. A "Region Code" column left behind by an
# earlier run of this cell is dropped first; without that, re-running the cell
# makes merge emit "Region Code_x"/"Region Code_y" and later cells break.
result_df = result_df.drop(columns=["Region Code"], errors="ignore").merge(
    country_region_map[["Country Code", "Region Code"]], on="Country Code", how="left"
)
result_df

Unnamed: 0,Country Code,Indicator Code,Year,Measure,Region Code
0,AFG,EG.ELC.ACCS.ZS,2000,4.400000,SAS
1,AFG,AG.LND.IRIG.AG.ZS,2000,,SAS
2,AFG,AG.LND.AGRI.ZS,2000,57.945817,SAS
3,AFG,AG.LND.AGRI.K2,2000,377940.000000,SAS
4,AFG,NV.AGR.TOTL.ZS,2000,,SAS
...,...,...,...,...,...
200503,ZWE,ER.H2O.INTR.PC,2020,782.403403,SSF
200504,ZWE,SP.RUR.TOTL.ZS,2020,67.758000,SSF
200505,ZWE,SE.PRM.NENR,2020,,SSF
200506,ZWE,SL.UEM.TOTL.ZS,2020,8.621000,SSF


In [13]:
# Step 4: per (region, indicator) summary of the region estimates - number of
# non-null values plus their mean and median. The 16+ filter follows in the next cell.
region_means = (
    wb_data_r.groupby(["Region Code", "Indicator Code"])["Measure"]
    .agg(Count="count", Mean="mean", Median="median")
    .reset_index()
)
region_means

Unnamed: 0,Region Code,Indicator Code,Count,Mean,Median
0,AFE,AG.CON.FERT.ZS,21,1.917958e+01,1.854804e+01
1,AFE,AG.LND.AGRI.K2,21,6.708790e+06,6.717045e+06
2,AFE,AG.LND.AGRI.ZS,21,4.549313e+01,4.562913e+01
3,AFE,AG.LND.ARBL.HA.PC,21,2.083160e-01,2.129696e-01
4,AFE,AG.LND.ARBL.ZS,21,7.350962e+00,7.432404e+00
...,...,...,...,...,...
2107,WLD,SN.ITK.SVFI.ZS,5,9.100000e+00,8.900000e+00
2108,WLD,SP.DYN.IMRT.IN,21,3.870952e+01,3.730000e+01
2109,WLD,SP.POP.GROW,21,1.220462e+00,1.237784e+00
2110,WLD,SP.RUR.TOTL.ZS,21,4.854659e+01,4.842379e+01


In [14]:
# Keep only region/indicator pairs with at least 16 non-null measures.
has_enough_region_data = region_means["Count"].ge(16)
valid_region_means = region_means.loc[has_enough_region_data].copy()
valid_region_means

Unnamed: 0,Region Code,Indicator Code,Count,Mean,Median
0,AFE,AG.CON.FERT.ZS,21,1.917958e+01,1.854804e+01
1,AFE,AG.LND.AGRI.K2,21,6.708790e+06,6.717045e+06
2,AFE,AG.LND.AGRI.ZS,21,4.549313e+01,4.562913e+01
3,AFE,AG.LND.ARBL.HA.PC,21,2.083160e-01,2.129696e-01
4,AFE,AG.LND.ARBL.ZS,21,7.350962e+00,7.432404e+00
...,...,...,...,...,...
2105,WLD,SN.ITK.DEFC.ZS,20,9.645000e+00,8.750000e+00
2108,WLD,SP.DYN.IMRT.IN,21,3.870952e+01,3.730000e+01
2109,WLD,SP.POP.GROW,21,1.220462e+00,1.237784e+00
2110,WLD,SP.RUR.TOTL.ZS,21,4.854659e+01,4.842379e+01


In [15]:
# Function to apply imputation rules
def impute_value(row, show_progress=True):
    """Return the value to store in ``Measure`` for one row.

    Rows that already hold a value are returned unchanged. A missing value is
    filled by the first rule that produces a result, but only for indicators
    whose strategy in ``imp_strategy`` is ``"mean"`` or ``"median"``:

    1. the country's mean/median for the indicator (``valid_country_means``);
    2. the region estimate for the same indicator and year (``region_year_values``);
    3. the mean of the observed values of the region's countries for the same
       indicator and year (``wb_data_c``), each country counted once;
    4. the region's mean/median for the indicator (``valid_region_means``).

    Parameters
    ----------
    row : pandas.Series
        A row with "Country Code", "Indicator Code", "Year" and "Measure".
    show_progress : bool, default True
        Print ``row.name`` for every row that goes through the rules (the
        original behaviour; requires an integer ``row.name``). Pass ``False``
        to keep the output quiet.

    Returns
    -------
    The original or imputed measure. The missing value is returned as-is when
    no rule applies, when the indicator has no mean/median strategy, or when
    the country has no region mapping (these lookups no longer raise IndexError).
    """
    measure = row["Measure"]
    # Observed values are never overwritten, so skip every lookup for them.
    if not pd.isna(measure):
        return measure

    indicator_code = row["Indicator Code"]
    country_code = row["Country Code"]
    year = row["Year"]

    strategy_values = imp_strategy.loc[
        imp_strategy["Indicator Code"] == indicator_code, "Type"
    ].values
    # Indicators without a configured mean/median strategy stay missing.
    if len(strategy_values) == 0 or strategy_values[0] not in ("mean", "median"):
        return measure
    stat_column = "Mean" if strategy_values[0] == "mean" else "Median"

    if show_progress:
        if row.name % 10 == 0:
            print()
        print(row.name, end=" ")

    # Rule 1: Country mean/median if 16+ years available
    country_stat = valid_country_means.loc[
        (valid_country_means["Country Code"] == country_code)
        & (valid_country_means["Indicator Code"] == indicator_code),
        stat_column,
    ].values
    if len(country_stat) > 0:
        return country_stat[0]

    # The remaining rules need the country's region; without one nothing applies.
    region_codes = country_region_map.loc[
        country_region_map["Country Code"] == country_code, "Region Code"
    ].values
    if len(region_codes) == 0:
        return measure
    region_code = region_codes[0]

    # Rule 2: Region value for same year if available
    region_value = region_year_values.loc[
        (region_year_values["Region Code"] == region_code)
        & (region_year_values["Indicator Code"] == indicator_code)
        & (region_year_values["Year"] == year),
        "Measure",
    ].values
    if len(region_value) > 0 and not pd.isna(region_value[0]):
        return region_value[0]

    # Rule 3: Mean of Measure from all countries in the same region for that year.
    # Membership is tested with isin() rather than merging a frame for every row.
    region_members = country_region_map.loc[
        country_region_map["Region Code"] == region_code, "Country Code"
    ]
    region_countries_mean = wb_data_c.loc[
        (wb_data_c["Year"] == year)
        & (wb_data_c["Indicator Code"] == indicator_code)
        & wb_data_c["Country Code"].isin(region_members),
        "Measure",
    ].mean()
    if not pd.isna(region_countries_mean):
        return region_countries_mean

    # Rule 4: Region mean/median if 16+ years available
    region_stat = valid_region_means.loc[
        (valid_region_means["Region Code"] == region_code)
        & (valid_region_means["Indicator Code"] == indicator_code),
        stat_column,
    ].values
    if len(region_stat) > 0:
        return region_stat[0]

    return measure

In [16]:
# Row-wise imputation: every row's Measure becomes whatever impute_value returns.
imputed_measure = result_df.apply(impute_value, axis="columns")
result_df["Measure"] = imputed_measure

1 4 5 11 13 16 19 
20 21 29 
30 34 35 36 37 38 41 45 55 57 64 65 73 74 78 79 
80 81 89 99 101 108 109 117 118 123 132 133 136 137 
140 141 142 143 144 145 146 147 148 149 
150 151 152 153 154 155 156 159 161 162 163 166 167 168 169 
170 171 173 174 175 177 181 184 185 186 187 188 189 
190 191 193 196 197 198 205 206 
210 211 212 213 214 217 218 219 221 225 231 233 236 241 
250 254 255 256 257 261 265 269 275 277 278 279 284 285 289 
290 293 294 298 299 
300 301 302 305 306 307 309 319 321 329 338 342 343 344 345 353 356 363 365 372 373 381 382 387 393 397 401 404 405 406 407 408 409 
410 411 413 416 418 419 
420 423 425 426 427 
430 431 432 433 434 435 438 439 441 451 453 
460 461 469 
470 474 475 476 477 485 495 497 505 514 518 519 
520 521 529 539 541 548 549 557 558 563 573 583 585 592 593 601 602 606 607 608 609 613 617 
620 621 625 626 627 629 636 637 645 646 
650 651 652 653 654 661 671 673 681 
690 695 701 705 715 717 724 725 733 734 738 739 
740 741 742 749 759 761 769 782 783 

In [17]:
# Re-check the non-null counts now that the imputation pass has run.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200508 entries, 0 to 200507
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Code    200508 non-null  object 
 1   Indicator Code  200508 non-null  object 
 2   Year            200508 non-null  int64  
 3   Measure         185351 non-null  float64
 4   Region Code     200508 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 7.6+ MB


In [18]:
# Share of rows whose Measure is still missing after imputation.
# Numerator and denominator both come from result_df, so the figure stays
# correct even if result_df's row count ever differs from wb_data_c's.
# Computing it outside the f-string also avoids nesting double quotes inside a
# replacement field, which is a SyntaxError on Python versions before 3.12.
missing_pct_after = result_df["Measure"].isna().sum() * 100 / result_df.shape[0]
print(f"Percentage of Missing data in Measure - {missing_pct_after:.2f}%")

Percentage of Missing data in Measure - 7.56%


In [19]:
# Swap the helper region column for human-readable country and indicator names,
# then put the columns in their final order.
result_df = (
    result_df.drop(["Region Code"], axis="columns")
    .merge(wb_c[["Country Code", "Country Name"]], how="left", on="Country Code")
    .merge(wb_i, how="left", on="Indicator Code")
    .loc[
        :,
        [
            "Country Code",
            "Country Name",
            "Indicator Code",
            "Indicator Name",
            "Year",
            "Measure",
        ],
    ]
)
result_df

Unnamed: 0,Country Code,Country Name,Indicator Code,Indicator Name,Year,Measure
0,AFG,Afghanistan,EG.ELC.ACCS.ZS,Access to electricity (% of population),2000,4.400000
1,AFG,Afghanistan,AG.LND.IRIG.AG.ZS,Agricultural irrigated land (% of total agricu...,2000,5.726721
2,AFG,Afghanistan,AG.LND.AGRI.ZS,Agricultural land (% of land area),2000,57.945817
3,AFG,Afghanistan,AG.LND.AGRI.K2,Agricultural land (sq. km),2000,377940.000000
4,AFG,Afghanistan,NV.AGR.TOTL.ZS,"Agriculture, forestry, and fishing, value adde...",2000,26.210069
...,...,...,...,...,...,...
200503,ZWE,Zimbabwe,ER.H2O.INTR.PC,Renewable internal freshwater resources per ca...,2020,782.403403
200504,ZWE,Zimbabwe,SP.RUR.TOTL.ZS,Rural population (% of total population),2020,67.758000
200505,ZWE,Zimbabwe,SE.PRM.NENR,"School enrollment, primary (% net)",2020,
200506,ZWE,Zimbabwe,SL.UEM.TOTL.ZS,"Unemployment, total (% of total labor force) (...",2020,8.621000


In [20]:
# Country attributes (region name, income group) joined onto the imputed data.
df_countries = (
    pd.read_csv(os.path.join("staging_data", "wb_countries.csv"))
    .rename(columns={"Region": "Region Name"})
    .loc[:, ["Country Code", "Country Name", "Region Name", "Income Group"]]
)
wb_data = (
    result_df.merge(df_countries, on=["Country Code", "Country Name"], how="left")
    .loc[
        :,
        [
            "Country Code",
            "Country Name",
            "Region Name",
            "Indicator Code",
            "Indicator Name",
            "Income Group",
            "Year",
            "Measure",
        ],
    ]
    .sort_values(["Country Code", "Year", "Indicator Name"])
)

In [21]:
# Wide view: one row per country/year and one column per indicator, used to see
# which indicators still contain missing values.
df = (
    wb_data.drop(["Indicator Code", "Region Name", "Income Group"], axis="columns")
    .pivot_table(
        index=["Country Code", "Country Name", "Year"],
        columns="Indicator Name",
        values="Measure",
    )
    .reset_index()
    .rename_axis(None, axis=1)
    .sort_values(["Country Code", "Year"])
    .drop(
        columns=[
            "Country Code",
            "Prevalence of moderate or severe food insecurity in the population (%)",
            "Prevalence of severe food insecurity in the population (%)",
        ]
    )
)
df.isna().any(axis=0)

Country Name                                                                      False
Year                                                                              False
Access to electricity (% of population)                                           False
Agricultural irrigated land (% of total agricultural land)                         True
Agricultural land (% of land area)                                                False
Agricultural land (sq. km)                                                        False
Agriculture, forestry, and fishing, value added (% of GDP)                        False
Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)    False
Arable land (% of land area)                                                      False
Arable land (hectares per person)                                                 False
Average precipitation in depth (mm per year)                                      False
Cereal production (metric tons) 

### Agricultural irrigated land (% of total agricultural land)

In [22]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq(
    "Agricultural irrigated land (% of total agricultural land)"
)
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name                                              Year
East Asia & Pacific  Agricultural irrigated land (% of total agricultural land)  2000     0.561236
                                                                                 2001    16.580352
                                                                                 2002    10.261620
                                                                                 2003    11.202007
                                                                                 2004    11.528957
                                                                                           ...    
Sub-Saharan Africa   Agricultural irrigated land (% of total agricultural land)  2016    19.534884
                                                                                 2017     3.318298
                                                                                 2018     5.993394
                       

In [23]:
# Fill the still-missing values of this indicator with the matching
# (region, indicator, year) mean held in `mean_groups`.
# One aligned lookup replaces the per-row loop; a key that `mean_groups` does
# not contain yields NaN (the value stays missing) instead of raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Agricultural irrigated land (% of total agricultural land)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

In [24]:
# Mean of the indicator per region across all years.
is_target = wb_data["Indicator Name"].eq(
    "Agricultural irrigated land (% of total agricultural land)"
)
region_indicator = wb_data.loc[is_target, ["Region Name", "Indicator Name", "Measure"]]
grouped = region_indicator.groupby(["Region Name", "Indicator Name"], observed=False)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name                 Indicator Name                                            
East Asia & Pacific         Agricultural irrigated land (% of total agricultural land)    10.197585
Europe & Central Asia       Agricultural irrigated land (% of total agricultural land)     7.747245
Latin America & Caribbean   Agricultural irrigated land (% of total agricultural land)    14.310598
Middle East & North Africa  Agricultural irrigated land (% of total agricultural land)    12.903147
North America               Agricultural irrigated land (% of total agricultural land)     4.002303
South Asia                  Agricultural irrigated land (% of total agricultural land)    34.403276
Sub-Saharan Africa          Agricultural irrigated land (% of total agricultural land)     6.464813
Name: Measure, dtype: float64

In [25]:
# Fill what is still missing for this indicator with the matching
# (region, indicator) mean held in `mean_groups`.
# Vectorized lookup instead of a per-row loop; a key absent from `mean_groups`
# yields NaN (the value stays missing) rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Agricultural irrigated land (% of total agricultural land)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

### Child employment in agriculture (% of economically active children ages 7-14)

In [26]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq(
    "Child employment in agriculture (% of economically active children ages 7-14)"
)
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name                                                                 Year
East Asia & Pacific  Child employment in agriculture (% of economically active children ages 7-14)  2000          NaN
                                                                                                    2001    77.393333
                                                                                                    2002          NaN
                                                                                                    2003          NaN
                                                                                                    2004    82.300000
                                                                                                              ...    
Sub-Saharan Africa   Child employment in agriculture (% of economically active children ages 7-14)  2016          NaN
                                                                     

In [27]:
# Fill the still-missing values of this indicator with the matching
# (region, indicator, year) mean held in `mean_groups`.
# One aligned lookup replaces the per-row loop; a key that `mean_groups` does
# not contain yields NaN (the value stays missing) instead of raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Child employment in agriculture (% of economically active children ages 7-14)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

In [28]:
# Mean of the indicator per region across all years.
is_target = wb_data["Indicator Name"].eq(
    "Child employment in agriculture (% of economically active children ages 7-14)"
)
region_indicator = wb_data.loc[is_target, ["Region Name", "Indicator Name", "Measure"]]
grouped = region_indicator.groupby(["Region Name", "Indicator Name"], observed=False)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name                 Indicator Name                                                               
East Asia & Pacific         Child employment in agriculture (% of economically active children ages 7-14)    76.117619
Europe & Central Asia       Child employment in agriculture (% of economically active children ages 7-14)    84.055057
Latin America & Caribbean   Child employment in agriculture (% of economically active children ages 7-14)    52.347057
Middle East & North Africa  Child employment in agriculture (% of economically active children ages 7-14)    52.370000
North America               Child employment in agriculture (% of economically active children ages 7-14)          NaN
South Asia                  Child employment in agriculture (% of economically active children ages 7-14)    66.073333
Sub-Saharan Africa          Child employment in agriculture (% of economically active children ages 7-14)    74.923611
Name: Measure, dtype: float64

In [29]:
# Fill what is still missing for this indicator with the matching
# (region, indicator) mean held in `mean_groups`.
# Vectorized lookup instead of a per-row loop; a key absent from `mean_groups`
# yields NaN (the value stays missing) rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Child employment in agriculture (% of economically active children ages 7-14)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

### Droughts, floods, extreme temperatures (% of population, average 1990-2009)

In [30]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq(
    "Droughts, floods, extreme temperatures (% of population, average 1990-2009)"
)
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name                                                               Year
East Asia & Pacific  Droughts, floods, extreme temperatures (% of population, average 1990-2009)  2000   NaN
                                                                                                  2001   NaN
                                                                                                  2002   NaN
                                                                                                  2003   NaN
                                                                                                  2004   NaN
                                                                                                          ..
Sub-Saharan Africa   Droughts, floods, extreme temperatures (% of population, average 1990-2009)  2016   NaN
                                                                                                  2017   NaN
                         

In [31]:
# Fill the still-missing values of this indicator with the matching
# (region, indicator, year) mean held in `mean_groups`.
# One aligned lookup replaces the per-row loop; a key that `mean_groups` does
# not contain yields NaN (the value stays missing) instead of raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Droughts, floods, extreme temperatures (% of population, average 1990-2009)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

In [32]:
# Mean of the indicator per region across all years.
is_target = wb_data["Indicator Name"].eq(
    "Droughts, floods, extreme temperatures (% of population, average 1990-2009)"
)
region_indicator = wb_data.loc[is_target, ["Region Name", "Indicator Name", "Measure"]]
grouped = region_indicator.groupby(["Region Name", "Indicator Name"], observed=False)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name                 Indicator Name                                                             
East Asia & Pacific         Droughts, floods, extreme temperatures (% of population, average 1990-2009)    1.581095
Europe & Central Asia       Droughts, floods, extreme temperatures (% of population, average 1990-2009)    0.392112
Latin America & Caribbean   Droughts, floods, extreme temperatures (% of population, average 1990-2009)    0.823143
Middle East & North Africa  Droughts, floods, extreme temperatures (% of population, average 1990-2009)    0.740057
North America               Droughts, floods, extreme temperatures (% of population, average 1990-2009)    0.112449
South Asia                  Droughts, floods, extreme temperatures (% of population, average 1990-2009)    1.745426
Sub-Saharan Africa          Droughts, floods, extreme temperatures (% of population, average 1990-2009)    2.081214
Name: Measure, dtype: float64

In [33]:
# Fill what is still missing for this indicator with the matching
# (region, indicator) mean held in `mean_groups`.
# Vectorized lookup instead of a per-row loop; a key absent from `mean_groups`
# yields NaN (the value stays missing) rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Droughts, floods, extreme temperatures (% of population, average 1990-2009)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

### Gini index

In [34]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq("Gini index")
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name  Year
East Asia & Pacific  Gini index      2000    39.625000
                                     2001    33.671021
                                     2002    36.925000
                                     2003    38.941291
                                     2004    35.700000
                                               ...    
Sub-Saharan Africa   Gini index      2016    44.100000
                                     2017    40.940000
                                     2018    37.920000
                                     2019    42.760000
                                     2020    39.300000
Name: Measure, Length: 147, dtype: float64

In [35]:
# Fill the still-missing values of this indicator with the matching
# (region, indicator, year) mean held in `mean_groups`.
# One aligned lookup replaces the per-row loop; a key that `mean_groups` does
# not contain yields NaN (the value stays missing) instead of raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (groups["Indicator Name"] == "Gini index")
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

In [36]:
# Mean of the indicator per region across all years.
is_target = wb_data["Indicator Name"].eq("Gini index")
region_indicator = wb_data.loc[is_target, ["Region Name", "Indicator Name", "Measure"]]
grouped = region_indicator.groupby(["Region Name", "Indicator Name"], observed=False)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name                 Indicator Name
East Asia & Pacific         Gini index        36.981971
Europe & Central Asia       Gini index        31.467598
Latin America & Caribbean   Gini index        48.869166
Middle East & North Africa  Gini index        35.944269
North America               Gini index        37.157381
South Asia                  Gini index        34.501316
Sub-Saharan Africa          Gini index        43.172985
Name: Measure, dtype: float64

In [37]:
# Fill what is still missing for this indicator with the matching
# (region, indicator) mean held in `mean_groups`.
# Vectorized lookup instead of a per-row loop; a key absent from `mean_groups`
# yields NaN (the value stays missing) rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (groups["Indicator Name"] == "Gini index")
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

### Literacy rate, adult total (% of people ages 15 and above)

In [38]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq(
    "Literacy rate, adult total (% of people ages 15 and above)"
)
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name                                              Year
East Asia & Pacific  Literacy rate, adult total (% of people ages 15 and above)  2000    89.453713
                                                                                 2001    88.690446
                                                                                 2002    90.988922
                                                                                 2003    91.385446
                                                                                 2004    90.813014
                                                                                           ...    
Sub-Saharan Africa   Literacy rate, adult total (% of people ages 15 and above)  2016    64.818228
                                                                                 2017    64.738055
                                                                                 2018    64.369675
                       

In [39]:
# Fill the still-missing values of this indicator with the matching
# (region, indicator, year) mean held in `mean_groups`.
# One aligned lookup replaces the per-row loop; a key that `mean_groups` does
# not contain yields NaN (the value stays missing) instead of raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Literacy rate, adult total (% of people ages 15 and above)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

In [40]:
# Mean of the indicator per region across all years.
is_target = wb_data["Indicator Name"].eq(
    "Literacy rate, adult total (% of people ages 15 and above)"
)
region_indicator = wb_data.loc[is_target, ["Region Name", "Indicator Name", "Measure"]]
grouped = region_indicator.groupby(["Region Name", "Indicator Name"], observed=False)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name                 Indicator Name                                            
East Asia & Pacific         Literacy rate, adult total (% of people ages 15 and above)    93.061361
Europe & Central Asia       Literacy rate, adult total (% of people ages 15 and above)    97.630223
Latin America & Caribbean   Literacy rate, adult total (% of people ages 15 and above)    91.593583
Middle East & North Africa  Literacy rate, adult total (% of people ages 15 and above)    79.004009
North America               Literacy rate, adult total (% of people ages 15 and above)    98.656300
South Asia                  Literacy rate, adult total (% of people ages 15 and above)    65.799802
Sub-Saharan Africa          Literacy rate, adult total (% of people ages 15 and above)    61.354183
Name: Measure, dtype: float64

In [41]:
# Fill what is still missing for this indicator with the matching
# (region, indicator) mean held in `mean_groups`.
# Vectorized lookup instead of a per-row loop; a key absent from `mean_groups`
# yields NaN (the value stays missing) rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna()
fill_mask = mask & (
    groups["Indicator Name"]
    == "Literacy rate, adult total (% of people ages 15 and above)"
)
fill_keys = pd.MultiIndex.from_frame(groups[fill_mask])
wb_data.loc[fill_mask, "Measure"] = mean_groups.reindex(fill_keys).to_numpy()

### Political Stability and Absence of Violence/Terrorism: Estimate

In [42]:
# Mean of the indicator per region and year, from the rows that carry it.
is_target = wb_data["Indicator Name"].eq(
    "Political Stability and Absence of Violence/Terrorism: Estimate"
)
region_indicator = wb_data.loc[
    is_target, ["Region Name", "Indicator Name", "Year", "Measure"]
]
grouped = region_indicator.groupby(
    ["Region Name", "Indicator Name", "Year"], observed=False
)
mean_groups = grouped["Measure"].mean()
mean_groups

Region Name          Indicator Name                                                   Year
East Asia & Pacific  Political Stability and Absence of Violence/Terrorism: Estimate  2000    0.422531
                                                                                      2001    0.454812
                                                                                      2002    0.458880
                                                                                      2003    0.368554
                                                                                      2004    0.415486
                                                                                                ...   
Sub-Saharan Africa   Political Stability and Absence of Violence/Terrorism: Estimate  2016   -0.610972
                                                                                      2017   -0.623140
                                                                                     

In [43]:
# Fill the still-missing political-stability values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Political Stability and Absence of Violence/Terrorism: Estimate"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [44]:
# Mean Measure per (Region Name, Indicator Name) across all years for the
# political stability indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Political Stability and Absence of Violence/Terrorism: Estimate",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                                                 
East Asia & Pacific         Political Stability and Absence of Violence/Terrorism: Estimate    0.436080
Europe & Central Asia       Political Stability and Absence of Violence/Terrorism: Estimate    0.418130
Latin America & Caribbean   Political Stability and Absence of Violence/Terrorism: Estimate    0.132839
Middle East & North Africa  Political Stability and Absence of Violence/Terrorism: Estimate   -0.609356
North America               Political Stability and Absence of Violence/Terrorism: Estimate    0.783220
South Asia                  Political Stability and Absence of Violence/Terrorism: Estimate   -0.990587
Sub-Saharan Africa          Political Stability and Absence of Violence/Terrorism: Estimate   -0.565693
Name: Measure, dtype: float64

In [45]:
# Fill the still-missing political-stability values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Political Stability and Absence of Violence/Terrorism: Estimate"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

### Poverty headcount ratio at national poverty lines (% of population)

In [46]:
# Mean Measure per (Region Name, Indicator Name, Year) for the poverty
# headcount indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)",
    ["Region Name", "Indicator Name", "Year", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name", "Year"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name          Indicator Name                                                       Year
East Asia & Pacific  Poverty headcount ratio at national poverty lines (% of population)  2000    37.066667
                                                                                          2001    26.959459
                                                                                          2002    25.300000
                                                                                          2003    33.235135
                                                                                          2004    21.750000
                                                                                                    ...    
Sub-Saharan Africa   Poverty headcount ratio at national poverty lines (% of population)  2016    46.675000
                                                                                          2017    37.366667
                                         

In [47]:
# Fill the still-missing poverty-headcount values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# (or whose group mean is itself NaN) stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [48]:
# Mean Measure per (Region Name, Indicator Name) across all years for the
# poverty headcount indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                                                     
East Asia & Pacific         Poverty headcount ratio at national poverty lines (% of population)    20.562384
Europe & Central Asia       Poverty headcount ratio at national poverty lines (% of population)    18.962724
Latin America & Caribbean   Poverty headcount ratio at national poverty lines (% of population)    36.636486
Middle East & North Africa  Poverty headcount ratio at national poverty lines (% of population)    19.164399
North America               Poverty headcount ratio at national poverty lines (% of population)          NaN
South Asia                  Poverty headcount ratio at national poverty lines (% of population)    34.687778
Sub-Saharan Africa          Poverty headcount ratio at national poverty lines (% of population)    44.989552
Name: Measure, dtype: float64

In [49]:
# Fill the still-missing poverty-headcount values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# (or whose group mean is itself NaN) stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [50]:
# Mean Measure per (Income Group, Indicator Name) for the poverty headcount
# indicator. Despite its name, `region_indicator` is keyed by income group
# here. The trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)",
    ["Income Group", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Income Group", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Income Group         Indicator Name                                                     
High income          Poverty headcount ratio at national poverty lines (% of population)    22.946434
Low income           Poverty headcount ratio at national poverty lines (% of population)    42.145585
Lower middle income  Poverty headcount ratio at national poverty lines (% of population)    33.485556
Upper middle income  Poverty headcount ratio at national poverty lines (% of population)    28.151928
Name: Measure, dtype: float64

In [51]:
# Fill the still-missing poverty-headcount values with the matching
# (Income Group, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Income Group", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Poverty headcount ratio at national poverty lines (% of population)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

### Prevalence of stunting, height for age (% of children under 5)

In [52]:
# Mean Measure per (Region Name, Indicator Name, Year) for the stunting
# indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of stunting, height for age (% of children under 5)",
    ["Region Name", "Indicator Name", "Year", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name", "Year"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name          Indicator Name                                                  Year
East Asia & Pacific  Prevalence of stunting, height for age (% of children under 5)  2000    36.177778
                                                                                     2001    41.248378
                                                                                     2002    39.925000
                                                                                     2003    33.760000
                                                                                     2004    27.240000
                                                                                               ...    
Sub-Saharan Africa   Prevalence of stunting, height for age (% of children under 5)  2016    32.621429
                                                                                     2017    26.900000
                                                                                     2

In [53]:
# Fill the still-missing stunting values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of stunting, height for age (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [54]:
# Mean Measure per (Region Name, Indicator Name) across all years for the
# stunting indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of stunting, height for age (% of children under 5)",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                                                
East Asia & Pacific         Prevalence of stunting, height for age (% of children under 5)    27.196234
Europe & Central Asia       Prevalence of stunting, height for age (% of children under 5)    14.060496
Latin America & Caribbean   Prevalence of stunting, height for age (% of children under 5)    15.252612
Middle East & North Africa  Prevalence of stunting, height for age (% of children under 5)    15.454033
North America               Prevalence of stunting, height for age (% of children under 5)     2.890000
South Asia                  Prevalence of stunting, height for age (% of children under 5)    37.293651
Sub-Saharan Africa          Prevalence of stunting, height for age (% of children under 5)    36.149263
Name: Measure, dtype: float64

In [55]:
# Fill the still-missing stunting values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of stunting, height for age (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

### Prevalence of underweight, weight for age (% of children under 5)

In [56]:
# Mean Measure per (Region Name, Indicator Name, Year) for the underweight
# indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of underweight, weight for age (% of children under 5)",
    ["Region Name", "Indicator Name", "Year", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name", "Year"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name          Indicator Name                                                     Year
East Asia & Pacific  Prevalence of underweight, weight for age (% of children under 5)  2000    14.621622
                                                                                        2001    12.004054
                                                                                        2002    12.593514
                                                                                        2003    12.684324
                                                                                        2004    10.806486
                                                                                                  ...    
Sub-Saharan Africa   Prevalence of underweight, weight for age (% of children under 5)  2016    18.163333
                                                                                        2017    17.256250
                                                           

In [57]:
# Fill the still-missing underweight values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of underweight, weight for age (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [58]:
# Mean Measure per (Region Name, Indicator Name) across all years for the
# underweight indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of underweight, weight for age (% of children under 5)",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                                                   
East Asia & Pacific         Prevalence of underweight, weight for age (% of children under 5)     9.228932
Europe & Central Asia       Prevalence of underweight, weight for age (% of children under 5)     3.638500
Latin America & Caribbean   Prevalence of underweight, weight for age (% of children under 5)     4.141440
Middle East & North Africa  Prevalence of underweight, weight for age (% of children under 5)     6.613878
North America               Prevalence of underweight, weight for age (% of children under 5)     0.786984
South Asia                  Prevalence of underweight, weight for age (% of children under 5)    33.604107
Sub-Saharan Africa          Prevalence of underweight, weight for age (% of children under 5)    20.295367
Name: Measure, dtype: float64

In [59]:
# Fill the still-missing underweight values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of underweight, weight for age (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

### Prevalence of wasting, weight for height (% of children under 5)

In [60]:
# Mean Measure per (Region Name, Indicator Name, Year) for the wasting
# indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of wasting, weight for height (% of children under 5)",
    ["Region Name", "Indicator Name", "Year", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name", "Year"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name          Indicator Name                                                    Year
East Asia & Pacific  Prevalence of wasting, weight for height (% of children under 5)  2000    9.144444
                                                                                       2001    5.435135
                                                                                       2002    8.550000
                                                                                       2003    8.700000
                                                                                       2004    8.260000
                                                                                                 ...   
Sub-Saharan Africa   Prevalence of wasting, weight for height (% of children under 5)  2016    6.914286
                                                                                       2017    7.350000
                                                                            

In [61]:
# Fill the still-missing wasting values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of wasting, weight for height (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [62]:
# Mean Measure per (Region Name, Indicator Name) across all years for the
# wasting indicator; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"]
    == "Prevalence of wasting, weight for height (% of children under 5)",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                                                  
East Asia & Pacific         Prevalence of wasting, weight for height (% of children under 5)     6.636939
Europe & Central Asia       Prevalence of wasting, weight for height (% of children under 5)     3.629625
Latin America & Caribbean   Prevalence of wasting, weight for height (% of children under 5)     2.331430
Middle East & North Africa  Prevalence of wasting, weight for height (% of children under 5)     5.181757
North America               Prevalence of wasting, weight for height (% of children under 5)     0.420000
South Asia                  Prevalence of wasting, weight for height (% of children under 5)    12.674206
Sub-Saharan Africa          Prevalence of wasting, weight for height (% of children under 5)     7.992369
Name: Measure, dtype: float64

In [63]:
# Fill the still-missing wasting values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"]
    == "Prevalence of wasting, weight for height (% of children under 5)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

### School enrollment, primary (% net)

In [64]:
# Mean Measure per (Region Name, Indicator Name, Year) for primary school
# enrollment; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"] == "School enrollment, primary (% net)",
    ["Region Name", "Indicator Name", "Year", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name", "Year"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name          Indicator Name                      Year
East Asia & Pacific  School enrollment, primary (% net)  2000    93.583615
                                                         2001    93.530157
                                                         2002    93.482984
                                                         2003    93.595185
                                                         2004    93.638704
                                                                   ...    
Sub-Saharan Africa   School enrollment, primary (% net)  2016    81.733280
                                                         2017    81.874203
                                                         2018    84.090855
                                                         2019    82.801572
                                                         2020    73.010413
Name: Measure, Length: 147, dtype: float64

In [65]:
# Fill the still-missing primary-enrollment values with the matching
# (Region Name, Indicator Name, Year) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name", "Year"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"] == "School enrollment, primary (% net)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [66]:
# Mean Measure per (Region Name, Indicator Name) across all years for
# primary school enrollment; the trailing expression displays the result.
region_indicator = wb_data.loc[
    wb_data["Indicator Name"] == "School enrollment, primary (% net)",
    ["Region Name", "Indicator Name", "Measure"],
]
mean_groups = (
    region_indicator
    .groupby(["Region Name", "Indicator Name"], observed=False)["Measure"]
    .mean()
)
mean_groups

Region Name                 Indicator Name                    
East Asia & Pacific         School enrollment, primary (% net)    93.888413
Europe & Central Asia       School enrollment, primary (% net)    94.244464
Latin America & Caribbean   School enrollment, primary (% net)    93.193401
Middle East & North Africa  School enrollment, primary (% net)    88.901233
North America               School enrollment, primary (% net)    95.392704
South Asia                  School enrollment, primary (% net)    85.192464
Sub-Saharan Africa          School enrollment, primary (% net)    75.248799
Name: Measure, dtype: float64

In [67]:
# Fill the still-missing primary-enrollment values with the matching
# (Region Name, Indicator Name) mean from `mean_groups`.
# All affected rows are looked up in one vectorized join instead of a
# per-row Python loop; rows whose group has no entry in `mean_groups`
# simply stay NaN rather than raising KeyError.
groups = wb_data[["Region Name", "Indicator Name"]]
mask = wb_data["Measure"].isna() & (
    wb_data["Indicator Name"] == "School enrollment, primary (% net)"
)
if mask.any():
    wb_data.loc[mask, "Measure"] = (
        groups[mask]
        .join(mean_groups.rename("group_mean"), on=list(groups.columns))["group_mean"]
        .to_numpy()  # positional assignment; left join keeps row order
    )

In [68]:
# Strip the grouping columns ("Region Name", "Income Group") from the imputed
# frame; the result is what gets saved to clean data as wb_data.country.csv.
result_df = wb_data.drop(["Region Name", "Income Group"], axis=1)
result_df.head(n=5)

Unnamed: 0,Country Code,Country Name,Indicator Code,Indicator Name,Year,Measure
396,ABW,Aruba,EG.ELC.ACCS.ZS,Access to electricity (% of population),2000,91.7
397,ABW,Aruba,AG.LND.IRIG.AG.ZS,Agricultural irrigated land (% of total agricu...,2000,7.348765
398,ABW,Aruba,AG.LND.AGRI.ZS,Agricultural land (% of land area),2000,11.111111
399,ABW,Aruba,AG.LND.AGRI.K2,Agricultural land (sq. km),2000,20.0
400,ABW,Aruba,NV.AGR.TOTL.ZS,"Agriculture, forestry, and fishing, value adde...",2000,0.018608


In [69]:
# Share of rows in the imputed frame whose Measure is still missing.
# The ratio is taken over result_df's own row count (the original divided by
# the row count of wb_data_c, a different frame), and the value is computed
# outside the f-string so no same-type quotes are nested inside it.
missing_pct = result_df["Measure"].isna().mean() * 100
print(f"Percentage of Missing data in Measure - {missing_pct:.2f}%")

Percentage of Missing data in Measure - 3.84%


In [70]:
# Persist the imputed country-level data. The output directory is created
# first so the write does not fail when "clean_data" does not exist yet.
os.makedirs("clean_data", exist_ok=True)
result_df.to_csv(os.path.join("clean_data", "wb_data.country.csv"), index=False)

## Combining Regions data and storing in another CSV for further EDA

In [71]:
# Reload the cleaned country-level data from disk.
clean_df = pd.read_csv(
    filepath_or_buffer=os.path.join("clean_data", "wb_data.country.csv")
)

In [72]:
# Country lookup table: read the staging file, rename "Region" to
# "Region Name", and keep only the four columns used for the merge.
df_countries = (
    pd.read_csv(os.path.join("staging_data", "wb_countries.csv"))
    .rename(columns={"Region": "Region Name"})
    .loc[:, ["Country Code", "Country Name", "Region Name", "Income Group"]]
)

In [73]:
# Preview the first five rows of the reloaded clean data.
clean_df.head(n=5)

Unnamed: 0,Country Code,Country Name,Indicator Code,Indicator Name,Year,Measure
0,ABW,Aruba,EG.ELC.ACCS.ZS,Access to electricity (% of population),2000,91.7
1,ABW,Aruba,AG.LND.IRIG.AG.ZS,Agricultural irrigated land (% of total agricu...,2000,7.348765
2,ABW,Aruba,AG.LND.AGRI.ZS,Agricultural land (% of land area),2000,11.111111
3,ABW,Aruba,AG.LND.AGRI.K2,Agricultural land (sq. km),2000,20.0
4,ABW,Aruba,NV.AGR.TOTL.ZS,"Agriculture, forestry, and fishing, value adde...",2000,0.018608


In [74]:
# Preview the first five rows of the country lookup table.
df_countries.head(n=5)

Unnamed: 0,Country Code,Country Name,Region Name,Income Group
0,ABW,Aruba,Latin America & Caribbean,High income
1,AFG,Afghanistan,South Asia,Low income
2,AGO,Angola,Sub-Saharan Africa,Lower middle income
3,ALB,Albania,Europe & Central Asia,Upper middle income
4,AND,Andorra,Europe & Central Asia,High income


In [75]:
# Attach "Region Name" and "Income Group" to every data row.
# - Any region/income columns left from a previous run of this cell are
#   dropped first, so re-running does not produce "_x"/"_y" suffixed columns.
# - validate="many_to_one" raises if df_countries holds duplicate
#   (Country Code, Country Name) keys, which would silently multiply rows.
clean_df = clean_df.drop(
    columns=["Region Name", "Income Group"], errors="ignore"
).merge(
    df_countries,
    on=["Country Code", "Country Name"],
    how="left",
    validate="many_to_one",
)

In [76]:
# Preview the first five rows after the country metadata merge.
clean_df.head(n=5)

Unnamed: 0,Country Code,Country Name,Indicator Code,Indicator Name,Year,Measure,Region Name,Income Group
0,ABW,Aruba,EG.ELC.ACCS.ZS,Access to electricity (% of population),2000,91.7,Latin America & Caribbean,High income
1,ABW,Aruba,AG.LND.IRIG.AG.ZS,Agricultural irrigated land (% of total agricu...,2000,7.348765,Latin America & Caribbean,High income
2,ABW,Aruba,AG.LND.AGRI.ZS,Agricultural land (% of land area),2000,11.111111,Latin America & Caribbean,High income
3,ABW,Aruba,AG.LND.AGRI.K2,Agricultural land (sq. km),2000,20.0,Latin America & Caribbean,High income
4,ABW,Aruba,NV.AGR.TOTL.ZS,"Agriculture, forestry, and fishing, value adde...",2000,0.018608,Latin America & Caribbean,High income


In [77]:
# Put the columns in their final output order.
clean_df = clean_df.loc[
    :,
    [
        "Country Code",
        "Country Name",
        "Region Name",
        "Indicator Code",
        "Indicator Name",
        "Income Group",
        "Year",
        "Measure",
    ],
]

In [78]:
# Preview the first five rows with the final column order.
clean_df.head(n=5)

Unnamed: 0,Country Code,Country Name,Region Name,Indicator Code,Indicator Name,Income Group,Year,Measure
0,ABW,Aruba,Latin America & Caribbean,EG.ELC.ACCS.ZS,Access to electricity (% of population),High income,2000,91.7
1,ABW,Aruba,Latin America & Caribbean,AG.LND.IRIG.AG.ZS,Agricultural irrigated land (% of total agricu...,High income,2000,7.348765
2,ABW,Aruba,Latin America & Caribbean,AG.LND.AGRI.ZS,Agricultural land (% of land area),High income,2000,11.111111
3,ABW,Aruba,Latin America & Caribbean,AG.LND.AGRI.K2,Agricultural land (sq. km),High income,2000,20.0
4,ABW,Aruba,Latin America & Caribbean,NV.AGR.TOTL.ZS,"Agriculture, forestry, and fishing, value adde...",High income,2000,0.018608


In [79]:
# Persist the combined country + region/income data for further EDA. The
# output directory is created first so the write does not fail when
# "clean_data" does not exist yet.
os.makedirs("clean_data", exist_ok=True)
clean_df.to_csv(os.path.join("clean_data", 'wb_data_combined.csv'), index=False)