# GROUPING

After preparing the data for a more exhaustive analysis, the last step is to separate the different countries into group with common characteristics. This is necessary as an aggregate study of all countries would mean erasing the differences between them and missing important conclusions, making our study too shallow.

Grouping them by similar characteristics will simplify our analysis, all while reducing the data loss and stressing common patterns among countries in the same group.

In [1]:
import os
import numpy as np
from scipy.stats import pearsonr, spearmanr
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from Project.Utils.aggregate import aggregate


column_country = 'Country'
column_year = 'Year'
column_region = 'Region'

read_path = os.getcwd()
region_url = read_path + '/Databases/AuxiliarData/world-regions.csv'
df_url = read_path + '/Output/SilverDataframe.csv'

write_path = os.getcwd() + '/Output/'
country_folder = write_path + '/Country/'
region_folder = write_path + '/Region/'
aggregate_folder = region_folder + '/Aggregate/'



## CREATING DIRECTORIES
If the destination folders do not exist, create them.

In [2]:
if not os.path.exists(country_folder):
            os.makedirs(country_folder)
if not os.path.exists(region_folder):
            os.makedirs(region_folder)
if not os.path.exists(aggregate_folder):
            os.makedirs(aggregate_folder)

### READ THE FULL DATAFRAME AND THE REGION DATAFRAME

In [3]:
region_df = pd.read_csv(region_url)
df = pd.read_csv(df_url, index_col = ['Country', 'Year'])

print(region_df)
region_df.drop(columns=['Code', 'Year'], inplace = True)
region_df.rename(columns = {'World Region according to the World Bank': column_region, 'Entity': column_country}, inplace = True)
#region_df.set_index(['Country', 'Region'], inplace = True)
print(region_df)

             Entity Code  Year World Region according to the World Bank
0       Afghanistan  AFG  2017                               South Asia
1           Albania  ALB  2017                  Europe and Central Asia
2           Algeria  DZA  2017             Middle East and North Africa
3    American Samoa  ASM  2017                    East Asia and Pacific
4           Andorra  AND  2017                  Europe and Central Asia
..              ...  ...   ...                                      ...
212       Venezuela  VEN  2017              Latin America and Caribbean
213         Vietnam  VNM  2017                    East Asia and Pacific
214           Yemen  YEM  2017             Middle East and North Africa
215          Zambia  ZMB  2017                       Sub-Saharan Africa
216        Zimbabwe  ZWE  2017                       Sub-Saharan Africa

[217 rows x 4 columns]
            Country                        Region
0       Afghanistan                    South Asia
1           

## ADD THE PROPERTY 'REGION'
By merging both DataFrames, we will obtain a DataFrame similar to the original one with a new field for the index: `region`

In [4]:
print(df)
region_df.set_index(['Country', 'Region'], inplace = True)
df = pd.merge(df, region_df, how = 'inner', left_index = True, right_index = True)
print(df)

                  % Undernourishment  AgriShareGDP  CreditToAgriFishForest  \
Country     Year                                                             
Afghanistan 2000                47.8      54.06300                     NaN   
            2001                47.8      54.06300                     NaN   
            2002                45.6      45.13440                     NaN   
            2003                40.6      41.90340                     NaN   
            2004                38.0      35.61280                     NaN   
...                              ...           ...                     ...   
Zimbabwe    2016                 NaN       7.87399                     NaN   
            2017                 NaN       8.34095                     NaN   
            2018                 NaN       8.30469                     NaN   
            2019                 NaN       8.17322                     NaN   
            2020                 NaN      10.93630              

## EXPORT THE SEGREGATED DATAFRAMES
We will export the new DataFrame with the regions and two new kinds of DataFrame: one for every different country, and another for every region.

In [5]:
df.to_csv(write_path + '/GoldDataframe.csv')

country_list = set(df.index.get_level_values(0))
region_list = set(df.index.get_level_values(2))

for country in country_list:
    df.loc[df.index.get_level_values(0) == country].to_csv(country_folder + country + '.csv')

for region in region_list:
    df.loc[df.index.get_level_values(2) == region].to_csv(region_folder + region + '.csv')

# REGION AGGREGATES
In order to create an aggregated DataFrame with entries for the regions we are studying, we will have to estimate the values of its indicators: the summatory for absolute ones, and a weighted average based on population for the relative ones. The weight of the mean could also be established as the GDP of each country, or any other figure we consider representative enough of the country's importance in the calculus.

In [6]:
#Define which indicators should be not a weighted
abs_indicators = ['GDP', 'Population', 'TotalAgri']

#Wether aggregate from the regions aggregate (faster) or from the country raw data (slower but more precise)
fast_aggregate = True

aggregate_df = pd.DataFrame()

for region in region_list:
    #Create sub-DataFrame with only the countries for each region
    region_df = df.loc[df.index.get_level_values(column_region) == region]
    #Aggregate by the countries of said region
    aggregated_region_df = aggregate(region_df, aggregate_by = column_country, for_index = column_year, new_group_col_name = column_region, group_name = region, abs_indicators = abs_indicators)
    #Add the aggregated DataFrame to the DataFrame per aggregated regions
    aggregate_df = pd.concat([aggregate_df, aggregated_region_df])

#Obtain an aggregate for the world
if fast_aggregate:
    aggregate_df_world = aggregate(aggregate_df, column_region, column_year, new_group_col_name = column_region, group_name = 'World', abs_indicators = abs_indicators)
else:
    aggregate_df_world = aggregate(df, column_country, column_year, new_group_col_name = column_region, group_name = 'World', abs_indicators = abs_indicators)

#Export the aggregated DataFrame to a .csv file
aggregate_df.to_csv(write_path + 'AggregatedRegion_DataFrame.csv')

aggregate_df_world.to_csv(write_path + 'AggregatedWorld_DataFrame.csv')

In [7]:
import datetime

#Set True if you want to analyse the time for both methods. By default is False to save that time.
test_time = False

if test_time:
    #Count time for the fast method: aggregating from the aggregated regions.
    st_time = datetime.now()
    fast_aggregate_df = aggregate(aggregate_df, column_region, column_year, new_group_col_name = column_region, group_name = 'World', abs_indicators = abs_indicators)
    f_time = datetime.now()
    fast_time = f_time - st_time
    print('Fast aggregation time: ' + fast_time.total_seconds())

    #Count time for the most precise method: aggregating from the raw data per country.
    st_time = datetime.now()
    precise_aggregate_df = aggregate(df, column_country, column_year, new_group_col_name = column_region, group_name = 'World',abs_indicators = abs_indicators)
    f_time = datetime.now()
    precise_time = f_time - st_time
    print('Precisse aggregation time: ' + precise_time.total_seconds())

    #Print the results to compare.
    print(fast_aggregate_df)
    print(precise_aggregate_df)

    #Export the results to .csv files if a more detailed analysis is needed.
    fast_aggregate_df.to_csv(region_folder + 'World_Fast.csv')
    precise_aggregate_df.to_csv(region_folder + 'World.csv')
else:
    print('No aggregation time analysed. If you want to study it, set the test_time variable to True and execute again')

No aggregation time analysed. If you want to study it, set the test_time variable to True and execute again


### Variables that can be changed
The p-value can be changed to a desired value. For the default value is recommnended 0.05. If you desire a higher confidence level lower this variable. This variable reffers directly to significance level, but for porpouses of clarification is set to this name:

We do the %store in order to pass this variable to other notebooks.

In [8]:
PVALUE_VAR = 0.05

%store PVALUE_VAR

Stored 'PVALUE_VAR' (float)


## Correlation dataframe.
This dataframe is the main piece of the notebook. Consists in generating for every country the correlation matrix for it and saving only the correlation value of the different variables with the GDP. 
This codeblock also calculates the p-value of Spearman and Pearson, if the value > PVALUE_VAR the correlation will be deleted due to not having statiscal significance.

Later on is concatenated and generates the following result:

In [9]:
#One dataframe per country

write_path = os.getcwd() + '/Output/'

col_country = 'Country'
col_year = 'Year'
col_region = 'Region'
col_gdp = 'GDP'

df= pd.read_csv (write_path + 'GoldDataframe.csv')
corr_df = pd.DataFrame()
corr_df.index.names = [col_country]

#List all the countries, none repeated
countries = set(df[col_country].to_list())

country_dict = {}
corr_dict = {}

for country in countries:
    #Get the DataFrame for a given country
    country_df = df.loc[df[col_country] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr()

    #Significance for the correlations
    pval = country_df.corr(method = lambda x, y: pearsonr(x, y)[1]) - np.eye(*country_corr_df.shape)
    p = pval.applymap(lambda x: 1 if x < PVALUE_VAR else np.NaN)
    country_corr_df = country_corr_df * p

    #Significance for the correlations
    pval = country_df.corr(method = lambda x, y: spearmanr(x, y)[1]) - np.eye(*country_corr_df.shape)
    p = pval.applymap(lambda x: 1 if x < PVALUE_VAR else np.NaN)
    country_corr_df = country_corr_df * p

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {col_gdp: country}).drop(index = [col_year, col_gdp])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df = pd.concat([corr_df, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = corr_df.transpose()
corr_df


Unnamed: 0,% Undernourishment,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,%EmploymentAgriFishForest,% Soldiers,Employment in industry,Employment in services,Birth Rate,Cost business start-up,...,% Rural Population,Tertiary School Gender Parity,Suicide Rate,% Vulnerable female employment,% Vulnerable male employment,Gini,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Rwanda,,-0.820315,,,,-0.842532,0.992907,0.956154,-0.987402,-0.908425,...,-0.825151,0.928341,-0.852783,-0.992218,-0.990241,,,,,0.983200
Equatorial Guinea,,-0.848619,,,,-0.655238,0.722743,,-0.474562,,...,-0.699411,,,-0.939166,-0.775149,,,,,0.485875
Malaysia,,,,,-0.796201,-0.944179,-0.889028,0.934341,-0.854599,-0.950145,...,-0.970563,,0.555442,0.550089,,,0.619590,,0.888805,0.967304
Portugal,,-0.937049,,-0.519219,-0.487164,,-0.707766,0.641318,-0.684102,-0.691886,...,-0.680719,-0.765838,0.641131,-0.556400,-0.478053,-0.707518,-0.718688,,-0.511585,
Zimbabwe,,-0.674981,,,,-0.699115,-0.892451,,,-0.784723,...,0.938288,,,,0.660437,,0.588724,0.861675,,0.956048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chad,-0.661227,,,,,-0.825559,0.564680,0.896739,-0.795398,-0.909298,...,-0.576978,,-0.487903,-0.668813,-0.823977,,,,-0.824777,0.788433
Lesotho,,-0.800496,,,,,,0.903933,-0.856030,-0.944754,...,-0.917841,,0.915577,-0.908167,-0.928620,,,,0.464520,
South Sudan,,,,,,,,,,,...,,,,,,,,,,
Azerbaijan,-0.750257,-0.858162,,,-0.691811,-0.694244,0.814884,,0.589671,-0.883838,...,-0.647364,,,0.601210,-0.678162,,,-0.842061,-0.663217,0.691515


## Cleaned GoldDataframe 
Before the correlation dataframe was generated and it detected all the correlations with a high p-value. To perform all the following analysis a clean GoldDataframe is needed. It simply detects if the correlation for the indicator and country is Nan and procedes to delete the whole column indicator for the country.

In [10]:

df = df.copy()
for country in countries:
    for ind in corr_df.columns:
        if np.isnan(corr_df[ind][country]):
            df.loc[df[col_country] == country, [ind]] = np.NaN

df.set_index([col_country, col_region, col_year]).to_csv(write_path + 'GoldDataframe_significant.csv')
df.set_index([col_country, col_region, col_year])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,% Undernourishment,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,GDP,%EmploymentAgriFishForest,% Soldiers,Employment in industry,Employment in services,Birth Rate,...,% Rural Population,Tertiary School Gender Parity,Suicide Rate,% Vulnerable female employment,% Vulnerable male employment,Gini,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Country,Region,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Afghanistan,South Asia,2000,47.8,54.06300,,,3342.034168,,,9.48,24.680000,48.021,...,77.922,,4.9,98.720002,91.879999,,,0.625,,20779957.0
Afghanistan,South Asia,2001,47.8,54.06300,,,3598.470576,,,8.98,24.719999,47.505,...,77.831,,5.0,98.760003,92.399998,,,0.625,,21606992.0
Afghanistan,South Asia,2002,45.6,45.13440,,,4141.523943,,,9.99,25.590000,46.901,...,77.739,,5.0,98.669999,91.460001,,,0.625,,22600774.0
Afghanistan,South Asia,2003,40.6,41.90340,,,4729.042179,,,10.35,25.950001,46.231,...,77.647,,5.0,98.599998,91.039999,,,0.687,,23680871.0
Afghanistan,South Asia,2004,38.0,35.61280,,,5388.482107,,,10.61,26.120001,45.507,...,77.500,,5.0,98.549998,90.960003,,,0.677,,24726689.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,Sub-Saharan Africa,2016,,7.87399,,,20548.678073,,0.75975,7.05,,,...,67.704,,,,56.789999,,0.430,0.389,,14030338.0
Zimbabwe,Sub-Saharan Africa,2017,,8.34095,,,22040.902301,,0.75072,6.90,,,...,67.763,,,,56.609999,,0.488,0.431,,14236599.0
Zimbabwe,Sub-Saharan Africa,2018,,8.30469,,,24311.560545,,0.73821,6.75,,,...,67.791,,,,56.380000,,0.447,0.471,,14438812.0
Zimbabwe,Sub-Saharan Africa,2019,,8.17322,,,21935.075306,,0.73821,6.57,,,...,67.790,,,,57.090001,,0.403,0.434,,14645473.0


## Saving Correlation Dataframe Pearson

Use the same code as before, but this time save the correlation dataframe.

In [11]:
#One dataframe per country

write_path = os.getcwd() + '/Output/'

col_country = 'Country'
col_year = 'Year'
col_region = 'Region'
col_gdp = 'GDP'

df= pd.read_csv (write_path + 'GoldDataframe_significant.csv')
corr_df = pd.DataFrame()
corr_df.index.names = [col_country]

#List all the countries, none repeated
countries = set(df[col_country].to_list())

country_dict = {}
corr_dict = {}

for country in countries:
    #Get the DataFrame for a given country
    country_df = df.loc[df[col_country] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr()

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {col_gdp: country}).drop(index = [col_year, col_gdp])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df = pd.concat([corr_df, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = corr_df.transpose()
corr_df.to_csv(os.getcwd()+'/Output/Corr_DF_pearson.csv')
corr_df

Unnamed: 0,% Undernourishment,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,%EmploymentAgriFishForest,% Soldiers,Employment in industry,Employment in services,Birth Rate,Cost business start-up,...,% Rural Population,Tertiary School Gender Parity,Suicide Rate,% Vulnerable female employment,% Vulnerable male employment,Gini,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Rwanda,,-0.820315,,,,-0.842532,0.992907,0.956154,-0.987402,-0.908425,...,-0.825151,0.928341,-0.852783,-0.992218,-0.990241,,,,,0.983200
Equatorial Guinea,,-0.848619,,,,-0.655238,0.722743,,-0.474562,,...,-0.699411,,,-0.939166,-0.775149,,,,,0.485875
Malaysia,,,,,-0.796201,-0.944179,-0.889028,0.934341,-0.854599,-0.950145,...,-0.970563,,0.555442,0.550089,,,0.619590,,0.888805,0.967304
Portugal,,-0.937049,,-0.519219,-0.487164,,-0.707766,0.641318,-0.684102,-0.691886,...,-0.680719,-0.765838,0.641131,-0.556400,-0.478053,-0.707518,-0.718688,,-0.511585,
Zimbabwe,,-0.674981,,,,-0.699115,-0.892451,,,-0.784723,...,0.938288,,,,0.660437,,0.588724,0.861675,,0.956048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chad,-0.661227,,,,,-0.825559,0.564680,0.896739,-0.795398,-0.909298,...,-0.576978,,-0.487903,-0.668813,-0.823977,,,,-0.824777,0.788433
Lesotho,,-0.800496,,,,,,0.903933,-0.856030,-0.944754,...,-0.917841,,0.915577,-0.908167,-0.928620,,,,0.464520,
South Sudan,,,,,,,,,,,...,,,,,,,,,,
Azerbaijan,-0.750257,-0.858162,,,-0.691811,-0.694244,0.814884,,0.589671,-0.883838,...,-0.647364,,,0.601210,-0.678162,,,-0.842061,-0.663217,0.691515


Saving the correlation dataframe using spearman.

In [12]:
#One dataframe per country

write_path = os.getcwd() + '/Output/'

col_country = 'Country'
col_year = 'Year'
col_region = 'Region'
col_gdp = 'GDP'

df= pd.read_csv (write_path + 'GoldDataframe_significant.csv')
corr_df = pd.DataFrame()
corr_df.index.names = [col_country]

#List all the countries, none repeated
countries = set(df[col_country].to_list())

country_dict = {}
corr_dict = {}

for country in countries:
    #Get the DataFrame for a given country
    country_df = df.loc[df[col_country] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr(method='spearman')

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {col_gdp: country}).drop(index = [col_year, col_gdp])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df = pd.concat([corr_df, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = corr_df.transpose()
corr_df.to_csv(os.getcwd()+'/Output/Corr_DF_spearman.csv')
corr_df

Unnamed: 0,% Undernourishment,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,%EmploymentAgriFishForest,% Soldiers,Employment in industry,Employment in services,Birth Rate,Cost business start-up,...,% Rural Population,Tertiary School Gender Parity,Suicide Rate,% Vulnerable female employment,% Vulnerable male employment,Gini,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Rwanda,,-0.693732,,,,-0.994799,0.995778,0.995778,-0.994805,-0.826985,...,-0.997400,0.936322,-0.992185,-0.994479,-0.995778,,,,,0.994805
Equatorial Guinea,,-0.874959,,,,-0.447335,0.691783,,-0.524675,,...,-0.524675,,,-0.920429,-0.777525,,,,,0.524675
Malaysia,,,,,-0.877114,-0.927179,-0.824448,0.910036,-0.955844,-0.931254,...,-0.955844,,0.436584,0.526316,,,0.668416,,0.910036,0.955844
Portugal,,-0.862439,,-0.448635,-0.560833,,-0.475447,0.548293,-0.488107,-0.538743,...,-0.546164,-0.457236,0.503096,-0.566504,-0.569106,-0.600918,-0.514919,,-0.468943,
Zimbabwe,,-0.669698,,,,-0.841353,-0.782722,,,-0.884333,...,0.877922,,,,0.512504,,0.637875,0.913470,,0.832468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chad,-0.694409,,,,,-0.812745,0.459838,0.815200,-0.800000,-0.824378,...,-0.800000,,-0.558816,-0.474830,-0.672296,,,,-0.686607,0.800000
Lesotho,,-0.811302,,,,,,0.835986,-0.833766,-0.834805,...,-0.832468,,0.842481,-0.835986,-0.835986,,,,0.472231,
South Sudan,,,,,,,,,,,...,,,,,,,,,,
Azerbaijan,-0.838985,-0.894446,,,-0.635891,-0.656698,0.689185,,0.680741,-0.709262,...,-0.668831,,,0.641117,-0.599545,,,-0.807018,-0.680312,0.668831
