# GROUPING

After preparing the data for a more exhaustive analysis, the last step is to separate the different countries into groups with common characteristics. This is necessary because an aggregate study of all countries would erase the differences between them and miss important conclusions, making our study too shallow.

Grouping them by similar characteristics will simplify our analysis, all while reducing the data loss and stressing common patterns among countries in the same group.

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import datetime
from Project.Utils.aggregate import aggregate

import warnings
warnings.filterwarnings("ignore")

# Names of the index levels / columns used throughout the notebook.
col_country, col_year, col_region, col_gdp = 'Country', 'Year', 'Region', 'GDP'

# Index levels in the order Country / Region / Year.
col_index = [col_country, col_region, col_year]

# Input files, resolved against the current working directory.
read_path = os.getcwd()
region_url = f'{read_path}/Databases/AuxiliarData/world-regions-mod.csv'
df_url = f'{read_path}/Output/SilverDataframe.csv'

# Output locations. The sub-folder pieces keep their leading '/', so the
# resulting paths contain a doubled separator.
output_path = f'{os.getcwd()}/Output/'
write_path = f'{os.getcwd()}/Output/'
country_folder = f'{write_path}/Country/'
region_folder = f'{write_path}/Region/'
aggregate_folder = f'{region_folder}/Aggregate/'



## CREATING DIRECTORIES
If the destination folders do not exist, create them.

In [2]:
# Create every destination folder up front. `exist_ok=True` turns the call
# into a no-op when the folder already exists, which replaces the separate
# "exists?" check and avoids the gap between checking and creating.
for folder in (country_folder, region_folder, aggregate_folder):
    os.makedirs(folder, exist_ok=True)

## MODIFY THE REGION FILE
We downloaded a .csv file with the regions of the world according to the World Bank. However, we need to modify it slightly before using it.

Since we plan to use it repeatedly later, we will save this modified version.

In [3]:
# When True the modified region file is rebuilt on every run, even if it
# already exists. Set it to False to reuse a previously generated file.
force_region_rebuild = True

if force_region_rebuild or not os.path.exists(region_url):

    # Raw region file as downloaded.
    raw_reg_df = pd.read_csv(read_path + '/Databases/AuxiliarData/' + 'world-regions.csv')
    display(raw_reg_df)

    # Drop the columns that are not needed, rename the remaining ones to the
    # notebook's column names and use both of them as the index.
    reg_df = (
        raw_reg_df
        .drop(columns=['Code', 'Year'])
        .rename(columns={'World Region according to the World Bank': col_region, 'Entity': col_country})
        .set_index([col_country, col_region])
    )
    display(reg_df)

    reg_df.to_csv(region_url)


Unnamed: 0,Entity,Code,Year,World Region according to the World Bank
0,Afghanistan,AFG,2017,South Asia
1,Albania,ALB,2017,Europe and Central Asia
2,Algeria,DZA,2017,Middle East and North Africa
3,American Samoa,ASM,2017,East Asia and Pacific
4,Andorra,AND,2017,Europe and Central Asia
...,...,...,...,...
212,Venezuela,VEN,2017,Latin America and Caribbean
213,Vietnam,VNM,2017,East Asia and Pacific
214,Yemen,YEM,2017,Middle East and North Africa
215,Zambia,ZMB,2017,Sub-Saharan Africa


Country,Region
Afghanistan,South Asia
Albania,Europe and Central Asia
Algeria,Middle East and North Africa
American Samoa,East Asia and Pacific
Andorra,Europe and Central Asia
...,...
Venezuela,Latin America and Caribbean
Vietnam,East Asia and Pacific
Yemen,Middle East and North Africa
Zambia,Sub-Saharan Africa


### READ THE FULL DATAFRAME AND THE REGION DATAFRAME

In [4]:
# Full dataset, indexed by (country, year).
df = pd.read_csv(
    df_url,
    index_col=[col_country, col_year],
)
# Region file written earlier, indexed by (country, region).
region_df = pd.read_csv(
    region_url,
    index_col=[col_country, col_region],
)

## ADD THE PROPERTY 'REGION'
By merging both DataFrames, we will obtain a DataFrame similar to the original one with a new field for the index: `region`

In [5]:
# Show the frame before the merge so both versions can be compared.
display(df)
# Inner merge on the indexes of both frames.
df = region_df.merge(df, how='inner', left_index=True, right_index=True)
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,GDP,% Soldiers,Employment in industry,Employment in services,Birth Rate,Cost business start-up,Death Rate,...,Researchers in R&D,R&D expenditure %GDP,% Rural Population,Tertiary School Gender Parity,% Vulnerable female employment,% Vulnerable male employment,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,2000,54.06300,,,3342.034168,7.887961,9.48,24.680000,48.021,72.0,11.718,...,,,77.922,,98.720002,91.879999,0.400,0.625,1.21,20779957.0
Afghanistan,2001,54.06300,,,3598.470576,5.020511,8.98,24.719999,47.505,72.0,11.387,...,,,77.831,,98.760003,92.399998,0.400,0.625,1.21,21606992.0
Afghanistan,2002,45.13440,,,4141.523943,2.153062,9.99,25.590000,46.901,72.0,11.048,...,,,77.739,,98.669999,91.460001,0.400,0.625,1.21,22600774.0
Afghanistan,2003,41.90340,,,4729.042179,2.208290,10.35,25.950001,46.231,72.0,10.704,...,,,77.647,,98.599998,91.039999,0.403,0.687,5.46,23680871.0
Afghanistan,2004,35.61280,,,5388.482107,0.435599,10.61,26.120001,45.507,72.0,10.356,...,,,77.500,,98.549998,90.960003,0.403,0.677,3.60,24726689.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2016,7.87399,,,20548.678073,0.759750,7.05,26.070000,32.864,121.5,8.286,...,,,67.704,,75.970001,56.789999,0.430,0.389,,14030338.0
Zimbabwe,2017,8.34095,,,22040.902301,0.750720,6.90,26.629999,31.732,110.0,8.044,...,,,67.763,,76.579998,56.609999,0.488,0.431,,14236599.0
Zimbabwe,2018,8.30469,,,24311.560545,0.738210,6.75,27.230000,30.676,110.7,7.883,...,,,67.791,,77.170002,56.380000,0.447,0.471,,14438812.0
Zimbabwe,2019,8.17322,,,21935.075306,0.738210,6.57,27.240000,29.747,76.6,7.773,...,,,67.790,,79.299999,57.090001,0.403,0.434,,14645473.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,GDP,% Soldiers,Employment in industry,Employment in services,Birth Rate,Cost business start-up,Death Rate,...,Researchers in R&D,R&D expenditure %GDP,% Rural Population,Tertiary School Gender Parity,% Vulnerable female employment,% Vulnerable male employment,Civil Liberties,Freedom of Expression,% Healthcare Investment,Population
Country,Region,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Afghanistan,South Asia,2000,54.06300,,,3342.034168,7.887961,9.48,24.680000,48.021,72.0,11.718,...,,,77.922,,98.720002,91.879999,0.400,0.625,1.21,20779957.0
Afghanistan,South Asia,2001,54.06300,,,3598.470576,5.020511,8.98,24.719999,47.505,72.0,11.387,...,,,77.831,,98.760003,92.399998,0.400,0.625,1.21,21606992.0
Afghanistan,South Asia,2002,45.13440,,,4141.523943,2.153062,9.99,25.590000,46.901,72.0,11.048,...,,,77.739,,98.669999,91.460001,0.400,0.625,1.21,22600774.0
Afghanistan,South Asia,2003,41.90340,,,4729.042179,2.208290,10.35,25.950001,46.231,72.0,10.704,...,,,77.647,,98.599998,91.039999,0.403,0.687,5.46,23680871.0
Afghanistan,South Asia,2004,35.61280,,,5388.482107,0.435599,10.61,26.120001,45.507,72.0,10.356,...,,,77.500,,98.549998,90.960003,0.403,0.677,3.60,24726689.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,Sub-Saharan Africa,2016,7.87399,,,20548.678073,0.759750,7.05,26.070000,32.864,121.5,8.286,...,,,67.704,,75.970001,56.789999,0.430,0.389,,14030338.0
Zimbabwe,Sub-Saharan Africa,2017,8.34095,,,22040.902301,0.750720,6.90,26.629999,31.732,110.0,8.044,...,,,67.763,,76.579998,56.609999,0.488,0.431,,14236599.0
Zimbabwe,Sub-Saharan Africa,2018,8.30469,,,24311.560545,0.738210,6.75,27.230000,30.676,110.7,7.883,...,,,67.791,,77.170002,56.380000,0.447,0.471,,14438812.0
Zimbabwe,Sub-Saharan Africa,2019,8.17322,,,21935.075306,0.738210,6.57,27.240000,29.747,76.6,7.773,...,,,67.790,,79.299999,57.090001,0.403,0.434,,14645473.0


## EXPORT THE SEGREGATED DATAFRAMES
We will export the new DataFrame with the regions and two new kinds of DataFrame: one for every different country, and another for every region.

In [6]:
# Export the complete frame (now including the region level).
df.to_csv(write_path + '/GoldDataframe.csv')

# Distinct countries and regions; later cells iterate over these sets.
country_list = set(df.index.get_level_values(col_country))
region_list = set(df.index.get_level_values(col_region))

# One .csv per country. groupby splits the frame in a single pass instead of
# building a boolean mask over the whole frame for every country.
for country, country_rows in df.groupby(level=col_country):
    country_rows.to_csv(country_folder + country + '.csv')

# One .csv per region, split the same way.
for region, region_rows in df.groupby(level=col_region):
    region_rows.to_csv(region_folder + region + '.csv')

# REGION AGGREGATES
In order to create an aggregated DataFrame with entries for the regions we are studying, we will have to estimate the values of its indicators: the sum for absolute ones, and a population-weighted average for the relative ones. The weight of the mean could also be set to the GDP of each country, or any other figure we consider representative enough of the country's importance in the calculation.

In [7]:
#Indicators that must be aggregated as a sum instead of a weighted average.
abs_indicators = ['GDP', 'Population', 'TotalAgri']

#Whether to build the world aggregate from the regional aggregates (faster) or from the raw per-country data (slower but more precise).
fast_aggregate = True

#Aggregated DataFrame of each region, concatenated once the loop is done.
aggregated_list = []

for region in region_list:
    #Rows of the countries that belong to this region. A dedicated name is used so that
    #the `region_df` lookup table read from the region file is not overwritten.
    region_countries_df = df.loc[df.index.get_level_values(col_region) == region]
    #Aggregate the countries of the region into a single entry per year.
    aggregated_region_df = aggregate(region_countries_df, aggregate_by = col_country, for_index = col_year, new_group_col_name = col_region, group_name = region, abs_indicators = abs_indicators)
    aggregated_list.append(aggregated_region_df)

#region_list is a set, so the iteration order is arbitrary; sorting the index makes the result stable.
aggregate_df = pd.concat(aggregated_list).sort_index()

#Obtain an aggregate for the world
col_world = 'World'

if fast_aggregate:
    #Aggregate the already aggregated regions.
    aggregate_df_world = aggregate(aggregate_df, col_region, col_year, new_group_col_name = col_region, group_name = col_world, abs_indicators = abs_indicators)
else:
    #Aggregate straight from the per-country data.
    aggregate_df_world = aggregate(df, col_country, col_year, new_group_col_name = col_region, group_name = col_world, abs_indicators = abs_indicators)

#Export the aggregated DataFrames to .csv files
aggregate_df.to_csv(write_path + 'AggregatedRegion_DataFrame.csv')

aggregate_df_world.to_csv(write_path + 'AggregatedWorld_DataFrame.csv')

## ANALYSE AGGREGATION TIME
Since it is a time-consuming process, the total time needed can be measured in the following cell if desired. Otherwise, it will be skipped.

In [8]:
def timed_call(label, func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and report how long it took.

    Prints ``label`` followed by the elapsed wall-clock seconds and returns a
    ``(result, elapsed)`` tuple, where ``elapsed`` is a ``datetime.timedelta``.
    """
    #The file does `import datetime`, so the class has to be reached through the module.
    start = datetime.datetime.now()
    result = func(*args, **kwargs)
    elapsed = datetime.datetime.now() - start
    #total_seconds() is a float: convert it before concatenating with the label.
    print(label + str(elapsed.total_seconds()))
    return result, elapsed


#Set True if you want to analyse the time for both methods. By default is False to save that time.
test_time = False

if test_time:
    #Count time for the fast method: aggregating from the aggregated regions.
    fast_aggregate_df, fast_time = timed_call('Fast aggregation time: ', aggregate, aggregate_df, col_region, col_year, new_group_col_name = col_region, group_name = 'World', abs_indicators = abs_indicators)

    #Count time for the most precise method: aggregating from the raw data per country.
    precise_aggregate_df, precise_time = timed_call('Precisse aggregation time: ', aggregate, df, col_country, col_year, new_group_col_name = col_region, group_name = 'World', abs_indicators = abs_indicators)

    #Print the results to compare.
    display(fast_aggregate_df)
    display(precise_aggregate_df)

    #Export the results to .csv files if a more detailed analysis is needed.
    fast_aggregate_df.to_csv(region_folder + 'World_Fast.csv')
    precise_aggregate_df.to_csv(region_folder + 'World.csv')
else:
    print('No aggregation time analysed. If you want to study it, set the test_time variable to True and execute again')

No aggregation time analysed. If you want to study it, set the test_time variable to True and execute again


### Variables that can be changed
The p-value threshold can be changed to any desired value. The recommended default is 0.05. If you want a higher confidence level, lower this variable. Strictly speaking, this variable is the significance level, but it is given this name for clarity:

We do the %store in order to pass this variable to other notebooks.

In [9]:
# Threshold the correlation p-values are compared against: a correlation is
# kept only when its p-value is below this value.
PVALUE_VAR = 0.05

# IPython magic: stores the variable so that other notebooks can load it.
%store PVALUE_VAR

Stored 'PVALUE_VAR' (float)


## Correlation dataframe.
This dataframe is the main piece of the notebook. It is built by generating the correlation matrix of every country and keeping only the correlation of each variable with the GDP. 
This code block also computes the p-value of the Pearson and Spearman correlations; if the p-value is not below PVALUE_VAR, the correlation is discarded for lack of statistical significance.

The per-country results are then concatenated, producing the following result:

## Saving Correlation Dataframe Pearson

Use the same code as before, but this time save the correlation dataframe.

In [10]:
#Reload the exported DataFrame with its three index levels.
df = pd.read_csv(write_path + 'GoldDataframe.csv', index_col = [col_country, col_region, col_year])
#List of Series (one per country) that will be concatenated.
corr_list = []

for country in country_list:
    #Get the DataFrame for a given country.
    country_df = df.loc[df.index.get_level_values(col_country) == country]

    #Correlation matrix for that country, using Pearson.
    country_corr_df = country_df.corr(method = 'pearson')

    #p-value of every pairwise correlation. With a callable, DataFrame.corr fills the
    #diagonal with 1, so subtracting the identity matrix sets the diagonal to 0.
    pval = country_df.corr(method = lambda x, y: pearsonr(x, y)[1]) - np.eye(*country_corr_df.shape)

    #Keep only the significant correlations; the rest (including undefined p-values) become NaN.
    #This is a vectorised comparison, so it needs neither the deprecated `applymap`
    #nor the `np.NaN` alias that was removed in NumPy 2.0.
    country_corr_df = country_corr_df.where(pval < PVALUE_VAR)

    #Correlation of every indicator with the GDP, as a Series named after the country
    #(the GDP-with-itself entry is dropped).
    country_corr_series = country_corr_df[col_gdp].drop(index = col_gdp).rename(country)
    corr_list.append(country_corr_series)

#Concat all the series, transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = pd.concat(corr_list, axis = 1).transpose().sort_index()
corr_df.index.names = [col_country]
corr_df.to_csv(output_path + '/Corr_DF_pearson.csv')
corr_df

KeyError: 'Requested level (Country) does not match index name (None)'

Saving the correlation dataframe using spearman.

In [None]:
#Reload the exported DataFrame with its three index levels. Without `index_col` the frame
#has a plain integer index and `get_level_values(col_country)` below raises a KeyError.
df = pd.read_csv(write_path + 'GoldDataframe.csv', index_col = [col_country, col_region, col_year])
#List of Series (one per country) that will be concatenated.
corr_list = []

for country in country_list:
    #Get the DataFrame for a given country.
    country_df = df.loc[df.index.get_level_values(col_country) == country]

    #Correlation matrix for that country, using Spearman.
    country_corr_df = country_df.corr(method = 'spearman')

    #p-value of every pairwise correlation. With a callable, DataFrame.corr fills the
    #diagonal with 1, so subtracting the identity matrix sets the diagonal to 0.
    pval = country_df.corr(method = lambda x, y: spearmanr(x, y)[1]) - np.eye(*country_corr_df.shape)

    #Keep only the significant correlations; the rest (including undefined p-values) become NaN.
    #This is a vectorised comparison, so it needs neither the deprecated `applymap`
    #nor the `np.NaN` alias that was removed in NumPy 2.0.
    country_corr_df = country_corr_df.where(pval < PVALUE_VAR)

    #Correlation of every indicator with the GDP, as a Series named after the country
    #(the GDP-with-itself entry is dropped).
    country_corr_series = country_corr_df[col_gdp].drop(index = col_gdp).rename(country)
    corr_list.append(country_corr_series)

#Concat all the series, transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = pd.concat(corr_list, axis = 1).transpose().sort_index()
corr_df.index.names = [col_country]
corr_df.to_csv(output_path + '/Corr_DF_spearman.csv')
corr_df

Unnamed: 0_level_0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,Maternal Death Risk,...,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,-0.864566,,,,,-0.914286,-0.914286,,0.914286,-0.905542,...,,-0.914286,,0.666451,,-0.725737,,,,0.914286
Albania,-0.486522,,,0.919481,-0.782836,-0.854823,0.885714,-0.577374,0.889610,-0.782415,...,,-0.889610,0.796368,,,0.674271,,,,-0.888312
Algeria,,,,0.916883,-0.755527,0.874026,,,0.750649,,...,0.797403,-0.750649,0.696313,,0.629927,0.756356,,,,0.750649
Andorra,,,,,,,,,,,...,,,,,,,,,,0.879064
Angola,,,,,-0.755527,-0.753247,-0.753247,,0.753247,-0.776551,...,,-0.753247,0.744056,0.748459,,,,,,0.753247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uzbekistan,-0.714518,,,,-0.672302,0.607537,-0.675026,,0.850649,-0.812382,...,0.759740,-0.836364,,,0.925625,0.904891,,,,0.850649
Vanuatu,,-0.569106,,0.798701,,-0.971429,-0.944120,,0.971429,-0.975901,...,,-0.971429,0.827089,,-0.813901,0.471664,,,,0.971429
Yemen,,,,0.657143,,,,,,,...,,,,,-0.757143,0.836364,,,,0.546753
Zambia,-0.815200,0.728456,,0.946532,-0.850456,-0.837662,-0.837662,,0.837662,-0.856682,...,0.884416,-0.837662,-0.658414,-0.531383,-0.547724,0.927041,,,,0.837662


## Cleaned GoldDataframe 
The correlation dataframe generated above flagged every correlation with a high p-value. To perform the following analyses a clean GoldDataframe is needed: whenever the correlation between an indicator and the GDP is NaN for a country, the whole column of that indicator is set to NaN for that country.

We use the spearman method.

In [None]:
for country in country_list:
    #Indicators whose correlation with the GDP is NaN for this country.
    dropped_indicators = corr_df.columns[corr_df.loc[country].isna().to_numpy()]
    if dropped_indicators.empty:
        continue

    #Build the row mask once per country and blank all those indicators in one assignment.
    #`np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    country_rows = df.index.get_level_values(col_country) == country
    df.loc[country_rows, dropped_indicators] = np.nan

df.to_csv(write_path + 'GoldDataframe_significant.csv')