# GROUPING

After preparing the data for a more exhaustive analysis, the last step is to separate the countries into groups with common characteristics. This is necessary because an aggregate study of all countries would erase the differences between them and miss important conclusions, making our study too shallow.

Grouping them by similar characteristics will simplify our analysis, all while reducing the data loss and stressing common patterns among countries in the same group.

In [1]:
import os
import numpy as np
import pandas as pd
from Project.Utils.aggregate import aggregate


# Names of the index levels / columns used throughout the notebook.
column_country = 'Country'
column_year = 'Year'
column_region = 'Region'

# Input files, resolved relative to the current working directory.
read_path = os.getcwd()
region_url = os.path.join(read_path, 'Databases', 'AuxiliarData', 'world-regions.csv')
df_url = os.path.join(read_path, 'Output', 'SilverDataframe.csv')

# Output folders. The empty last component passed to os.path.join keeps a
# single trailing separator on each folder, so code that appends a file
# name directly to one of them still yields a valid path.
write_path = os.path.join(os.getcwd(), 'Output', '')
country_folder = os.path.join(write_path, 'Country', '')
region_folder = os.path.join(write_path, 'Region', '')
aggregate_folder = os.path.join(region_folder, 'Aggregate', '')



## CREATING DIRECTORIES
If the destination folders do not exist, create them.

In [2]:
# Create every output folder. exist_ok=True turns the call into a no-op for
# folders that are already present, so no separate existence check (and no
# check-then-create race) is needed.
for folder in (country_folder, region_folder, aggregate_folder):
    os.makedirs(folder, exist_ok=True)

### READ THE FULL DATAFRAME AND THE REGION DATAFRAME

In [3]:
# Load the country -> region lookup table and the prepared indicator data.
# The indicator data is indexed by (country, year); the shared column_*
# constants are used so the index names stay in sync with the rest of the
# notebook.
region_df = pd.read_csv(region_url)
df = pd.read_csv(df_url, index_col=[column_country, column_year])

print(region_df)

# Drop the 'Code' and 'Year' columns of the lookup file ('Year' is that
# file's own column name, hence the literal), rename the remaining columns
# to the notebook's names, and move both into the index so the table can be
# merged on index with the indicator data.
region_df = (
    region_df
    .drop(columns=['Code', 'Year'])
    .rename(columns={
        'World Region according to the World Bank': column_region,
        'Entity': column_country,
    })
    .set_index([column_country, column_region])
)
print(region_df)

             Entity Code  Year World Region according to the World Bank
0       Afghanistan  AFG  2017                               South Asia
1           Albania  ALB  2017                  Europe and Central Asia
2           Algeria  DZA  2017             Middle East and North Africa
3    American Samoa  ASM  2017                    East Asia and Pacific
4           Andorra  AND  2017                  Europe and Central Asia
..              ...  ...   ...                                      ...
212       Venezuela  VEN  2017              Latin America and Caribbean
213         Vietnam  VNM  2017                    East Asia and Pacific
214           Yemen  YEM  2017             Middle East and North Africa
215          Zambia  ZMB  2017                       Sub-Saharan Africa
216        Zimbabwe  ZWE  2017                       Sub-Saharan Africa

[217 rows x 4 columns]
Empty DataFrame
Columns: []
Index: [(Afghanistan, South Asia), (Albania, Europe and Central Asia), (Algeria, Mid

## ADD THE PROPERTY 'REGION'
By merging both DataFrames, we will obtain a DataFrame similar to the original one, with a new level in its index: `Region`.

In [4]:
# Show the indicator data before attaching the regions.
print(df)

# Inner merge on the indexes of both frames: only rows whose index matches
# an entry of region_df are kept.
df = df.merge(region_df, how='inner', left_index=True, right_index=True)

# Show the merged result.
print(df)

                  AgriShareGDP  CreditToAgriFishForest  EmploymentRural  \
Country     Year                                                          
Afghanistan 2000      54.06300                     NaN              NaN   
            2001      54.06300                     NaN              NaN   
            2002      45.13440                     NaN              NaN   
            2003      41.90340                     NaN              NaN   
            2004      35.61280                     NaN              NaN   
...                        ...                     ...              ...   
Zimbabwe    2016       7.87399                     NaN              NaN   
            2017       8.34095                     NaN              NaN   
            2018       8.30469                     NaN              NaN   
            2019       8.17322                     NaN              NaN   
            2020      10.93630                     NaN              NaN   

                        

## EXPORT THE SEGREGATED DATAFRAMES
We will export the new DataFrame with the regions and two new kinds of DataFrame: one for every different country, and another for every region.

In [5]:
# Export the full DataFrame, which now carries the region of every country.
df.to_csv(os.path.join(write_path, 'GoldDataframe.csv'))

# Distinct countries and regions present in the data. Index levels are
# addressed by name rather than by position so this cell does not depend on
# the order of the index levels.
country_list = set(df.index.get_level_values(column_country))
region_list = set(df.index.get_level_values(column_region))

# One file per country: groupby splits the frame in a single pass instead of
# building a boolean mask over the whole frame for every country.
for country, country_rows in df.groupby(level=column_country):
    country_rows.to_csv(os.path.join(country_folder, f'{country}.csv'))

# One file per region, split the same way.
for region, region_rows in df.groupby(level=column_region):
    region_rows.to_csv(os.path.join(region_folder, f'{region}.csv'))

# REGION AGGREGATES
In order to create an aggregated DataFrame with entries for the regions we are studying, we will have to estimate the values of its indicators: the sum for absolute ones, and a population-weighted average for relative ones. The weight of the mean could also be the GDP of each country, or any other figure we consider representative enough of the country's importance in the calculation.

In [6]:
# Indicators passed to aggregate() as abs_indicators. Per the notebook text
# these are the absolute figures that are summed rather than averaged with a
# weight (aggregate() itself is defined in Project.Utils.aggregate).
abs_indicators = ['GDP', 'Population', 'TotalAgri']

# Whether to build the world aggregate from the per-region aggregates
# (faster) or from the raw per-country data (slower but more precise).
fast_aggregate = True

# Aggregate the countries of every region. The per-region results are
# collected in a list and concatenated once, instead of re-concatenating a
# growing DataFrame on every iteration.
regional_aggregates = []
for region in region_list:
    # Rows of the countries that belong to this region.
    rows_in_region = df.loc[df.index.get_level_values(column_region) == region]
    # Collapse those countries into a single entry per year for the region.
    regional_aggregates.append(
        aggregate(
            rows_in_region,
            aggregate_by=column_country,
            for_index=column_year,
            new_group_col_name=column_region,
            group_name=region,
            abs_indicators=abs_indicators,
        )
    )

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# there are no regions.
aggregate_df = pd.concat(regional_aggregates) if regional_aggregates else pd.DataFrame()

# Obtain an aggregate for the world. The result is kept in
# aggregate_df_world; this cell does not export it.
if fast_aggregate:
    aggregate_df_world = aggregate(aggregate_df, column_region, column_year, new_group_col_name=column_region, group_name='World', abs_indicators=abs_indicators)
else:
    aggregate_df_world = aggregate(df, column_country, column_year, new_group_col_name=column_region, group_name='World', abs_indicators=abs_indicators)

# Export the per-region aggregated DataFrame to a .csv file.
aggregate_df.to_csv(os.path.join(write_path, 'Aggregated_DataFrame.csv'))


                                  AgriShareGDP CreditToAgriFishForest  \
Region                       Year                                       
Middle East and North Africa 2000     9.677492               0.363875   
                             2001     9.321124               0.299737   
                             2002     9.160783               0.282691   
                             2003     9.536312               0.225336   
                             2004     9.343614               0.235886   
...                                        ...                    ...   
Latin America and Caribbean  2016     5.668804               0.769432   
                             2017     5.502124               0.856587   
                             2018     5.421312               0.851305   
                             2019     5.574387               0.833602   
                             2020     6.423174                0.83335   

                                  EmploymentRural 

In [None]:
import datetime


def timed_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure how long it takes.

    Returns:
        A ``(result, seconds)`` tuple: the value returned by ``func`` and the
        elapsed wall-clock time as a float number of seconds.
    """
    started = datetime.datetime.now()
    result = func(*args, **kwargs)
    elapsed = datetime.datetime.now() - started
    return result, elapsed.total_seconds()


# Set True to time both world-aggregation methods. False by default to save
# that time.
test_time = False

if test_time:
    # Fast method: aggregate from the already aggregated regions.
    fast_aggregate_df, fast_seconds = timed_call(
        aggregate, aggregate_df, column_region, column_year,
        new_group_col_name=column_region, group_name='World',
        abs_indicators=abs_indicators,
    )
    print(f'Fast aggregation time: {fast_seconds}')

    # Most precise method: aggregate from the raw data per country.
    precise_aggregate_df, precise_seconds = timed_call(
        aggregate, df, column_country, column_year,
        new_group_col_name=column_region, group_name='World',
        abs_indicators=abs_indicators,
    )
    print(f'Precise aggregation time: {precise_seconds}')

    # Print the results to compare.
    print(fast_aggregate_df)
    print(precise_aggregate_df)

    # Export the results to .csv files in case a more detailed analysis is
    # needed.
    fast_aggregate_df.to_csv(region_folder + 'World_Fast.csv')
    precise_aggregate_df.to_csv(region_folder + 'World.csv')
else:
    print('No aggregation time analysed. If you want to study it, set the test_time variable to True and execute again')

In [7]:
# NOTE: everything below is wrapped in a triple-quoted string literal, so
# none of it executes; it is a disabled, hand-written aggregation loop.
# NOTE(review): it looks like the predecessor of the imported aggregate()
# helper (sum for abs_indicators, Population-weighted mean otherwise) --
# confirm before deleting.
""" #Use Population as weight
col_region = 'Region'
col_year = 'Year'
col_index = [col_region, col_year]
weight_ind = 'Population'
abs_indicators = ['GDP', 'Population', 'TotalAgri']
file_suffix = ' - Aggregated.csv'

global_df = pd.DataFrame()
for region in region_list:
    #Create a new empty DataFrame for the region
    aggregate_df = pd.DataFrame()
    #Create a sub DataFrame with the entries for the region to read their figures
    region_df = df.loc[df.index.get_level_values(2) == region]
    #Create a list of the countries in the region and the years of the entries to iterate more easily over them
    country_list = set(region_df.index.get_level_values(0))
    year_list = set(region_df.index.get_level_values(1))

    for year in year_list:
        indicator_list = region_df.columns
        #Create a Dictionary to be filled with the values of the indicators
        #indicator_dict = {'Region': region, 'Year': year}
        indicator_dict = {}
        for indicator in indicator_list:
            sum = 0
            #For absolute indicators, apply the normal summatory of all terms
            if indicator in abs_indicators:
                for country in country_list:
                    c_val = region_df.loc[region_df.index == (country, year, region)].iloc[0][indicator]
                    #Ignore if NaN
                    if np.isnan(c_val):
                        continue
                    else:
                        sum += c_val

                indicator_dict[indicator] = sum
            #For relative indicators, calculate the weighted average based on population
            else:
                t_pop = 0
                for country in country_list:
                    c_ind = region_df.loc[region_df.index == (country, year, region)].iloc[0][indicator]
                    c_pop = region_df.loc[region_df.index == (country, year, region)].iloc[0][weight_ind]
                    #Ignore if any is NaN
                    if not np.isnan(c_ind) and not np.isnan(c_pop):
                        sum += c_ind*c_pop
                        t_pop += c_pop
                #If no data available, write a None to avoid dividing by 0
                if t_pop == 0 or sum == 0:
                    mean_value = None
                else:
                    mean_value = sum / t_pop
                #Fill the dictionary with the values calculated
                indicator_dict[indicator] = mean_value
            
        #Create a Series from the dictionary, including a column for the region and year, and append it to the aggregated DataFrame    
        year_series = pd.Series({col_region: region, col_year: year} | indicator_dict)
        aggregate_df = pd.concat([aggregate_df, year_series], axis = 1)
    #Transpose to make it easier to read, set the indexes and export it to a .csv
    aggregate_df = aggregate_df.transpose()
    aggregate_df.set_index(col_index, inplace = True)
    aggregate_df.to_csv(aggregate_folder + region + file_suffix)
    #Add it to the global DataFrame
    global_df = pd.concat([global_df, aggregate_df])
global_df.to_csv(aggregate_folder + 'World' + file_suffix)

print(global_df.index.names[1]) """

Year
