# Collecting country data

Many of the tutorials in this class have so far focused on processing at the country level. 

However, to produce a global map we need to do three things:
    
1. Create a composite set of global shapefiles based on our chosen GID level for each country.
2. Collect country-level data estimates and collate them into a single file.
3. Merge data produced in 1 and 2, and then visualize. 

Examples will now be given for each of these steps. 

## Creating a composite set of global shapefiles

We can load in our shapes country-by-country and then append them to a list as a geojson structure. 

Finally, we can convert them to a `geopandas.GeoDataFrame` using the `from_features()` function, and then export them to a .shp file.

Let us begin by processing our regions, and then we can collect them all:

In [None]:
import os
import pandas
import geopandas
from shapely.geometry import MultiPolygon

##first we need to define a function which removes small shapes
def remove_small_shapes(x):
    """
    Remove small parts from a MultiPolygon feature.

    Intended to be applied row-wise to a GeoDataFrame, e.g.
    ``regions.apply(remove_small_shapes, axis=1)``.

    Parameters
    ----------
    x : pandas.Series
        Row providing a ``geometry`` entry and, for MultiPolygon
        geometries, a ``GID_0`` entry holding the country code.

    Returns
    -------
    geometry : shapely geometry or None
        The input geometry unchanged when it is missing, is not a
        MultiPolygon, is smaller than the minimum total area, or when no
        part exceeds the threshold. Otherwise a new MultiPolygon holding
        only the parts whose area exceeds the threshold.

    """
    geom = x.geometry

    # Only MultiPolygons have parts to filter. Anything else (including a
    # missing geometry) is handed back as-is, so that a row never has its
    # geometry silently replaced by None.
    if geom is None or geom.geom_type != 'MultiPolygon':
        return geom

    # Areas are compared in the squared units of the geometry's own
    # coordinate system; the values below are the tutorial's defaults.
    area1 = 0.003  # features below this total area are left untouched
    area2 = 50     # features above this total area use the coarse threshold

    if geom.area < area1:
        return geom

    if x['GID_0'] in ['CHL', 'IDN', 'RUS', 'GRL', 'CAN', 'USA']:
        threshold = 0.01
    elif geom.area > area2:
        threshold = 0.1
    else:
        threshold = 0.001

    new_geom = [part for part in geom.geoms if part.area > threshold]

    # If every part is below the threshold, keep the original geometry
    # rather than returning an empty MultiPolygon, which would erase the
    # region from the output entirely.
    if not new_geom:
        return geom

    return MultiPolygon(new_geom)

##now we can state our processing code
# Load the list of countries, which states the preferred GID level
# (`gid_region`) for each ISO3 code.
path = os.path.join('..', 'data', 'countries.csv')
countries = pandas.read_csv(path, encoding='latin-1')

for _, country_row in countries.iterrows():

    iso3 = country_row['iso3']

    # This example is restricted to two countries; extend or remove this
    # filter to process more.
    if iso3 not in ['CAN', 'GBR']:
        continue

    gid_region = country_row['gid_region']

    filename = "gadm36_{}.shp".format(gid_region)
    folder = os.path.join("..", "data", "raw", "gadm36_levels_shp")
    path_in = os.path.join(folder, filename)

    # read_file takes the coordinate reference system from the file itself,
    # so no `crs` argument is passed here (it is not a read option).
    boundaries = geopandas.read_file(path_in)

    # Take an explicit copy so the assignments below modify our own frame
    # rather than a slice of `boundaries`.
    regions = boundaries[boundaries['GID_0'] == iso3].copy()

    if regions.empty:
        print("No boundaries found for {}".format(iso3))
        continue

    regions["geometry"] = regions.geometry.simplify(
        tolerance=0.01, preserve_topology=True)

    regions['geometry'] = regions.apply(
        remove_small_shapes, axis=1)

    filename = "regional_shapes_GID_{}.shp".format(gid_region)
    folder = os.path.join("..", "data", "processed", iso3, "regions")
    os.makedirs(folder, exist_ok=True)
    path_out = os.path.join(folder, filename)

    regions.to_file(path_out, crs='epsg:4326')

    print("Processed regional boundaries for {}".format(iso3))


Now we can create our global composite shapefile. 

In [None]:
# Example
import os
import pandas 
import geopandas as gpd

path = os.path.join('..', 'data', 'countries.csv')
countries = pandas.read_csv(path, encoding='latin-1')

# one geojson-style feature per region, collected across all countries
output = []

for _, country in countries.iterrows():

    iso3 = country['iso3']

    # this example is restricted to two countries; extend or remove this
    # filter to build a truly global composite
    if iso3 not in ['CAN', 'GBR']:
        continue

    print('Working on {}'.format(iso3))

    #define our country-specific parameters, including gid information
    gid_region = country['gid_region']
    gid_level = 'GID_{}'.format(gid_region)

    #set the filename depending on our preferred regional level
    filename = "regional_shapes_GID_{}.shp".format(gid_region)
    folder = os.path.join('..', 'data', 'processed', iso3, 'regions')

    #then load in our regions as a geodataframe (the coordinate reference
    #system is taken from the file, so no `crs` argument is needed)
    path_regions = os.path.join(folder, filename)
    regions = gpd.read_file(path_regions)

    #copy the country-specific GID column to a common `gid_id` column, so
    #that regions at different GID levels share a single identifier
    regions['gid_id'] = regions[gid_level]
    regions = regions[['geometry', 'gid_id']]

    for _, region in regions.iterrows():
        output.append({
            'geometry': region['geometry'],
            'properties': {
                'gid_id': region['gid_id']
            }
        })

output = gpd.GeoDataFrame.from_features(output, crs='epsg:4326')

filename = 'global_boundaries_composite.shp'
path_out = os.path.join('..', 'data', 'processed', filename)
output.to_file(path_out, crs='epsg:4326')

print('Processing complete')

## Collecting all data

We can collect all data from our country folders using a loop.

First we can generate a dummy example, with a .csv created for two countries. You will have your own data to collect which relates to your own research topic (therefore, you will need to adapt this code).


In [None]:
import os

import geopandas as gpd
import pandas

path = os.path.join('..', 'data', 'countries.csv')
countries = pandas.read_csv(path, encoding='latin-1')

for _, country in countries.iterrows():

    iso3 = country['iso3']

    # this example is restricted to two countries
    if iso3 not in ['CAN', 'GBR']:
        continue

    print('Working on {}'.format(iso3))

    #a fresh list per country, as each country gets its own .csv
    output = []

    #define our country-specific parameters, including gid information
    gid_region = country['gid_region']
    gid_level = 'GID_{}'.format(gid_region)

    #set the filename depending on our preferred regional level
    #here we import the shapes as a dummy example
    filename = "regional_shapes_GID_{}.shp".format(gid_region)
    folder = os.path.join('..', 'data', 'processed', iso3, 'regions')
    path_regions = os.path.join(folder, filename)
    regions = gpd.read_file(path_regions)
    regions['gid_id'] = regions[gid_level]
    regions = regions[['geometry', 'gid_id']]

    #the row index stands in for a real population estimate
    for region_idx, region in regions.iterrows():
        output.append({
            'gid_id': region['gid_id'],
            'population': region_idx
        })

    #convert from list of dicts to pandas df
    output = pandas.DataFrame(output)

    filename = 'population.csv'
    path_out = os.path.join('..', 'data', 'processed', iso3, filename)
    output.to_csv(path_out, index=False)

    print('Processing complete')

Now we can collect this data.

In [None]:
import os
import pandas 

path = os.path.join('..', 'data', 'countries.csv')
countries = pandas.read_csv(path, encoding='latin-1')

# one dict per region, accumulated across all countries
output = []

for _, country in countries.iterrows():

    iso3 = country['iso3']

    # this example is restricted to two countries; extend or remove this
    # filter to collect data for more
    if iso3 not in ['CAN', 'GBR']:
        continue

    print('Working on {}'.format(iso3))

    #each country folder holds its own population estimates
    filename = "population.csv"
    folder = os.path.join('..', 'data', 'processed', iso3)

    #load the country-level estimates as a dataframe
    path_population = os.path.join(folder, filename)
    population = pandas.read_csv(path_population)

    #extend in place rather than rebuilding the list on every iteration
    output.extend(population.to_dict('records'))

output = pandas.DataFrame(output)

filename = 'global_population_data.csv'
path_out = os.path.join('..', 'data', 'processed', filename)
output.to_csv(path_out, index=False)

print('Processing complete')

## Global map

To create a global map, we only have to import our boundary composite and global data, and then merge prior to plotting.



In [None]:
# Example
import os

import geopandas
import matplotlib.pyplot as plt
import pandas
import seaborn as sns

#import our boundaries data
filename = 'global_boundaries_composite.shp'
path_in = os.path.join('..', 'data', 'processed', filename)
boundaries = geopandas.read_file(path_in)

#import our dummy population data
filename = 'global_population_data.csv'
path_in = os.path.join('..', 'data', 'processed', filename)
data = pandas.read_csv(path_in)

#merge our dummy population data onto our boundaries
#(regions without a matching `gid_id` in the data are dropped)
boundaries = boundaries.merge(data, on='gid_id')

#define dummy value bins and then labels for each one
bins = [-1e6, 20, 40, 60, 80, 1e12]
labels = ['<20','20-40','40-60','60-80','>80']

#create a new variable with our dummy bin labels
boundaries['bin'] = pandas.cut(
    boundaries['population'],
    bins=bins,
    labels=labels
)

#open a new seaborn figure
sns.set(font_scale=0.9)
fig, ax = plt.subplots(1, 1, figsize=(4.5, 4.5))

#now plot our data using the geopandas plot method
#('inferno_r' is an alternative colormap to try)
boundaries.plot(column='bin', ax=ax, cmap='viridis', linewidth=0,
    legend=True, antialiased=False)

#allocate a plot title
n = len(boundaries)
name = 'Dummy Population by Sub-Region Globally (n={})'.format(n)
fig.suptitle(name)

#specify where to write our .png file to
path = os.path.join('..', 'data', 'processed', 'fig.png')
fig.savefig(path)
plt.close(fig)


Congratulations, you should now be very close to completing the global assessment coursework!