# Creating the Data for the Map
---
The first step is to clean the dataset to be able to map it correctly.

In [1]:
import numpy as np
import pandas as pd
import pycountry
import collections
import json

DATA_DIR = '../Data/'

Load the data:

In [2]:
data = pd.concat(pd.read_excel(DATA_DIR + 'PARTICIPANT LISTS - 2013-present.xlsx', sheet_name=None, header=1))
display(data.head(), data.shape)

Unnamed: 0,Unnamed: 1,P/I,NAME,E-MAIL,AFFILIATION TYPE,AFFILIATION NAME,COUNTRY,TYPE,COUNTRY.1,START DATE,LENGTH (# OF DAYS),100 LEVEL (INTRO),200 LEVEL (INTERMEDIATE),300 LEVEL (ADVANCED)
2013,0,I,Mary Ruckelshaus,,Academic,Natural Capital Project,USA,Nodal,Argentina,2013-09-12,2.0,1.0,1.0,
2013,1,I,Anne Guerry,,Academic,Natural Capital Project,USA,Nodal,Argentina,2013-09-12,2.0,1.0,1.0,
2013,2,I,Spencer Wood,,Academic,Natural Capital Project,USA,Nodal,Argentina,2013-09-12,2.0,1.0,1.0,
2013,3,I,Robert Griffin,,Academic,Natural Capital Project,USA,Nodal,Argentina,2013-09-12,2.0,1.0,1.0,
2013,4,I,Jess Silver,,Academic,Natural Capital Project,USA,Nodal,Argentina,2013-09-12,2.0,1.0,1.0,


(4192, 13)

Look at the unique country names in both country columns:

In [3]:
pd.concat([data['COUNTRY'], data['COUNTRY.1']]).unique()

array(['USA', 'Argentina', 'Chile', 'Thailand', 'Vietnam', 'Cambodia',
       'Kyrgystan', 'United Kingdom', 'Mozambique', 'Peru', 'Colombia',
       'Brazil', 'Ecuador', 'Germany', 'Malaysia', 'Sweden', 'Australia',
       'Denmark', 'Switzerland', 'Indonesia', 'Japan', 'Laos',
       'Netherlands', 'New Zealand', 'China', 'Mexico', 'Bulgaria',
       'Cameroon', 'Philippines', 'Singapore', 'Hungary', 'Kenya',
       'Canada', 'Costa Rica', 'Nicaragua', 'Panama',
       'República Dominicana', nan, 'Guatemala', 'Brasil', 'Mongolia',
       'Spain', 'Portugal', 'Georgia', 'Italy', 'Belgium', 'India',
       'Bolivia', 'France', 'Trinidad and Tobago', 'Lithuania', 'Nepal',
       'Israel', 'Norway', 'Korea', 'Belgian', 'Greece', 'Barbados',
       'Trinidad & Tobago', 'Bhutan', 'Myanmar', 'Kazakhstan', 'Sri Lanka',
       'Ghana', 'Ethiopia', 'Tanzania', 'South Africa', 'Jordan',
       'Bangladesh', 'UK', 'Tunisia', 'Slovenia', 'United Arab Emirates',
       'Romania', 'Austria', 'Zimb

Look at the country names `pycountry` can understand (because we will use it to get the country codes for the map:

In [4]:
np.array([country.name for country in pycountry.countries])

array(['Aruba', 'Afghanistan', 'Angola', 'Anguilla', 'Åland Islands',
       'Albania', 'Andorra', 'United Arab Emirates', 'Argentina',
       'Armenia', 'American Samoa', 'Antarctica',
       'French Southern Territories', 'Antigua and Barbuda', 'Australia',
       'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin',
       'Bonaire, Sint Eustatius and Saba', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas', 'Bosnia and Herzegovina',
       'Saint Barthélemy', 'Belarus', 'Belize', 'Bermuda',
       'Bolivia, Plurinational State of', 'Brazil', 'Barbados',
       'Brunei Darussalam', 'Bhutan', 'Bouvet Island', 'Botswana',
       'Central African Republic', 'Canada', 'Cocos (Keeling) Islands',
       'Switzerland', 'Chile', 'China', "Côte d'Ivoire", 'Cameroon',
       'Congo, The Democratic Republic of the', 'Congo', 'Cook Islands',
       'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica', 'Cuba',
       'Curaçao', 'Christmas Island', 'Cayman Islands', 'Cyprus',
 

Create the corrections as a dictionary:

In [5]:
corrections = {
    'Belgian' : 'Belgium',
    'Bolivia' : 'Bolivia, Plurinational State of',
    'Brasil' : 'Brazil',
    "Côte-d'Ivoire" : "Côte d'Ivoire",
    'England' : 'United Kingdom',
    'Korea' : 'Korea, Republic of',
    'Kyrgystan' : 'Kyrgyzstan',
    'Laos' : "Lao People's Democratic Republic",
    'Not sure Not an SAPECS person…' : None,
    'República Dominicana' : 'Dominican Republic',
    'Russia' : 'Russian Federation',
    'Scotland' : 'United Kingdom', 
    'South Korea' : 'Korea, Republic of',
    'Tanzania' : 'Tanzania, United Republic of',
    'Trinidad & Tobago' : 'Trinidad and Tobago',
    'UK' : 'United Kingdom',
    'USA' : 'United States',
    'Vietnam' : 'Viet Nam',
    'Virtual' : None
}

corrections

{'Belgian': 'Belgium',
 'Bolivia': 'Bolivia, Plurinational State of',
 'Brasil': 'Brazil',
 "Côte-d'Ivoire": "Côte d'Ivoire",
 'England': 'United Kingdom',
 'Korea': 'Korea, Republic of',
 'Kyrgystan': 'Kyrgyzstan',
 'Laos': "Lao People's Democratic Republic",
 'Not sure Not an SAPECS person…': None,
 'República Dominicana': 'Dominican Republic',
 'Russia': 'Russian Federation',
 'Scotland': 'United Kingdom',
 'South Korea': 'Korea, Republic of',
 'Tanzania': 'Tanzania, United Republic of',
 'Trinidad & Tobago': 'Trinidad and Tobago',
 'UK': 'United Kingdom',
 'USA': 'United States',
 'Vietnam': 'Viet Nam',
 'Virtual': None}

Replace the wrong versions with the right versions:

In [6]:
data.replace(to_replace=corrections, inplace=True)

Drop the missing countries because there is no way to map them:

In [7]:
data.dropna(subset=['COUNTRY', 'COUNTRY.1'], inplace=True)  # Drop unknown countries for the country data

Check that `pycountry` can get the codes for all the countries without errors:

In [8]:
country_names = pd.concat([data['COUNTRY'], data['COUNTRY.1']]).unique()
country_codes = np.array([pycountry.countries.get(name=country).alpha_2 for country in country_names])
code_mapping = dict(zip(country_names, country_codes))
code_mapping

{'United States': 'US',
 'Argentina': 'AR',
 'Chile': 'CL',
 'Thailand': 'TH',
 'Viet Nam': 'VN',
 'Cambodia': 'KH',
 'Kyrgyzstan': 'KG',
 'United Kingdom': 'GB',
 'Mozambique': 'MZ',
 'Peru': 'PE',
 'Colombia': 'CO',
 'Brazil': 'BR',
 'Ecuador': 'EC',
 'Germany': 'DE',
 'Malaysia': 'MY',
 'Sweden': 'SE',
 'Australia': 'AU',
 'Denmark': 'DK',
 'Switzerland': 'CH',
 'Indonesia': 'ID',
 'Japan': 'JP',
 "Lao People's Democratic Republic": 'LA',
 'Netherlands': 'NL',
 'New Zealand': 'NZ',
 'China': 'CN',
 'Mexico': 'MX',
 'Bulgaria': 'BG',
 'Cameroon': 'CM',
 'Philippines': 'PH',
 'Singapore': 'SG',
 'Hungary': 'HU',
 'Kenya': 'KE',
 'Canada': 'CA',
 'Costa Rica': 'CR',
 'Nicaragua': 'NI',
 'Panama': 'PA',
 'Dominican Republic': 'DO',
 'Guatemala': 'GT',
 'Mongolia': 'MN',
 'Spain': 'ES',
 'Portugal': 'PT',
 'Georgia': 'GE',
 'Italy': 'IT',
 'Belgium': 'BE',
 'India': 'IN',
 'Bolivia, Plurinational State of': 'BO',
 'France': 'FR',
 'Trinidad and Tobago': 'TT',
 'Lithuania': 'LT',
 'Nepal'

Replace country names in the data with the codes:

In [9]:
data.replace(to_replace=code_mapping, inplace=True)

Save the new data:

In [12]:
data

Unnamed: 0,Unnamed: 1,P/I,NAME,E-MAIL,AFFILIATION TYPE,AFFILIATION NAME,COUNTRY,TYPE,COUNTRY.1,START DATE,LENGTH (# OF DAYS),100 LEVEL (INTRO),200 LEVEL (INTERMEDIATE),300 LEVEL (ADVANCED)
2013,0,I,Mary Ruckelshaus,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,1,I,Anne Guerry,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,2,I,Spencer Wood,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,3,I,Robert Griffin,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,4,I,Jess Silver,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,5,I,Douglas Denu,,Academic,Natural Capital Project,US,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,6,P,Ana Liberoff,ana.liberoff@gmail.com,Gov't,CONICET - Centro Nacional Patagónico - Puerto ...,AR,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,7,P,María Cecilia Brand,cecibrand@hotmail.com,Gov't,CONICET - Esquel,AR,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,8,P,Jorge Kuroda,chimekuroda@yahoo.com.ar,NGO,Centro de Ecología Aplicada de Nequén,AR,Nodal,AR,2013-09-12,2.0,1.0,1.0,
2013,9,P,Daniel Fernandez,dfernandez.ush@gmail.com,Gov't,CONICET - Centro Austral de Investigaciones Ci...,AR,Nodal,AR,2013-09-12,2.0,1.0,1.0,


In [21]:
data.to_csv(DATA_DIR + 'clean_countries.csv', index_label=['Year', 'Index'])

# Processing the Data for the Map
---
The second step is to aggregate the data and count the number of trainees from each country and the number of trainings held in each country.