# Atlas of Economic Complexity: growth predictions data processing and visualizations

Methodological notes, data sources and interactive visualization from http://atlas.cid.harvard.edu/rankings/growth-predictions/

Related links
* [U.N. Comtrade](http://comtrade.un.org) source of trade data
* [Atlas Data](https://github.com/cid-harvard/atlas-data) processing scripts for trade data

In [5]:
# Required modules to run this notebook
from IPython.display import IFrame
import pandas as pd

In [29]:
# Raw input data files
file_projections = 'sourceData/Growth_proj_rankings_2014.csv'
file_rank_eci = 'sourceData/ECI_rankings_2014.csv'
url_countries = 'https://raw.githubusercontent.com/mledoze/countries/master/dist/countries.json'

# Load countries metadata

In [30]:
import json
import urllib

r = urllib.request.urlopen(url_countries)
data = json.loads(r.read().decode(r.info().get_param('charset') or 'utf-8'))

In [31]:
# Flatten the countries dataset
from pandas.io.json import json_normalize
result = json_normalize(data)
df_countries = pd.DataFrame(result)

# Load growth prediction data

In [52]:
# Loading projections data
df = pd.read_csv(file_projections)

In [53]:
# Removing pre-defined ranking and country names column
# We will re-generate them later on
df.pop('rank2014');
df.pop('countryname');

In [54]:
# Turning temporal columns into rows with a column header called 'year'
df = pd.melt(df, id_vars=["iso"], var_name="year", value_name="value")

In [55]:
df.head()

Unnamed: 0,iso,year,value
0,IND,tg2004,7.15
1,UGA,tg2004,6.11
2,KEN,tg2004,6.32
3,TZA,tg2004,6.11
4,EGY,tg2004,5.0


In [56]:
# Formatting year (time) column by removing the first two chars
for index, row in df.iterrows():
    df.loc[index, "year"] = df.loc[index, "year"][2:]

In [57]:
# Do we have any missing data point? (e.g. value is NaN)
df[df['value'].isnull()]

Unnamed: 0,iso,year,value
38,SRB,2004,
614,SYR,2008,
737,SYR,2009,
860,SYR,2010,
983,SYR,2011,
1106,SYR,2012,
1229,SYR,2013,
1352,SYR,2014,


In [58]:
# Find country names with missing data points
countries_null = list(set(df[df['value'].isnull()]['iso']))
countries_null

['SRB', 'SYR']

Unnamed: 0,iso,year,value
0,IND,2004,7.15
1,UGA,2004,6.11
2,KEN,2004,6.32
3,TZA,2004,6.11
4,EGY,2004,5.00
5,MDG,2004,5.41
6,SEN,2004,5.86
7,PHL,2004,4.43
8,MWI,2004,5.72
9,ZMB,2004,5.27


In [65]:
# Discard countries with missing values
df =df.query("iso not in %s" % countries_null)

In [66]:
# Generates unique countries ranks for each year with no gaps
df['rank'] = df.groupby('year')['value'].rank(ascending=False, method='first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [67]:
# Convert rank to integer
df['rank'] = df['rank'].apply(lambda x: int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [68]:
# Make sure the ranks are sorted from 1 to Max
df.sort(['rank'], ascending=False);

In [69]:
# Preview of the resulting rankings
df[30:200]

Unnamed: 0,iso,year,value,rank
30,KGZ,2004,4.74,19
31,TUN,2004,3.28,65
32,GHA,2004,4.99,15
33,KHM,2004,3.12,73
34,KOR,2004,3.60,51
35,ZAF,2004,3.86,40
36,TJK,2004,4.19,33
37,UZB,2004,3.88,39
39,LKA,2004,3.29,64
40,PRY,2004,3.65,49


In [70]:
df = pd.merge(df, df_countries[['cca3', 'name.common']], how='left', left_on='iso', right_on='cca3')
df.pop('cca3');

In [71]:
df = df.rename(columns = {'name.common':'name'})
df[30:200]

Unnamed: 0,iso,year,value,rank,name
30,KGZ,2004,4.74,19,Kyrgyzstan
31,TUN,2004,3.28,65,Tunisia
32,GHA,2004,4.99,15,Ghana
33,KHM,2004,3.12,73,Cambodia
34,KOR,2004,3.60,51,South Korea
35,ZAF,2004,3.86,40,South Africa
36,TJK,2004,4.19,33,Tajikistan
37,UZB,2004,3.88,39,Uzbekistan
38,LKA,2004,3.29,64,Sri Lanka
39,PRY,2004,3.65,49,Paraguay


In [72]:
# Export
url_export_projections = '/Users/rvuillemot/Dev/vis-toolkit-datasets/data/atlas_growth_projections_2024.csv'
df_export = df[['rank', 'value', 'name', 'iso', 'year']]
df_export[(df_export['year'] == '2014')].to_csv(url_export_projections, index=False)

## Load ECI data

In [73]:
df_eci = pd.read_csv(file_rank_eci)
df_eci.head()

Unnamed: 0,iso,rank_eci2003,rank_eci2004,rank_eci2005,rank_eci2006,rank_eci2007,rank_eci2008,rank_eci2009,rank_eci2010,rank_eci2011,rank_eci2012,rank_eci2013,rank_eci2014
0,JPN,1,1,1,1,1,1,1,1,1,1,1,1
1,DEU,2,2,2,2,2,2,2,3,3,3,2,2
2,CHE,3,4,3,4,3,3,3,2,2,2,3,3
3,KOR,17,15,9,10,9,9,9,8,8,4,4,4
4,SWE,4,3,4,3,4,4,4,5,4,5,5,5


In [74]:
# Turning temporal columns into rows+year
df_eci = pd.melt(df_eci, id_vars=["iso"], var_name="year", value_name="rank_eci")
df_eci.head()

Unnamed: 0,iso,year,rank_eci
0,JPN,rank_eci2003,1
1,DEU,rank_eci2003,2
2,CHE,rank_eci2003,3
3,KOR,rank_eci2003,17
4,SWE,rank_eci2003,4


In [75]:
# Formatting year (time) column
for index, row in df_eci.iterrows():
    df_eci.loc[index, "year"] = df_eci.loc[index, "year"][8:]
df_eci.head()

Unnamed: 0,iso,year,rank_eci
0,JPN,2003,1
1,DEU,2003,2
2,CHE,2003,3
3,KOR,2003,17
4,SWE,2003,4


In [76]:
# Merging with countries metadata
df_eci_countries = pd.merge(df_eci, df_countries[['cca3', 'name.common']], how='left', left_on='iso', right_on='cca3')

In [77]:
df_eci_countries.pop('cca3');

In [78]:
df_eci_countries = df_eci_countries.rename(columns = {'name.common':'name'})

In [79]:
# Discard countries that don't have data for every time point between 2004 and 2014
countries_null = list(set(df_eci_countries[df_eci_countries['rank_eci'].isnull()]['iso']))
countries_null

['SRB']

In [80]:
df_eci_countries = df_eci_countries.query("iso not in ['SRB']")

In [81]:
df_eci_countries['rank_eci'] = df_eci_countries['rank_eci'].apply(lambda x: int(x))
df_eci_countries.head()

Unnamed: 0,iso,year,rank_eci,name
0,JPN,2003,1,Japan
1,DEU,2003,2,Germany
2,CHE,2003,3,Switzerland
3,KOR,2003,17,South Korea
4,SWE,2003,4,Sweden


In [82]:
df_export = df[['rank', 'name', 'iso']]
url_rankings = '/Users/rvuillemot/Dev/atlas-economic-complexity/media/growth_projections/country_rankings_and_projections_2014.csv'
df_export.to_csv(url_rankings, index = False)

# Merge Projections and ECI Rankings to generate export file

Will go on the sidebar for the page http://beta.atlas.cid.harvard.edu/rankings/

Data for 2013 predictions: http://atlas.cid.harvard.edu/media/growth_projections/country_rankings_and_projections_2013.csv has the format below

<pre>
rank,abbrv,country,eci_value,delta,year,growth_proj_annual_2024
1,JPN,Japan,2.348182,0,2013,2.13
2,CHE,Switzerland,2.331362,0,2013,3.62
3,DEU,Germany,2.03559,0,2013,-1.33
4,KOR,"Korea, Rep.",1.92968,1,2013,3.85
5,SWE,Sweden,1.817256,-1,2013,2.6
</pre>

In [83]:
df_merge = pd.merge(df_eci, df, how='inner', left_on=['iso', 'year'], right_on=['iso', 'year'])

In [84]:
df_merge = df_merge.rename(columns = {'rank':'rank_growth', 'value': 'growth_proj_annual_2024'})
df_merge['rank_eci'] = df_merge['rank_eci'].apply(lambda x: int(x))
df_merge.head()

Unnamed: 0,iso,year,rank_eci,growth_proj_annual_2024,rank_growth,name
0,JPN,2004,1,3.69,47,Japan
1,DEU,2004,2,1.38,118,Germany
2,CHE,2004,4,3.54,53,Switzerland
3,KOR,2004,15,3.6,51,South Korea
4,SWE,2004,3,3.92,38,Sweden


In [85]:
url_rankings = '/Users/rvuillemot/Dev/atlas-economic-complexity/media/growth_projections/country_rankings_and_projections_2014.csv'
df_merge.to_csv(url_rankings, index = False)

In [86]:
df_merge

Unnamed: 0,iso,year,rank_eci,growth_proj_annual_2024,rank_growth,name
0,JPN,2004,1,3.69,47,Japan
1,DEU,2004,2,1.38,118,Germany
2,CHE,2004,4,3.54,53,Switzerland
3,KOR,2004,15,3.60,51,South Korea
4,SWE,2004,3,3.92,38,Sweden
5,AUT,2004,6,2.95,87,Austria
6,CZE,2004,10,3.75,45,Czech Republic
7,FIN,2004,5,3.84,43,Finland
8,HUN,2004,18,3.53,56,Hungary
9,GBR,2004,7,3.63,50,United Kingdom


# Generate HTML static table


      <tr class="news__tr">
        <td class="news__td news__td--categorical">1</td>
        <td class="news__td news__td--categorial">India</td>
        <td class="news__td news__td--quantitative">7.89</td>
      </tr>


In [87]:
for row in df[(df['year'] == '2014')].iterrows():
    print("<tr class=\"news__tr\">")
    print("\t<td class=\"news__td news__td--categorical\">%s</td>" % (row[1][3]))
    print("\t<td class=\"news__td news__td--categorial\">%s</td>" % (row[1][4]))
    print("\t<td class=\"news__td news__td--quantitative\">%s</td>" % (row[1][2]))
    print("</tr>")

<tr class="news__tr">
	<td class="news__td news__td--categorical">1</td>
	<td class="news__td news__td--categorial">India</td>
	<td class="news__td news__td--quantitative">6.98</td>
</tr>
<tr class="news__tr">
	<td class="news__td news__td--categorical">2</td>
	<td class="news__td news__td--categorial">Uganda</td>
	<td class="news__td news__td--quantitative">6.04</td>
</tr>
<tr class="news__tr">
	<td class="news__td news__td--categorical">3</td>
	<td class="news__td news__td--categorial">Kenya</td>
	<td class="news__td news__td--quantitative">6.0</td>
</tr>
<tr class="news__tr">
	<td class="news__td news__td--categorical">4</td>
	<td class="news__td news__td--categorial">Tanzania</td>
	<td class="news__td news__td--quantitative">5.96</td>
</tr>
<tr class="news__tr">
	<td class="news__td news__td--categorical">5</td>
	<td class="news__td news__td--categorial">Egypt</td>
	<td class="news__td news__td--quantitative">5.83</td>
</tr>
<tr class="news__tr">
	<td class="news__td news__td--cate

In [88]:
# from IPython.display import display, HTML
# HTML(df.head().to_html())

# Projections of GDP Growth to 2024 Rankings: Selected Top Countries

In [89]:
# Generate a pretty table
from ipy_table import *
import numpy as np

df_table = df[(df['year'] == '2014')].head(10).reset_index(drop=True).reset_index()
df_table = df_table[['rank', 'name', 'value']]
table = df_table.as_matrix()

header = np.asarray(df_table.columns)
header[0] = 'Rank'
header[2] = 'Value'
# df.rename(columns=lambda x: x[1:], inplace=True)
table_with_header = np.concatenate(([header], table))

# Basic themes
# Detais http://nbviewer.ipython.org/github/epmoyer/ipy_table/blob/master/ipy_table-Introduction.ipynb
make_table(table_with_header)
apply_theme('basic')
# Only show the top-10
set_row_style(1, color='yellow')

0,1,2
Rank,name,Value
1,India,6.98
2,Uganda,6.04
3,Kenya,6.0
4,Tanzania,5.96
5,Egypt,5.83
6,Madagascar,5.78
7,Senegal,5.77
8,Philippines,5.68
9,Malawi,5.66


# Projections of GDP Growth to 2024 Rankings: Full List of Countries

In [90]:
IFrame('https://cid-harvard.github.io/vis-toolkit/examples/barchart_vertical_projections.html', width=900, height=550)

# Economic Complexity Index: Rank of Expected GDP Growth to 2024

In [6]:
IFrame('https://cid-harvard.github.io/vis-toolkit/examples/geomap_projections.html', width=900, height=500)

# Biggest Winners and Losers in Economic Complexity: 2004-2014


In [92]:
IFrame('https://cid-harvard.github.io/vis-toolkit/examples/slopegraph_eci_rankings.html', width=900, height=550)

# Economic Complexity Index: 2004-2014 Country Rankings

In [93]:
IFrame('https://cid-harvard.github.io/vis-toolkit/examples/linechart_eci_rankings.html', width=900, height=550)