# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [85]:
import pandas as pd

## imports for Python, Pandas

In [86]:
import json
from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from nested json string
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [87]:
# Nested sample records as a list of dicts (already-parsed JSON, not a raw string):
# one dict per state, each with a nested 'info' dict and a list of 'counties'.
data = [
    {
        "state": "Florida",
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
    },
    {
        "state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "counties": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
    },
]

In [88]:
# flatten the nested 'counties' lists into a table with one row per county
json_normalize(data, record_path='counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [89]:
# same county table, with each row also carrying fields from its parent record:
# the state, its short name, and the nested info -> governor value
json_normalize(
    data,
    record_path='counties',
    meta=['state', 'shortname', ['info', 'governor']],
)

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


****
## JSON example, with file

+ demonstrates reading in a json file as plain Python objects (a list of dicts) and as a table
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [90]:
# Load the JSON file into plain Python objects (json.load parses the text; it
# does not return a string). The context manager closes the file handle, which
# a bare open() passed straight to json.load would leave open.
with open('data/world_bank_projects_less.json') as sample_file:
    sample_records = json.load(sample_file)
sample_records

[{'_id': {'$oid': '52b213b38594d8a2be17c780'},
  'approvalfy': 1999,
  'board_approval_month': 'November',
  'boardapprovaldate': '2013-11-12T00:00:00Z',
  'borrower': 'FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA',
  'closingdate': '2018-07-07T00:00:00Z',
  'country_namecode': 'Federal Democratic Republic of Ethiopia!$!ET',
  'countrycode': 'ET',
  'countryname': 'Federal Democratic Republic of Ethiopia',
  'countryshortname': 'Ethiopia',
  'docty': 'Project Information Document,Indigenous Peoples Plan,Project Information Document',
  'envassesmentcategorycode': 'C',
  'grantamt': 0,
  'ibrdcommamt': 0,
  'id': 'P129828',
  'idacommamt': 130000000,
  'impagency': 'MINISTRY OF EDUCATION',
  'lendinginstr': 'Investment Project Financing',
  'lendinginstrtype': 'IN',
  'lendprojectcost': 550000000,
  'majorsector_percent': [{'Name': 'Education', 'Percent': 46},
   {'Name': 'Education', 'Percent': 26},
   {'Name': 'Public Administration, Law, and Justice', 'Percent': 16},
   {'Name': 'Educatio

In [91]:
# read the same sample file directly into a Pandas dataframe and display it
sample_json_df = pd.read_json(path_or_buf='data/world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Name': 'Education for all', 'Percent': 100}","[{'name': 'Education for all', 'code': '65'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Name': 'Other economic management', 'Percent...","[{'name': 'Other economic management', 'code':...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

# 1. Find the 10 Countries with most projects

In [92]:
# Read the full World Bank projects file into a Pandas dataframe, then print the
# per-column summary for a basic inspection. Note the 'id' column.
df = pd.read_json(path_or_buf='data/world_bank_projects.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 50 columns):
_id                         500 non-null object
approvalfy                  500 non-null int64
board_approval_month        500 non-null object
boardapprovaldate           500 non-null object
borrower                    485 non-null object
closingdate                 370 non-null object
country_namecode            500 non-null object
countrycode                 500 non-null object
countryname                 500 non-null object
countryshortname            500 non-null object
docty                       446 non-null object
envassesmentcategorycode    430 non-null object
grantamt                    500 non-null int64
ibrdcommamt                 500 non-null int64
id                          500 non-null object
idacommamt                  500 non-null int64
impagency                   472 non-null object
lendinginstr                495 non-null object
lendinginstrtype            495 non

In [93]:
# Count projects per country (number of non-null project ids in each
# 'countryshortname' group), sort in descending order and keep the first 10.
# Note: 'countryshortname' also holds regional groupings, so an entry such as
# 'Africa' can appear alongside individual countries.
country_project_counts = df.groupby('countryshortname').id.count()
top_10 = country_project_counts.sort_values(ascending=False).head(10)

# Display a nice looking Top 10 list.  Not totally necessary but if you're
# going to display a Top 10 list...
top_df = top_10.to_frame().reset_index()
# Rank rows starting at 1. The index is sized from the data rather than a fixed
# range(1, 11), so a data set with fewer than 10 groups does not raise a
# length-mismatch error.
top_df.index = range(1, len(top_df) + 1)
top_df.columns = ['Country', 'Project Count']
top_df

Unnamed: 0,Country,Project Count
1,Indonesia,19
2,China,19
3,Vietnam,17
4,India,16
5,"Yemen, Republic of",13
6,Nepal,12
7,Bangladesh,12
8,Morocco,12
9,Mozambique,11
10,Africa,11


# 2. Find the top 10 major project themes (using column 'mjtheme_namecode')

In [94]:
# Parse the world bank project file into Python records
with open('data/world_bank_projects.json', 'r') as projects_file:
    theme_json = json.load(projects_file)

# Normalize on 'mjtheme_namecode': one output row per theme entry, tagged with
# the owning project's 'id'. A project can list several theme codes, which is
# why the result has roughly 3x as many rows as the original data set.
themes = json_normalize(theme_json, record_path='mjtheme_namecode', meta=['id'])
themes.head(20)


Unnamed: 0,code,name,id
0,8,Human development,P129828
1,11,,P129828
2,1,Economic management,P144674
3,6,Social protection and risk management,P144674
4,5,Trade and integration,P145310
5,2,Public sector governance,P145310
6,11,Environment and natural resources management,P145310
7,6,Social protection and risk management,P145310
8,7,Social dev/gender/inclusion,P144665
9,7,Social dev/gender/inclusion,P144665


In [95]:
# Tally the rows for each theme code, order the tallies from largest to
# smallest, and keep the first ten
code_counts = themes.groupby('code')['code'].count()
top_10_codes = code_counts.sort_values(ascending=False).iloc[:10]
top_10_codes

code
11    250
10    216
8     210
2     199
6     168
4     146
7     130
5      77
9      50
1      38
Name: code, dtype: int64

In [96]:
import numpy as np

# Not necessary but display a nice looking Top 10 list with theme names.
# First build a clean code -> name lookup: drop the project id, de-duplicate
# the (code, name) pairs, and discard pairs whose name is blank.
codes = themes.drop('id', axis='columns')
codes = codes.drop_duplicates()
codes = codes.replace('', np.nan).dropna()
codes = codes.set_index('code')
# code_counts is indexed by code, so the assignment aligns on the index
codes['count'] = code_counts
top_10 = codes.sort_values('count', ascending=False)[:10]
top_10 = top_10.reset_index()
# Rank rows starting at 1. The index is sized from the data rather than a fixed
# range(1, 11), so fewer than 10 named codes does not raise a length-mismatch
# error.
top_10.index = range(1, len(top_10) + 1)
top_10.columns = ['Theme Code', 'Project Theme', 'Count']
top_10

Unnamed: 0,Theme Code,Project Theme,Count
1,11,Environment and natural resources management,250
2,10,Rural development,216
3,8,Human development,210
4,2,Public sector governance,199
5,6,Social protection and risk management,168
6,4,Financial and private sector development,146
7,7,Social dev/gender/inclusion,130
8,5,Trade and integration,77
9,9,Urban development,50
10,1,Economic management,38


# 3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [97]:
# The top 10 display above produced `codes`, a clean table of theme names indexed
# by code. There are other ways to solve the problem, but here that table is used
# as a code -> name mapping. The original 'name' column is kept next to the new
# 'filled_name' column so the filled-in values can be checked by eye.
named_themes = themes.assign(filled_name=themes['code'].map(codes['name']))
named_themes.head(20)



Unnamed: 0,code,name,id,filled_name
0,8,Human development,P129828,Human development
1,11,,P129828,Environment and natural resources management
2,1,Economic management,P144674,Economic management
3,6,Social protection and risk management,P144674,Social protection and risk management
4,5,Trade and integration,P145310,Trade and integration
5,2,Public sector governance,P145310,Public sector governance
6,11,Environment and natural resources management,P145310,Environment and natural resources management
7,6,Social protection and risk management,P145310,Social protection and risk management
8,7,Social dev/gender/inclusion,P144665,Social dev/gender/inclusion
9,7,Social dev/gender/inclusion,P144665,Social dev/gender/inclusion


In [98]:
# Another way to fill the missing names: sort, then fill NaNs. As a first step,
# turn blank strings into NaN and check how many names are missing.
themes = themes.replace(to_replace='', value=np.nan)
themes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 3 columns):
code    1499 non-null object
name    1377 non-null object
id      1499 non-null object
dtypes: object(3)
memory usage: 35.2+ KB


In [99]:
# Sort by code and name in descending order so that, within each code, the
# non-blank names come first (NaN sorts last) and can be filled forward.
themes = themes.sort_values(['code', 'name'], ascending=False)
# Forward-fill 'name' within each code group only. A frame-wide
# fillna(method='ffill') would carry the previous code's name into a code that
# has no named row at all, and would also fill gaps in every other column; the
# `method=` argument is deprecated in current pandas as well.
themes = themes.assign(name=themes.groupby('code')['name'].ffill())
themes

Unnamed: 0,code,name,id
53,9,Urban development,P145359
183,9,Urban development,P121917
194,9,Urban development,P128768
200,9,Urban development,P125120
318,9,Urban development,P122950
320,9,Urban development,P122950
341,9,Urban development,P144357
354,9,Urban development,P127543
356,9,Urban development,P127543
369,9,Urban development,P126749


In [100]:
# Verify there are no blank names: fail loudly if any name is still missing
# instead of relying on reading the non-null counts by eye, then print the summary.
assert themes['name'].notnull().all(), 'some theme names are still missing'
themes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499 entries, 53 to 1437
Data columns (total 3 columns):
code    1499 non-null object
name    1499 non-null object
id      1499 non-null object
dtypes: object(3)
memory usage: 46.8+ KB
