# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [141]:
import numpy as np
import pandas as pd

## imports for Python, Pandas

In [142]:
import json
from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from nested json string
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [143]:
# define json string
data = [{'state': 'Florida', 
         'shortname': 'FL',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}]},
        {'state': 'Ohio',
         'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}]}]

In [144]:
df = pd.DataFrame(data)
df

Unnamed: 0,counties,info,shortname,state
0,"[{'name': 'Dade', 'population': 12345}, {'name...",{'governor': 'Rick Scott'},FL,Florida
1,"[{'name': 'Summit', 'population': 1234}, {'nam...",{'governor': 'John Kasich'},OH,Ohio


In [145]:
# use normalization to create tables from nested element
json_normalize(data, 'counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [146]:
# further populate tables created from nested element
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


****
## JSON example, with file

+ demonstrates reading in a json file as a string and as a table
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [147]:
# load json as string
json.load((open('data/world_bank_projects_less.json')))

[{'_id': {'$oid': '52b213b38594d8a2be17c780'},
  'approvalfy': 1999,
  'board_approval_month': 'November',
  'boardapprovaldate': '2013-11-12T00:00:00Z',
  'borrower': 'FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA',
  'closingdate': '2018-07-07T00:00:00Z',
  'country_namecode': 'Federal Democratic Republic of Ethiopia!$!ET',
  'countrycode': 'ET',
  'countryname': 'Federal Democratic Republic of Ethiopia',
  'countryshortname': 'Ethiopia',
  'docty': 'Project Information Document,Indigenous Peoples Plan,Project Information Document',
  'envassesmentcategorycode': 'C',
  'grantamt': 0,
  'ibrdcommamt': 0,
  'id': 'P129828',
  'idacommamt': 130000000,
  'impagency': 'MINISTRY OF EDUCATION',
  'lendinginstr': 'Investment Project Financing',
  'lendinginstrtype': 'IN',
  'lendprojectcost': 550000000,
  'majorsector_percent': [{'Name': 'Education', 'Percent': 46},
   {'Name': 'Education', 'Percent': 26},
   {'Name': 'Public Administration, Law, and Justice', 'Percent': 16},
   {'Name': 'Educatio

In [148]:
# load as Pandas dataframe
sample_json_df = pd.read_json('data/world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Name': 'Education for all', 'Percent': 100}","[{'name': 'Education for all', 'code': '65'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Name': 'Other economic management', 'Percent...","[{'name': 'Other economic management', 'code':...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [149]:
# load JSON file
with open('data/world_bank_projects.json') as f:
    json_data = json.load(f)

In [150]:
# flattening json structure and load it to Pandas dataframe
json_df = json_normalize(data=json_data, 
                         record_path='mjtheme_namecode', 
                         meta=['countryshortname', 'project_name'])
json_df.head(3)

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support


### 1. Find the 10 countries with most projects

In [151]:
json_df['countryshortname'].value_counts().head(10)

Indonesia             56
India                 51
Vietnam               43
Brazil                41
Bangladesh            41
China                 40
Africa                39
Yemen, Republic of    34
Morocco               32
Mozambique            31
Name: countryshortname, dtype: int64

### 2. Find the top 10 major project themes

In [152]:
json_df.groupby(['code', 'name']).size().sort_values(ascending=False)#.head(10)

code  name                                        
11    Environment and natural resources management    223
10    Rural development                               202
8     Human development                               197
2     Public sector governance                        184
6     Social protection and risk management           158
4     Financial and private sector development        130
7     Social dev/gender/inclusion                     119
5     Trade and integration                            72
9     Urban development                                47
1     Economic management                              33
11                                                     27
4                                                      16
2                                                      15
10                                                     14
8                                                      13
3     Rule of law                                      12
7                    

### 3. # There are entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [153]:
# Construct a list of unique key-value pair of code-name
code_name_df = json_df[['code', 'name']][(json_df['code'].str.len() > 0) & (json_df['name'].str.len() > 0)]
code_name_df.drop_duplicates(inplace=True)

code_name_df = code_name_df.set_index('code')
code_name_df

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
8,Human development
1,Economic management
6,Social protection and risk management
5,Trade and integration
2,Public sector governance
11,Environment and natural resources management
7,Social dev/gender/inclusion
4,Financial and private sector development
10,Rural development
9,Urban development


In [154]:
# Create a dict out of the dataframe
code_name_dict = code_name_df.to_dict()
code_name_dict

{'name': {'8': 'Human development',
  '1': 'Economic management',
  '6': 'Social protection and risk management',
  '5': 'Trade and integration',
  '2': 'Public sector governance',
  '11': 'Environment and natural resources management',
  '7': 'Social dev/gender/inclusion',
  '4': 'Financial and private sector development',
  '10': 'Rural development',
  '9': 'Urban development',
  '3': 'Rule of law'}}

In [155]:
# Obtain the code:name dictionary
code_name_mapping = code_name_dict.get('name')
code_name_mapping

{'8': 'Human development',
 '1': 'Economic management',
 '6': 'Social protection and risk management',
 '5': 'Trade and integration',
 '2': 'Public sector governance',
 '11': 'Environment and natural resources management',
 '7': 'Social dev/gender/inclusion',
 '4': 'Financial and private sector development',
 '10': 'Rural development',
 '9': 'Urban development',
 '3': 'Rule of law'}

In [156]:
# Convert series to dataframe to use fillna()
def fill_name(row):
    name = row['name']
    if name == np.nan or len(name) < 1:
        code = row['code']
        name = code_name_mapping.get(code, np.nan)
        
    return name


json_df['name'] = json_df.apply(fill_name, axis=1)
json_df

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,Environment and natural resources management,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support
3,6,Social protection and risk management,Tunisia,TN: DTF Social Protection Reforms Support
4,5,Trade and integration,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
5,2,Public sector governance,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
6,11,Environment and natural resources management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
7,6,Social protection and risk management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
8,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership
9,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership


### Answer 2. again after filling in missing theme project names
(2. Find the top 10 major project themes)

In [157]:
json_df.groupby(['code', 'name']).size().sort_values(ascending=False)#.head(10)

code  name                                        
11    Environment and natural resources management    250
10    Rural development                               216
8     Human development                               210
2     Public sector governance                        199
6     Social protection and risk management           168
4     Financial and private sector development        146
7     Social dev/gender/inclusion                     130
5     Trade and integration                            77
9     Urban development                                50
1     Economic management                              38
3     Rule of law                                      15
dtype: int64