# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [1]:
import numpy as np
import pandas as pd

## imports for Python, Pandas

In [2]:
import json
from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from nested json string
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [3]:
# define json string
data = [{'state': 'Florida', 
         'shortname': 'FL',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}]},
        {'state': 'Ohio',
         'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}]}]

In [4]:
df = pd.DataFrame(data)
df

Unnamed: 0,counties,info,shortname,state
0,"[{'name': 'Dade', 'population': 12345}, {'name...",{'governor': 'Rick Scott'},FL,Florida
1,"[{'name': 'Summit', 'population': 1234}, {'nam...",{'governor': 'John Kasich'},OH,Ohio


In [5]:
# use normalization to create tables from nested element
json_normalize(data, 'counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [6]:
# further populate tables created from nested element
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


****
## JSON example, with file

+ demonstrates reading in a json file as a string and as a table
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [7]:
# load json as string
json.load((open('data/world_bank_projects_less.json')))

[{'_id': {'$oid': '52b213b38594d8a2be17c780'},
  'approvalfy': 1999,
  'board_approval_month': 'November',
  'boardapprovaldate': '2013-11-12T00:00:00Z',
  'borrower': 'FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA',
  'closingdate': '2018-07-07T00:00:00Z',
  'country_namecode': 'Federal Democratic Republic of Ethiopia!$!ET',
  'countrycode': 'ET',
  'countryname': 'Federal Democratic Republic of Ethiopia',
  'countryshortname': 'Ethiopia',
  'docty': 'Project Information Document,Indigenous Peoples Plan,Project Information Document',
  'envassesmentcategorycode': 'C',
  'grantamt': 0,
  'ibrdcommamt': 0,
  'id': 'P129828',
  'idacommamt': 130000000,
  'impagency': 'MINISTRY OF EDUCATION',
  'lendinginstr': 'Investment Project Financing',
  'lendinginstrtype': 'IN',
  'lendprojectcost': 550000000,
  'majorsector_percent': [{'Name': 'Education', 'Percent': 46},
   {'Name': 'Education', 'Percent': 26},
   {'Name': 'Public Administration, Law, and Justice', 'Percent': 16},
   {'Name': 'Educatio

In [8]:
# load as Pandas dataframe
sample_json_df = pd.read_json('data/world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Name': 'Education for all', 'Percent': 100}","[{'name': 'Education for all', 'code': '65'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Name': 'Other economic management', 'Percent...","[{'name': 'Other economic management', 'code':...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [20]:
# load JSON file
with open('data/world_bank_projects.json') as f:
    json_data = json.load(f)

In [21]:
# flattening json structure and load it to Pandas dataframe
json_df = json_normalize(data=json_data, 
                         record_path='mjtheme_namecode', 
                         meta=['countryshortname', 'project_name'])
json_df.head(10)

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support
3,6,Social protection and risk management,Tunisia,TN: DTF Social Protection Reforms Support
4,5,Trade and integration,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
5,2,Public sector governance,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
6,11,Environment and natural resources management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
7,6,Social protection and risk management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
8,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership
9,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership


### Q1. Find the 10 countries with most projects

In [11]:
json_df['countryshortname'].value_counts().head(10)

Indonesia             56
India                 51
Vietnam               43
Bangladesh            41
Brazil                41
China                 40
Africa                39
Yemen, Republic of    34
Morocco               32
Mozambique            31
Name: countryshortname, dtype: int64

### Before answering Q2, some additional analyzing for missing data

In [23]:
json_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 4 columns):
code                1499 non-null object
name                1499 non-null object
countryshortname    1499 non-null object
project_name        1499 non-null object
dtypes: object(4)
memory usage: 46.9+ KB


#### Note: There's some missing values in the 'name' column, but that'll get cleaned up in the next exercise below. But there is a need to find out if the missing values are empty or blank spaces.

In [24]:
# What are the empty entries in the 'name' column?
json_df['name'][1]

''

#### It seems that all blank entries in the 'name' column are just empty strings which will need to be handled. How many emtry strings are in the 'name' column? That'll determine if we can disregard those entries.

In [26]:
# How many emtry strings are there in the 'name' column?
(json_df['name'] == '').sum()

122

#### There seems to be 122 out of 1499 entries that have a missing 'name' entry which accounts for ~8% of the data. For purposes of this exercise, I'll disregard the entries with empty strings and handle it in the next exercise.

### Q2. Find the top 10 major project themes

In [29]:
# Student solution
# json_df.groupby(['code', 'name']).size().sort_values(ascending=False).head(10)

# Mentor (Zeehasham Rasheed) solution
top_projects = json_df[json_df['name'] != ''].groupby(['code','name'])['name'].count().sort_values(ascending=False)
top_projects.head(10)

code  name                                        
11    Environment and natural resources management    223
10    Rural development                               202
8     Human development                               197
2     Public sector governance                        184
6     Social protection and risk management           158
4     Financial and private sector development        130
7     Social dev/gender/inclusion                     119
5     Trade and integration                            72
9     Urban development                                47
1     Economic management                              33
3     Rule of law                                      12
Name: name, dtype: int64

### Q3. There are entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

#### Q3A. Student solution for Q3

In [51]:
# Construct a list of unique key-value pair of code-name
code_name_df = json_df[['code', 'name']][(json_df['code'].str.len() > 0) & (json_df['name'].str.len() > 0)]
code_name_df.drop_duplicates(inplace=True)

code_name_df = code_name_df.set_index('code')
code_name_df

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
8,Human development
11,Environment and natural resources management
1,Economic management
6,Social protection and risk management
5,Trade and integration
2,Public sector governance
7,Social dev/gender/inclusion
4,Financial and private sector development
10,Rural development
9,Urban development


In [52]:
# Create a dict out of the dataframe
code_name_dict = code_name_df.to_dict()
code_name_dict

{'name': {'1': 'Economic management',
  '10': 'Rural development',
  '11': 'Environment and natural resources management',
  '2': 'Public sector governance',
  '3': 'Rule of law',
  '4': 'Financial and private sector development',
  '5': 'Trade and integration',
  '6': 'Social protection and risk management',
  '7': 'Social dev/gender/inclusion',
  '8': 'Human development',
  '9': 'Urban development'}}

In [53]:
# Obtain the code:name dictionary
code_name_mapping = code_name_dict.get('name')
code_name_mapping

{'1': 'Economic management',
 '10': 'Rural development',
 '11': 'Environment and natural resources management',
 '2': 'Public sector governance',
 '3': 'Rule of law',
 '4': 'Financial and private sector development',
 '5': 'Trade and integration',
 '6': 'Social protection and risk management',
 '7': 'Social dev/gender/inclusion',
 '8': 'Human development',
 '9': 'Urban development'}

In [54]:
# Convert series to dataframe to use fillna()
def fill_name(row):
    name = row['name']
    if name == np.nan or len(name) < 1:
        code = row['code']
        name = code_name_mapping.get(code, np.nan)
        
    return name


json_df['name'] = json_df.apply(fill_name, axis=1)
json_df.head(10)

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,Environment and natural resources management,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support
3,6,Social protection and risk management,Tunisia,TN: DTF Social Protection Reforms Support
4,5,Trade and integration,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
5,2,Public sector governance,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
6,11,Environment and natural resources management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
7,6,Social protection and risk management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
8,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership
9,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership


#### Q3B. Mentor (Zeehasham Rasheed) solution

In [69]:
type(top_projects)

pandas.core.series.Series

In [70]:
top_projects.index

MultiIndex(levels=[['1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9'], ['Economic management', 'Environment and natural resources management', 'Financial and private sector development', 'Human development', 'Public sector governance', 'Rule of law', 'Rural development', 'Social dev/gender/inclusion', 'Social protection and risk management', 'Trade and integration', 'Urban development']],
           labels=[[2, 1, 9, 3, 7, 5, 8, 6, 10, 0, 4], [1, 6, 3, 4, 8, 2, 7, 9, 10, 0, 5]],
           names=['code', 'name'])

In [71]:
top_projects.values

array([223, 202, 197, 184, 158, 130, 119,  72,  47,  33,  12])

In [72]:
# Use top_projects DataFrame to create dictionary with code# and name as key-value pairs
name_code = {index1:index2 for index1, index2 in top_projects.sort_index().index}
name_code

{'1': 'Economic management',
 '10': 'Rural development',
 '11': 'Environment and natural resources management',
 '2': 'Public sector governance',
 '3': 'Rule of law',
 '4': 'Financial and private sector development',
 '5': 'Trade and integration',
 '6': 'Social protection and risk management',
 '7': 'Social dev/gender/inclusion',
 '8': 'Human development',
 '9': 'Urban development'}

In [73]:
json_df['code'].values

array(['8', '11', '1', ..., '8', '5', '4'], dtype=object)

In [74]:
type(json_df['code'].values)

numpy.ndarray

In [75]:
np.shape(json_df['code'].values)

(1499,)

In [76]:
# Replace values in 'name' based on their 'code' value by using the dictionary 'name_code' of names for each code
json_df['name'] = [name_code[entry] for entry in json_df['code'].values]
json_df.head(10)

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,Environment and natural resources management,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support
3,6,Social protection and risk management,Tunisia,TN: DTF Social Protection Reforms Support
4,5,Trade and integration,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
5,2,Public sector governance,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
6,11,Environment and natural resources management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
7,6,Social protection and risk management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
8,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership
9,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership


### Answer 2. again after filling in missing theme project names
(2. Find the top 10 major project themes)

In [77]:
json_df.groupby(['code', 'name']).size().sort_values(ascending=False)#.head(10)

code  name                                        
11    Environment and natural resources management    250
10    Rural development                               216
8     Human development                               210
2     Public sector governance                        199
6     Social protection and risk management           168
4     Financial and private sector development        146
7     Social dev/gender/inclusion                     130
5     Trade and integration                            77
9     Urban development                                50
1     Economic management                              38
3     Rule of law                                      15
dtype: int64