In [1]:
import json
import pandas as pd
# json_normalize was moved to the top-level pandas namespace in pandas 1.0 and
# the old pandas.io.json import path no longer works on current releases, so
# try the new location first and fall back for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import numpy as np

# read_json handles the top-level records directly; json_normalize is only
# needed for the nested theme column in the second/third part of the exercise
wbp = pd.read_json('data/world_bank_projects.json')

# EXAMINE DATA FOR PROBLEMS
# look at general structure of data frame
print(wbp.head())

# look for problems with country identifier columns
print(wbp.columns)
print(len(wbp.columns))

unique_short = wbp['countryshortname'].unique()
unique_name = wbp['countryname'].unique()
unique_ccode = wbp['countrycode'].unique()
unique_cncode = wbp['country_namecode'].unique()

# for each identifier column: number of distinct values, then the first 20 of them
for unique_values in (unique_short, unique_name, unique_ccode, unique_cncode):
    print(len(unique_values))
    print(unique_values[0:20])
# all four columns seem to contain the same info in different form

# are there duplicates in the id column?
# each _id entry is a dict with an '$oid' key, so expand the dicts into a frame
# and compare the number of rows with the number of distinct '$oid' values
print(len(wbp['_id']))
df_id = pd.DataFrame(wbp['_id'].tolist())
print(len(df_id['$oid'].unique()))
# equal counts mean every _id is distinct, i.e. there are no duplicate records

                                      _id  approvalfy board_approval_month  \
0  {u'$oid': u'52b213b38594d8a2be17c780'}        1999             November   
1  {u'$oid': u'52b213b38594d8a2be17c781'}        2015             November   
2  {u'$oid': u'52b213b38594d8a2be17c782'}        2014             November   
3  {u'$oid': u'52b213b38594d8a2be17c783'}        2014              October   
4  {u'$oid': u'52b213b38594d8a2be17c784'}        2014              October   

      boardapprovaldate                                 borrower  \
0  2013-11-12T00:00:00Z  FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA   
1  2013-11-04T00:00:00Z                    GOVERNMENT OF TUNISIA   
2  2013-11-01T00:00:00Z   MINISTRY OF FINANCE AND ECONOMIC DEVEL   
3  2013-10-31T00:00:00Z   MIN. OF PLANNING AND INT'L COOPERATION   
4  2013-10-31T00:00:00Z                      MINISTRY OF FINANCE   

            closingdate                              country_namecode  \
0  2018-07-07T00:00:00Z  Federal Democratic Repub

In [2]:
# ANSWER TO FIRST PART OF EXERCISE
# 1. Find the 10 countries with most projects

# entries in 'countryshortname' are not dicts, so the column can be counted
# directly without json_normalize
sname_prj = wbp['countryshortname']

# number of projects per country, largest first
# (len(prj_count) should match the number of unique short names found earlier)
prj_count = sname_prj.value_counts().sort_values(ascending=False)

for heading in (
    '\nANSWER TO FIRST PART OF EXERCISE',
    '(Africa is not a country but is listed in all relevant columns)',
    '\nTop 10 Countries With The Most Projects\n',
):
    print(heading)

top_ten = prj_count.head(10)
print(top_ten)


ANSWER TO FIRST PART OF EXERCISE
(Africa is not a country but is listed in all relevant columns)

Top 10 Countries With The Most Projects

China                 19
Indonesia             19
Vietnam               17
India                 16
Yemen, Republic of    13
Morocco               12
Bangladesh            12
Nepal                 12
Africa                11
Mozambique            11
Name: countryshortname, dtype: int64


In [14]:
# Flatten the nested 'mjtheme_namecode' entries into one row per project/theme
# pair.  The entries are dicts, which json_normalize expands into the 'code'
# and 'name' columns used below.
# The file is read through a context manager so the handle is closed again.
with open('data/world_bank_projects.json') as json_file:
    wbp_str = json.load(json_file)
wbp_norm = json_normalize(wbp_str, 'mjtheme_namecode')

# Convert the codes to integers *before* sorting so they order numerically
# (sorted as text, '10' and '11' would land ahead of '2').  mergesort is a
# stable sort, so rows that share a code keep their order from the file.
wbp_norm['code'] = pd.to_numeric(wbp_norm['code'])
wbp_norm = wbp_norm.sort_values(by='code', kind='mergesort')

# Build the code <-> name lookup from the rows that actually carry a name
# ('' marks a missing name).  Taking each code together with its own name
# avoids relying on two separately de-duplicated arrays happening to line up.
named = wbp_norm.loc[wbp_norm['name'] != '', ['code', 'name']]
code_name = named.drop_duplicates().sort_values(by='code').reset_index(drop=True)
code_name.columns = ['Code', 'Project Theme']

# sanity checks: exactly one name per code, and every code has a name somewhere
assert code_name['Code'].is_unique, 'a theme code is paired with more than one name'
assert set(code_name['Code']) == set(wbp_norm['code']), 'a theme code never appears with a name'

# Count how often each code occurs, attach the counts to the lookup by code
# (not by position), and order the table from most to least frequent theme.
theme_count = wbp_norm['code'].value_counts().sort_index()
df_tcount = code_name.assign(Count=code_name['Code'].map(theme_count))
df_tcount = df_tcount[['Count', 'Project Theme', 'Code']]
df_tcount = df_tcount.sort_values(by='Count', ascending=False).reset_index(drop=True)

# print results
# ANSWER TO SECOND PART OF EXERCISE
# 2. Find the top 10 major project themes (using column 'mjtheme_namecode')
print('\nANSWER TO SECOND PART OF EXERCISE')
print('\nCounts of All Projects By Theme\n')
print(df_tcount)

# print out list of codes and names
# needed for 3rd part of exercise
print('\n\nNeeded For 3rd Part of Exercise')
print('\nList of Code Names\n')
print(code_name)

# fill in missing data

# Work on a copy with a fresh 0..n-1 index.  Every row gets its theme name from
# the lookup via its code -- no need to check which names are missing, they are
# simply all replaced.  Mapping by code value does not depend on the codes
# being contiguous or on the row order of the lookup table.
theme_by_code = code_name.set_index('Code')['Project Theme']
wbp_filled = wbp_norm.reset_index(drop=True)
wbp_filled['Project_Theme_Filled'] = wbp_filled['code'].map(theme_by_code)

# drop the original column that still contains the missing themes
wbp_filled = wbp_filled.drop('name', axis=1)

# print results
# ANSWER TO THIRD PART OF EXERCISE
# 3. In 2. above you will notice that some entries have only the code and the
#    name is missing. Create a dataframe with the missing names filled in.
print('\n\nANSWER TO THIRD PART OF EXERCISE')
print('\n\nDataFrame With Missing Themes Filled In')
print(wbp_filled.head(25))


ANSWER TO SECOND PART OF EXERCISE

Counts of All Projects By Theme

    Count                                 Project Theme  Code
0     250  Environment and natural resources management    11
1     216                             Rural development    10
2     210                             Human development     8
3     199                      Public sector governance     2
4     168         Social protection and risk management     6
5     146      Financial and private sector development     4
6     130                   Social dev/gender/inclusion     7
7      77                         Trade and integration     5
8      50                             Urban development     9
9      38                           Economic management     1
10     15                                   Rule of law     3


Needed For 3rd Part of Exercise

List of Code Names

    Code                                 Project Theme
0      1                           Economic management
1      2              