In [2]:
import json
import pandas as pd
from pandas.io.json import json_normalize

# Load json file as pandas dataframe
df = pd.read_json('data/world_bank_projects.json')


In [6]:
# Problem 1 Find 10 countries with most projects

# Extract the 'countryname' column
country = df['countryname']

# Count the number of entries for each country
country_counts = country.value_counts()

# Print the 10 countries with the most projects
print(country_counts.head(10))

Republic of Indonesia              19
People's Republic of China         19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Nepal                              12
Kingdom of Morocco                 12
People's Republic of Bangladesh    12
Africa                             11
Republic of Mozambique             11
Name: countryname, dtype: int64


In [5]:
# Problem 2 Find 10 most common project themes

# Extract the 'mjtheme_namecode' projet theme column
pthemes = df['mjtheme_namecode']

# Create an empty list of themes
themes = []

# Append every theme dictionary object to the list
for item in pthemes:
    for theme_dict in item:
        themes.append(theme_dict)

# Coerce the list into a pandas Series object and perform count and rank
themes_count = pd.Series(themes).value_counts()

# Print the top 10 project themes 
print(themes_count.head(10))

{'code': '11', 'name': 'Environment and natural resources management'}    223
{'code': '10', 'name': 'Rural development'}                               202
{'code': '8', 'name': 'Human development'}                                197
{'code': '2', 'name': 'Public sector governance'}                         184
{'code': '6', 'name': 'Social protection and risk management'}            158
{'code': '4', 'name': 'Financial and private sector development'}         130
{'code': '7', 'name': 'Social dev/gender/inclusion'}                      119
{'code': '5', 'name': 'Trade and integration'}                             72
{'code': '9', 'name': 'Urban development'}                                 47
{'code': '1', 'name': 'Economic management'}                               33
dtype: int64


In [14]:
# Problem 3 Approach 1 Fill Missing Project Theme Names with Loop

# Load and assign json data to string
json_str = json.load((open('data/world_bank_projects.json')))

# Normalize the nested data in project themes column 'mjtheme_namecode' and create new dataframe
themes_df = json_normalize(json_str,'mjtheme_namecode',['id','countryname'])

# Sort dataframe by code and name columns in descending order so that the empty name values are last in its code group
themes_df = themes_df.sort_values(['code','name'],ascending=False)

# Reset index to prepare for looping
themes_df = themes_df.reset_index()

# Loop through each row of name column in dataframe
i = 0
last_name = ''

while i < len(themes_df):
    # If name has value, then assign it to last_name
    if len(themes_df.name[i]) > 0:
        last_name = themes_df.name[i]
    # Else if name has no value, then assign last_name to the name entry with missing value
    else:
        themes_df.name[i] = last_name
    i += 1

# Sort data back to original order using old index
themes_df = themes_df.set_index('index').sort_values('index')

# Print to see missing names filled
print(themes_df.head(10))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


      code                                          name       id  \
index                                                               
0        8                             Human development  P129828   
1       11  Environment and natural resources management  P129828   
2        1                           Economic management  P144674   
3        6         Social protection and risk management  P144674   
4        5                         Trade and integration  P145310   
5        2                      Public sector governance  P145310   
6       11  Environment and natural resources management  P145310   
7        6         Social protection and risk management  P145310   
8        7                   Social dev/gender/inclusion  P144665   
9        7                   Social dev/gender/inclusion  P144665   

                                   countryname  
index                                           
0      Federal Democratic Republic of Ethiopia  
1      Federal Democrati

In [7]:
# Problem 3 Approach 2 Fill Missing Project Theme Names with Forward Fill

# Create dataframe from list of project themes dictionaries
tdf = pd.DataFrame.from_dict(d for d in themes)

# Sort on code and name columns in descending order so that empty name values are last in its code group
tdf = tdf.sort_values(['code','name'],ascending=False)

# Reset index of dataframe
tdf = tdf.reset_index()

# Fill missing values in name column using forward fill and assign it back to itself
tdf.name = tdf.name.fillna(method='ffill')

# Set index back to original
tdf = tdf.set_index('index').sort_values('index')

# Print dataframe
print(tdf.head(10))

      code                                          name
index                                                   
0        8                             Human development
1       11                                              
2        1                           Economic management
3        6         Social protection and risk management
4        5                         Trade and integration
5        2                      Public sector governance
6       11  Environment and natural resources management
7        6         Social protection and risk management
8        7                   Social dev/gender/inclusion
9        7                   Social dev/gender/inclusion
