In [386]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json
from pandas.io.json import json_normalize

<h2 style="color:#A0A0A0">Normalization of the data</h2>
<p>Importing the data from a json into python</p>

In [387]:
#imports the data locally
with open('world_bank_projects.json') as file:
    raw_data = json.load(file)

<h2 style="color:#A0A0A0">Display Top Countries</h2>
<p>This section shows the countries with the top number of projects</p>

In [388]:
#changes the 'countryname' column to category type. Sets id to oid 
data = json_normalize(raw_data)
data['countryname'].astype('category')
data.set_index('_id.$oid')
print(data.countryname.value_counts()[:10])

People's Republic of China         19
Republic of Indonesia              19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Kingdom of Morocco                 12
People's Republic of Bangladesh    12
Nepal                              12
Republic of Mozambique             11
Africa                             11
Name: countryname, dtype: int64


<h2 style="color:#A0A0A0">Displaying The Most Popular Themes</h2>
<p>The project themes are easier to analyze if extracted into a new dataframe.</p>

In [389]:
#removes the dictionary from the list, and creates a new DataFrame for it.
data['mjtheme_namecode'] = data['mjtheme_namecode'].apply(lambda x: x[0])

In [390]:
project_themes = json_normalize(list(data.mjtheme_namecode))

In [391]:
project_themes.name.value_counts()[:10]

Environment and natural resources management    85
Human development                               72
Public sector governance                        64
Social protection and risk management           57
Rural development                               56
Financial and private sector development        53
Social dev/gender/inclusion                     43
Trade and integration                           25
Urban development                               23
Economic management                             11
Name: name, dtype: int64

<h2 style="color:#A0A0A0"> Filling In Missing Values</h2>
<p> Filling in the missing values by using the code in the 'Code' Column</p>

In [392]:
#changes the code type to category, and renames the 'name' category to 'type', to avoid any namespace collisions.
#uses a regex pattern to replace empty unicode strings with NaN and makes a dictionary with code, name key value pairs.
project_themes.code.astype('category')
project_themes.columns = ['code','types']
project_themes.types.replace(ur'^\s*$',np.nan, regex=True)
fill={
for num in project_themes.code.unique():
    fill[num] = project_themes.types[project_themes.code == num].iloc[1]

<p>This part applies a function to fill the missing data with the name, based off the code number.</p>

In [393]:
#function for apply to replace NaN values with the value from the dictionary
def fill_data(row):
    if row['types'] == u'': 
        return fill[row['code']]
    else: return row
print(project_themes.apply(fill_data,axis=1))

                                         code  \
0                                           8   
1                                           1   
2                                           5   
3                                           7   
4                                           5   
5                                           6   
6                                           2   
7                                          11   
8                                          10   
9                                           2   
10                                         10   
11                                         10   
12   Financial and private sector development   
13                                          5   
14                                          6   
15                                         10   
16                                         10   
17                                          8   
18                                          8   
19                  