In [1]:
import numpy as np
import pandas as pd

## 0. Notes
Data Sets: 
1. Human Development Index
2. GINI Index
3. Gender Inequality Index  
4. Internally Displaced Persons
5. Total Population
6. School Enrollment, Primary & Secondary, Gender Parity Index  
7. Governance Indicators
8. US Foreign Aid Spending

Target: World Bank Governance Indicators. Sum the percentile ranks of the six indicators (each 0-100) and divide by 600 to get a single score between 0 and 1.


**TODO: Convert the cleaning tasks below into reusable functions.**

## 1. HDI Raw Data

In [2]:
# Load the raw Human Development Index export exactly as it is on disk;
# the layout problems it contains are cleaned up in the cells that follow.
hdi = pd.read_csv(filepath_or_buffer='../data/raw/HumanDevelopmentIndex.csv')

In [3]:
# Preview the first five rows of the raw frame to see what needs cleaning.
hdi.head(n=5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Human Development Index (HDI)
HDI Rank (2018),Country,1990,,1991,,1992,,1993,,1994,,1995,,1996,,1997,,1998,,1999,,2000.0,,2001.0,,2002.0,,2003.0,,2004.0,,2005.0,,2006.0,,2007.0,,2008.0,,2009.0,,2010.0,,2011.0,,2012.0,,2013.0,,2014.0,,2015.0,,2016.0,,2017.0,,2018.0,
170,Afghanistan,0.298,,0.304,,0.312,,0.308,,0.303,,0.327,,0.331,,0.335,,0.339,,0.343,,0.345,,0.347,,0.378,,0.387,,0.4,,0.41,,0.419,,0.431,,0.436,,0.447,,0.464,,0.465,,0.479,,0.485,,0.488,,0.49,,0.491,,0.493,,0.496,
69,Albania,0.644,,0.625,,0.608,,0.611,,0.617,,0.629,,0.639,,0.639,,0.649,,0.660,,0.667,,0.673,,0.68,,0.687,,0.692,,0.702,,0.709,,0.718,,0.724,,0.729,,0.74,,0.759,,0.771,,0.781,,0.787,,0.788,,0.788,,0.789,,0.791,
82,Algeria,0.578,,0.582,,0.589,,0.593,,0.597,,0.602,,0.610,,0.619,,0.629,,0.638,,0.646,,0.655,,0.666,,0.676,,0.685,,0.694,,0.699,,0.708,,0.711,,0.72,,0.73,,0.738,,0.737,,0.746,,0.749,,0.751,,0.755,,0.758,,0.759,
36,Andorra,..,,..,,..,,..,,..,,..,,..,,..,,..,,..,,0.759,,0.767,,0.78,,0.82,,0.826,,0.819,,0.829,,0.829,,0.831,,0.83,,0.828,,0.827,,0.849,,0.846,,0.853,,0.85,,0.854,,0.852,,0.857,


Initial things to clean up:
1. Multi-level index (labels should be Country & Year)
2. Drop NaN columns
3. Drop HDI Rank (2018)
4. Convert '..' to NaN

In [4]:
# Clean the raw HDI frame using non-mutating steps and rebind `hdi` only once,
# at the very end. If any step raises, `hdi` is still the untouched raw frame
# (a sequence of in-place edits could leave it half-cleaned), and nothing is
# modified in place on an `.iloc` slice.
# NOTE: this cell expects the *raw* `hdi`; re-run the load cell before
# re-running it.

#Reset multi-index, then drop all columns with NaN values
#(includes 'HDI Rank (2018)')
hdi_flat = hdi.reset_index().dropna(axis=1)

#The first remaining row holds the real column labels ('Country' and the years)
header_row = hdi_flat.iloc[0]

#Drop that redundant first row; copy so the relabelling below acts on an
#independent frame rather than on a slice of hdi_flat
hdi_body = hdi_flat.iloc[1:].copy()
hdi_body.columns = header_row

#Convert '..' entries to NaN
hdi = hdi_body.replace('..', np.nan)

In [5]:
# Check the cleaned layout: a 'Country' column followed by one column per year.
hdi.head(n=5)

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
1,Afghanistan,0.298,0.304,0.312,0.308,0.303,0.327,0.331,0.335,0.339,...,0.447,0.464,0.465,0.479,0.485,0.488,0.49,0.491,0.493,0.496
2,Albania,0.644,0.625,0.608,0.611,0.617,0.629,0.639,0.639,0.649,...,0.729,0.74,0.759,0.771,0.781,0.787,0.788,0.788,0.789,0.791
3,Algeria,0.578,0.582,0.589,0.593,0.597,0.602,0.61,0.619,0.629,...,0.72,0.73,0.738,0.737,0.746,0.749,0.751,0.755,0.758,0.759
4,Andorra,,,,,,,,,,...,0.83,0.828,0.827,0.849,0.846,0.853,0.85,0.854,0.852,0.857
5,Angola,,,,,,,,,,...,0.508,0.51,0.525,0.537,0.547,0.557,0.565,0.57,0.576,0.574


In [6]:
# Print per-column non-null counts and dtypes for the cleaned frame.
# NOTE(review): the saved output lists the year columns as dtype `object`,
# i.e. they have not been converted to numeric here - confirm whether a
# later step does that before the values are used in calculations.
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 1 to 206
Data columns (total 30 columns):
Country    206 non-null object
1990       161 non-null object
1991       159 non-null object
1992       158 non-null object
1993       159 non-null object
1994       159 non-null object
1995       163 non-null object
1996       163 non-null object
1997       162 non-null object
1998       163 non-null object
1999       166 non-null object
2000       189 non-null object
2001       189 non-null object
2002       190 non-null object
2003       192 non-null object
2004       194 non-null object
2005       201 non-null object
2006       201 non-null object
2007       201 non-null object
2008       201 non-null object
2009       201 non-null object
2010       203 non-null object
2011       203 non-null object
2012       203 non-null object
2013       203 non-null object
2014       203 non-null object
2015       203 non-null object
2016       203 non-null object
2017       204 non-null obj

In [7]:
#Drop rows without data for 2017-2018 (they're title rows for 'Human Development' and 'Regions').
#Rebind rather than use inplace=True: `hdi` originates from an `.iloc` slice,
#and mutating such a frame in place is the pattern pandas warns about.
#how='any' (the default, spelled out): a row is removed if EITHER year is
#missing, so a country lacking just one of the two years is dropped as well.
hdi = hdi.dropna(subset=['2017', '2018'], how='any')

#### Save cleaned dataset to interim data folder:

In [8]:
# Persist the cleaned HDI table for the next pipeline stage.
# NOTE(review): with default arguments the row index is written as an extra
# unnamed first column - confirm downstream readers expect that.
hdi.to_csv(path_or_buf='../data/interim/HDI.csv')

## 2. GINI Index

In [1]:
# Load the raw GINI index export. Needs the import cell at the top of the
# notebook to have been run first: the saved NameError output comes from
# executing this cell in a kernel where `pd` had not been imported.
gini = pd.read_csv(filepath_or_buffer='../data/raw/GINI.csv')

NameError: name 'pd' is not defined

## 3. Gender Inequality Index

In [None]:
# Load the raw Gender Inequality Index export (no cleaning applied yet).
gii = pd.read_csv(filepath_or_buffer='../data/raw/GenderInequalityIndex.csv')

## 4. Internally Displaced Persons

In [None]:
# Load the raw Internally Displaced Persons export (no cleaning applied yet).
displaced = pd.read_csv(filepath_or_buffer='../data/raw/InternallyDisplacedPersons.csv')

## 5. Total Population

In [None]:
# Load the raw total-population export (no cleaning applied yet).
population = pd.read_csv(filepath_or_buffer='../data/raw/Population.csv')

## 6. Gender Parity Index - Primary & Secondary School Enrollment

In [None]:
# Load the raw school-enrollment Gender Parity Index export
# (no cleaning applied yet).
gpi_school = pd.read_csv(filepath_or_buffer='../data/raw/SchoolEnrollment_GenderParityIndex.csv')

## 7. Governance Indicators

In [None]:
# Load the raw Governance Indicators export - the source of the modelling
# target described in the notes at the top (no cleaning applied yet).
gov_indicators = pd.read_csv(filepath_or_buffer='../data/raw/GovernanceIndicators.csv')

## 8. US Foreign Aid Spending

In [None]:
# Load the raw US foreign aid spending export (no cleaning applied yet).
usaid = pd.read_csv(filepath_or_buffer='../data/raw/us_foreign_aid_complete.csv')