In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in USAID data pickle
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle('./usaid_data.pkl')

In [3]:
# Preview the first five rows to see the available columns and sample values
df.head(n=5)

Unnamed: 0,country_name,region_name,income_group_name,implementing_agency_name,channel_category_name,channel_name,dac_category_name,dac_sector_name,dac_purpose_name,funding_account_name,funding_agency_name,assistance_category_name,activity_name,transaction_type_name,fiscal_year,current_amount,constant_amount,USG_sector_name
0,Afghanistan,South and Central Asia,Low Income Country,Department of Defense,Government,U.S. Government - Department of Defense,Governance,"Conflict, Peace, and Security",Security system management and reform,"Department of the Army, Afghanistan Security F...",Department of the Army,Military,Afghanistan Security Force Fund,Obligations,2011,9941000000,10914528417,Stabilization Operations and Security Sector R...
1,Afghanistan,South and Central Asia,Low Income Country,Department of Defense,Government,U.S. Government - Department of Defense,Governance,"Conflict, Peace, and Security",Security system management and reform,"Department of the Army, Afghanistan Security F...",Department of the Army,Military,Afghanistan Security Force Fund,Obligations,2012,9243000000,9966142830,Stabilization Operations and Security Sector R...
2,Afghanistan,South and Central Asia,Low Income Country,Department of Defense,Government,U.S. Government - Department of Defense,Governance,"Conflict, Peace, and Security",Security system management and reform,"Department of the Army, Afghanistan Security F...",Department of the Army,Military,Afghanistan Security Force Fund,Disbursements,2011,7840175215,8607968532,Stabilization Operations and Security Sector R...
3,Afghanistan,South and Central Asia,Low Income Country,Department of Defense,Government,U.S. Government - Department of Defense,Governance,"Conflict, Peace, and Security",Security system management and reform,"Department of the Army, Afghanistan Security F...",Department of the Army,Military,Afghanistan Security Force Fund,Disbursements,2013,7764310985,8232733951,Stabilization Operations and Security Sector R...
4,Afghanistan,South and Central Asia,Low Income Country,Department of Defense,Government,U.S. Government - Department of Defense,Governance,"Conflict, Peace, and Security",Security system management and reform,"Department of the Army, Afghanistan Security F...",Department of the Army,Military,Afghanistan Security Force Fund,Obligations,2013,6928000000,7345968099,Stabilization Operations and Security Sector R...


In [4]:
# Summarize column dtypes and non-null counts for the raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 722079 entries, 0 to 986232
Data columns (total 18 columns):
country_name                722079 non-null object
region_name                 722079 non-null object
income_group_name           722079 non-null object
implementing_agency_name    722079 non-null object
channel_category_name       722079 non-null object
channel_name                722079 non-null object
dac_category_name           722079 non-null object
dac_sector_name             722079 non-null object
dac_purpose_name            722079 non-null object
funding_account_name        722079 non-null object
funding_agency_name         722079 non-null object
assistance_category_name    722079 non-null object
activity_name               721919 non-null object
transaction_type_name       722079 non-null object
fiscal_year                 722079 non-null int64
current_amount              722079 non-null int64
constant_amount             722079 non-null int64
USG_sector_name          

In [5]:
# Fiscal years present in the data
df['fiscal_year'].unique()

array([2011, 2012, 2013, 2015, 2010, 2008, 2009, 2006, 2014, 2017, 2007,
       2003, 2016, 2000, 2005, 2004, 2002, 2001, 1999, 1991, 1994, 1998,
       1995, 1993, 1992, 1996, 1997, 1990])

## Start paring down dataframe

In [6]:
# implementing_agency_name: the U.S. government agency responsible for
# implementing each program.
df['implementing_agency_name'].unique()

array(['Department of Defense',
       'U.S. Agency for International Development',
       'Department of the Army', 'Department of State',
       'Department of Energy', 'Department of Agriculture',
       'Millennium Challenge Corporation', 'Department of the Interior',
       'Department of Health and Human Services',
       'Department of the Treasury', 'Department of the Air Force',
       'Department of the Navy', 'Open World Leadership Center',
       'Overseas Private Investment Corporation', 'Department of Labor',
       'Department of Justice', 'Department of Transportation',
       'National Science Foundation', 'Peace Corps',
       'United States Institute of Peace',
       'Environmental Protection Agency',
       'Department of Homeland Security', 'Department of Commerce',
       'Trade and Development Agency', 'Inter-American Foundation',
       'African Development Foundation', 'Federal Trade Commission'],
      dtype=object)

In [7]:
# channel_category_name: broad category of the delivery channel, i.e. the
# kind of organization that is provided with funds by the implementing agency.
df['channel_category_name'].unique()

array(['Government', 'Enterprises', 'Multilateral', 'NGO',
       'Universities and Research Institutes', 'Church and Faith Based',
       'Networks', 'Public and Private Partnerships'], dtype=object)

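Since the Implementing Agency sits upstream of the channel (it is the agency that provides the channel with funds), I'll keep the Implementing Agency column and drop the channel columns.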

**Move on to DAC data**

In [8]:
# dac_category_name: only a handful of broad categories -- too general to use.
df['dac_category_name'].unique()

array(['Governance', 'Other', 'Infrastructure', 'Humanitarian',
       'Economic Growth', 'Commodity Assistance', 'Health and Population',
       'Agriculture', 'Education', 'Administrative Costs'], dtype=object)

In [9]:
# dac_sector_name: some of these sectors are not very informative, but the
# column might be a reasonable place to start.
df['dac_sector_name'].unique()

array(['Conflict, Peace, and Security', 'Government and Civil Society',
       'Other Multisector', 'Transport and Storage', 'Energy',
       'Other Social Infrastructure and Services',
       'General Budget Support', 'Emergency Response', 'Industry',
       'Mineral Resources and Mining', 'Banking and Financial Services',
       'Developmental Food Aid/Food Security Assistance', 'Basic Health',
       'Reconstruction Relief and Rehabilitation',
       'Water Supply and Sanitation', 'Trade Policy and Regulations',
       'HIV/AIDS', 'Action Relating To Debt',
       'Other Commodity Assistance', 'General Environmental Protection',
       'Agriculture', 'Business and Other Services', 'Construction',
       'Education, Level Unspecified', 'Secondary Education',
       'Post-Secondary Education', 'Communications',
       'Maternal and Child Health, Family Planning', 'Operating Expenses',
       'Basic Education', 'Administration and Oversight', 'Tourism',
       'Health, General', 'Unall

In [10]:
# dac_purpose_name: probably too granular for a general model, since certain
# programs apply more to some countries/regions than to others.
df['dac_purpose_name'].unique()

array(['Security system management and reform',
       'Public sector policy and administrative management',
       'Multisector aid',
       'Transport policy and administrative management',
       'Electric power transmission and  distribution',
       'Narcotics control', 'Legal and judicial development',
       'General budget support-related aid',
       'Material relief assistance and services', 'Energy manufacturing',
       'Energy policy and administrative management', 'Oil and gas',
       'Financial policy and administrative management',
       'Food aid/Food security programmes',
       'Nuclear energy electric power plants', 'Basic health care',
       'Social/ welfare services',
       'Reconstruction relief and rehabilitation',
       'Water sector policy and administrative management',
       'Road transport', 'Emergency food aid',
       'Trade policy and administrative management',
       'STD control including HIV/AIDS', 'Water supply - large systems',
       'Public

**Other Categories:**

In [11]:
# funding_account_name: too granular to be useful.
df['funding_account_name'].unique()

array(['Department of the Army, Afghanistan Security Forces Fund',
       'Department of the Army, Iraq Security Forces Fund',
       'Foreign Military Financing, Direct Loan Program Account',
       'Foreign Military Financing Program',
       'Department of the Army, Iraq Train and Equip Fund',
       'Economic Support Fund', 'INACTIVE - USAID Grants',
       'Iraq Relief and Reconstruction Fund',
       'Department of State, Andean Counterdrug Programs',
       'International Narcotics Control and Law Enforcement',
       'Department of Energy, Defense Nuclear Nonproliferation',
       'Department of the Army, Pakistan Counterinsurgency Fund',
       'Department of Defense, Natural Resources Risk Remediation Fund',
       'Department of  State, Migration and Refugee Assistance',
       'Public Law 480 Title I Food for Progress Credits, Program Account',
       'Department of Defense, Overseas Humanitarian, Disaster, and Civic Aid',
       'Department of the Army, Operations and Main

In [12]:
# funding_agency_name: looks redundant, since implementing_agency_name
# already carries much of the same information.
df['funding_agency_name'].unique()

array(['Department of the Army', 'Department of Defense',
       'U.S. Agency for International Development',
       'Executive Office of the President', 'Department of State',
       'Department of Energy', 'Department of Agriculture',
       'Millennium Challenge Corporation', 'Department of the Interior',
       'Department of Health and Human Services',
       'Department of the Treasury', 'Open World Leadership Center',
       'Department of Labor', 'Department of Justice', 'Peace Corps',
       'Environmental Protection Agency',
       'Department of Homeland Security', 'Department of Commerce',
       'Trade and Development Agency', 'Inter-American Foundation',
       'Department of Transportation', 'African Development Foundation',
       'Corps of Engineers, Civil', 'National Science Foundation',
       'Department of the Navy', 'Department of the Air Force',
       'Federal Trade Commission'], dtype=object)

In [13]:
# assistance_category_name: possibly redundant, since the implementing agency
# essentially captures the same information.
df['assistance_category_name'].unique()

array(['Military', 'Economic'], dtype=object)

In [14]:
# activity_name: free-text activity descriptions -- remove this column.
df['activity_name'].unique()

array(['Afghanistan Security Force Fund', 'Iraq Security Force Fund',
       'Department of Defense - Direct Loans', ...,
       'Total Freight for  Wheat, Soy Blend',
       'Commodity Value for  Vegetable Oil, bulk / Metric Tons: -5271.',
       'Total Freight for  Bulgur / Metric Tons: 7406.8.'], dtype=object)

In [15]:
# DECISION: use Obligations rather than Disbursements.
# The Disbursements data did not have enough records from the 1990s and
# included negative numbers, which likely would have thrown off the model.
#
# Original plan (superseded by the decision above): keep only Disbursements,
# since that is when the money/aid actually reaches the country it was
# allotted to. "Obligations" implies the setting aside of money for a cause,
# and it can take time for that money to reach the country it was promised
# to; to track the actual results of the aid, it made more sense to look at
# when the money arrived.
#
# Potential additional EDA: how long does it take, on average, for money
# obligated to a cause to be disbursed?

df['transaction_type_name'].unique()

array(['Obligations', 'Disbursements'], dtype=object)

In [16]:
# current_amount vs constant_amount:
# current_amount is the dollar amount in the year the money was
# obligated/disbursed; constant_amount converts all of those values to
# 2016 USD so that amounts from different years are comparable.
# (Keep only constant_amount.)

In [17]:
# USG_sector_name: more detailed and better described than dac_sector_name,
# so use these labels instead.
df['USG_sector_name'].unique()

array(['Stabilization Operations and Security Sector Reform',
       'Good Governance', 'Peace and Security - General',
       'Macroeconomic Foundation for Growth', 'Infrastructure',
       'Counter-Narcotics', 'Rule of Law and Human Rights',
       'Protection, Assistance and Solutions', 'Manufacturing',
       'Mining and Natural Resources', 'Financial Sector',
       'Humanitarian Assistance - General', 'Other Public Health Threats',
       'Social Services', 'Water Supply and Sanitation',
       'Trade and Investment', 'HIV/AIDS',
       'Policies, Regulations, and Systems', 'Environment',
       'Social Assistance', 'Conflict Mitigation and Reconciliation',
       'Debt Relief', 'Multi-sector - Unspecified',
       'Combating Weapons of Mass Destruction (WMD)', 'Agriculture',
       'Private Sector Competitiveness', 'Health - General',
       'Education and Social Services - General', 'Higher Education',
       'Counter-Terrorism', 'Maternal and Child Health', 'Civil Society',
  

### Delete unnecessary columns

In [18]:
# Drop the columns ruled out during the exploration above. The labels are
# passed as an ordered list (rather than a set literal) and the result is
# reassigned instead of mutating the frame with inplace=True.
df = df.drop(columns=[
    'channel_category_name',
    'channel_name',
    'dac_category_name',
    'dac_sector_name',
    'dac_purpose_name',
    'funding_account_name',
    'funding_agency_name',
    'activity_name',
    'current_amount',
])

### Limit data to 'Obligations'

In [19]:
# Keep only the 'Obligations' rows. The explicit .copy() makes the result an
# independent DataFrame rather than a selection derived from the original, so
# later modifications are not flagged by pandas' SettingWithCopyWarning.
df = df.loc[df['transaction_type_name'] == 'Obligations'].copy()

In [20]:
# The transaction type is no longer needed as a column once the data has been
# limited to Obligations; drop it (list of labels, result reassigned rather
# than mutated in place).
df = df.drop(columns=['transaction_type_name'])

In [21]:
# Fiscal years still present after limiting the data to Obligations
df['fiscal_year'].unique()

array([2011, 2012, 2013, 2010, 2015, 2008, 2009, 2006, 2014, 2017, 2007,
       2003, 2016, 2000, 2005, 2004, 2002, 2001, 1999, 1991, 1994, 1998,
       1995, 1993, 1992, 1996, 1997, 1990])

In [22]:
# Check the remaining columns, dtypes, and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359342 entries, 0 to 986232
Data columns (total 8 columns):
country_name                359342 non-null object
region_name                 359342 non-null object
income_group_name           359342 non-null object
implementing_agency_name    359342 non-null object
assistance_category_name    359342 non-null object
fiscal_year                 359342 non-null int64
constant_amount             359342 non-null int64
USG_sector_name             359342 non-null object
dtypes: int64(2), object(6)
memory usage: 24.7+ MB


In [23]:
# Pickle cleaned dataframe (written to the current working directory)
df.to_pickle('./clean_usaid.pkl')