In [54]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [55]:
!pip install xgboost



In [56]:
# Data
import numpy as np
import pandas as pd

# Modeling
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
#from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
%config InlineBackend.figure_formats = ['retina']
sns.set_style("white")

In [57]:
# Import pickled df with all data
df = pd.read_pickle('./all_data.pkl')

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54262 entries, 0 to 54261
Data columns (total 10 columns):
country_name                54262 non-null object
region_name                 54262 non-null object
income_group_name           54262 non-null object
fiscal_year                 54262 non-null int64
HDI_Change                  54262 non-null float64
assistance_category_name    54262 non-null object
implementing_agency_name    54262 non-null object
dac_category_name           54262 non-null object
dac_sector_name             54262 non-null object
constant_amount             54262 non-null int64
dtypes: float64(1), int64(2), object(7)
memory usage: 4.1+ MB


In [59]:
df.head()

Unnamed: 0,country_name,region_name,income_group_name,fiscal_year,HDI_Change,assistance_category_name,implementing_agency_name,dac_category_name,dac_sector_name,constant_amount
0,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Agriculture,Agriculture,81354
1,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Commodity Assistance,Developmental Food Aid/Food Security Assistance,6971711
2,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Education,Basic Education,4392782
3,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Other,General Environmental Protection,34632
4,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Health and Human Services,Governance,"Conflict, Peace, and Security",915696


In [60]:
df['Agency_and_Sector'] = df['implementing_agency_name'] + ': ' + df['dac_sector_name']

In [62]:
df.head(10)

Unnamed: 0,country_name,region_name,income_group_name,fiscal_year,HDI_Change,assistance_category_name,implementing_agency_name,dac_category_name,dac_sector_name,constant_amount,Agency_and_Sector
0,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Agriculture,Agriculture,81354,Department of Agriculture: Agriculture
1,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Commodity Assistance,Developmental Food Aid/Food Security Assistance,6971711,Department of Agriculture: Developmental Food ...
2,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Education,Basic Education,4392782,Department of Agriculture: Basic Education
3,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Agriculture,Other,General Environmental Protection,34632,Department of Agriculture: General Environment...
4,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Health and Human Services,Governance,"Conflict, Peace, and Security",915696,Department of Health and Human Services: Confl...
5,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Health and Human Services,Health and Population,Basic Health,3976353,Department of Health and Human Services: Basic...
6,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of Labor,Governance,"Conflict, Peace, and Security",3924413,"Department of Labor: Conflict, Peace, and Secu..."
7,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of State,Economic Growth,Business and Other Services,217919,Department of State: Business and Other Services
8,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of State,Governance,"Conflict, Peace, and Security",14781954,"Department of State: Conflict, Peace, and Secu..."
9,Afghanistan,South and Central Asia,Low Income Country,2003,0.01,Economic,Department of State,Governance,Government and Civil Society,1954992,Department of State: Government and Civil Society


In [65]:
# Transform df so entries for year/country are grouped (want one row per country per year)
df = df.groupby(['region_name', 'income_group_name', 'fiscal_year', 
                 'country_name', 'HDI_Change', 'Agency_and_Sector']
               )['constant_amount'].sum().unstack('Agency_and_Sector').reset_index()
df.head()

Agency_and_Sector,region_name,income_group_name,fiscal_year,country_name,HDI_Change,African Development Foundation: Agriculture,African Development Foundation: Banking and Financial Services,African Development Foundation: Basic Education,African Development Foundation: Basic Health,African Development Foundation: Business and Other Services,...,U.S. Agency for International Development: Post-Secondary Education,U.S. Agency for International Development: Program Design and Learning,U.S. Agency for International Development: Reconstruction Relief and Rehabilitation,U.S. Agency for International Development: Secondary Education,U.S. Agency for International Development: Trade Policy and Regulations,U.S. Agency for International Development: Transport and Storage,U.S. Agency for International Development: Unallocated/ Unspecified,U.S. Agency for International Development: Water Supply and Sanitation,"United States Institute of Peace: Conflict, Peace, and Security",United States Institute of Peace: Government and Civil Society
0,East Asia and Oceania,High Income Country,1991,China (P.R. Hong Kong),0.005,,,,,,...,,,,,,,,,,
1,East Asia and Oceania,High Income Country,1991,Korea Republic,0.011,,,,,,...,,,,,,,,,,
2,East Asia and Oceania,High Income Country,1991,Singapore,0.011,,,,,,...,,,,,,,,,,
3,East Asia and Oceania,High Income Country,1992,China (P.R. Hong Kong),0.005,,,,,,...,,,,,,,,,,
4,East Asia and Oceania,High Income Country,1992,Korea Republic,0.007,,,,,,...,,,,,,,,,,


In [67]:
# Fill all NaNs with 0
df.fillna(0, inplace=True)

In [69]:
df.head()

Agency_and_Sector,region_name,income_group_name,fiscal_year,country_name,HDI_Change,African Development Foundation: Agriculture,African Development Foundation: Banking and Financial Services,African Development Foundation: Basic Education,African Development Foundation: Basic Health,African Development Foundation: Business and Other Services,...,U.S. Agency for International Development: Post-Secondary Education,U.S. Agency for International Development: Program Design and Learning,U.S. Agency for International Development: Reconstruction Relief and Rehabilitation,U.S. Agency for International Development: Secondary Education,U.S. Agency for International Development: Trade Policy and Regulations,U.S. Agency for International Development: Transport and Storage,U.S. Agency for International Development: Unallocated/ Unspecified,U.S. Agency for International Development: Water Supply and Sanitation,"United States Institute of Peace: Conflict, Peace, and Security",United States Institute of Peace: Government and Civil Society
0,East Asia and Oceania,High Income Country,1991,China (P.R. Hong Kong),0.005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,East Asia and Oceania,High Income Country,1991,Korea Republic,0.011,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,East Asia and Oceania,High Income Country,1991,Singapore,0.011,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,East Asia and Oceania,High Income Country,1992,China (P.R. Hong Kong),0.005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,East Asia and Oceania,High Income Country,1992,Korea Republic,0.007,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4056 entries, 0 to 4055
Columns: 302 entries, region_name to United States Institute of Peace: Government and Civil Society
dtypes: float64(298), int64(1), object(3)
memory usage: 9.3+ MB


#### Split data into data/target categories

In [71]:
# Define small input df
X = df.loc[:, df.columns != 'HDI_Change']

# Define Targets
y = df['HDI_Change']

#### Transform categorical variables

In [72]:
# Transform 'country_name', 'region_name', 'implementing_agency_name', 
# and 'USG_sector_name' using get_dummies

X = pd.concat([X, pd.get_dummies(X[['region_name', 'country_name']])], axis=1)
X.drop(['country_name', 'region_name'], axis=1, inplace=True)

In [73]:
# Change assistance_category_name to Economic_Assistance (0 or 1)
#X['Economic_Assistance'] = X.assistance_category_name.apply(lambda x: 1 if x == 'Economic' else 0)

# Drop assistance_category_name column
#X.drop('assistance_category_name', axis=1, inplace=True)

In [74]:
# Encode Income Group Name:
# 1 = Low Income Country
# 2 = Lower Middle Income Country
# 3 = Upper Middle Income Country
# 4 = High Income Country

def rankIncomes(incomeClass):
    if incomeClass == 'Low Income Country':
        return 0
    elif incomeClass == 'Lower Middle Income Country':
        return 1
    elif incomeClass == 'Upper Middle Income Country':
        return 2
    else:
        return 3

# Apply function and drop original column
X['Country_Income_Class'] = X.income_group_name.apply(rankIncomes)
X.drop('income_group_name', axis=1, inplace=True)

In [75]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4056 entries, 0 to 4055
Columns: 490 entries, fiscal_year to Country_Income_Class
dtypes: float64(297), int64(2), uint8(191)
memory usage: 10.0 MB


In [79]:
list(X.columns)

['fiscal_year',
 'African Development Foundation: Agriculture',
 'African Development Foundation: Banking and Financial Services',
 'African Development Foundation: Basic Education',
 'African Development Foundation: Basic Health',
 'African Development Foundation: Business and Other Services',
 'African Development Foundation: Conflict, Peace, and Security',
 'African Development Foundation: Construction',
 'African Development Foundation: Developmental Food Aid/Food Security Assistance',
 'African Development Foundation: Education, Level Unspecified',
 'African Development Foundation: Energy',
 'African Development Foundation: General Environmental Protection',
 'African Development Foundation: Government and Civil Society',
 'African Development Foundation: HIV/AIDS',
 'African Development Foundation: Health, General',
 'African Development Foundation: Industry',
 'African Development Foundation: Maternal and Child Health, Family Planning',
 'African Development Foundation: Mineral 

In [13]:
# Look at feature collinearity 
#plt.figure(figsize=(200, 200))
#sns.set_context("paper")
#sns.heatmap(X.corr(), annot=False, cmap='coolwarm', vmin=-1, vmax=1)
#plt.savefig('feature_correlation.png', bbox_inches = 'tight');

### Train/Val/Test Split

In [77]:
#Split the data 60 - 20 - 20 train/val/test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=43)

### Standardize

In [15]:
#After train/test split, Standardize numerical features (e.g. constant_amount)
from sklearn.preprocessing import StandardScaler

scaled_train = X_train.copy()
scaled_test = X_test.copy()

col_names = ['African Development Foundation: Agriculture', 
             'African Development Foundation: Banking and Financial Services', 
             'African Development Foundation: Basic Education',
             'African Development Foundation: Basic Health',
             'African Development Foundation: Business and Other Services',
             'African Development Foundation: Conflict, Peace, and Security',
             'African Development Foundation: Construction',
             'African Development Foundation: Developmental Food Aid/Food Security Assistance',
             'African Development Foundation: Education, Level Unspecified',
             'African Development Foundation: Energy',
             'African Development Foundation: General Environmental Protection',
             'African Development Foundation: Government and Civil Society',
             'African Development Foundation: HIV/AIDS',
             'African Development Foundation: Health, General',
             'African Development Foundation: Industry',
             'African Development Foundation: Maternal and Child Health, Family Planning',
             'African Development Foundation: Mineral Resources and Mining',
             'African Development Foundation: Operating Expenses',
             'African Development Foundation: Other Multisector',
             'African Development Foundation: Other Social Infrastructure and Services',
             'African Development Foundation: Post-Secondary Education',
             'African Development Foundation: Secondary Education',
             'African Development Foundation: Tourism',
             'African Development Foundation: Trade Policy and Regulations',
             'African Development Foundation: Transport and Storage',
             'African Development Foundation: Water Supply and Sanitation',
             'Department of Agriculture: Action Relating To Debt',
             'Department of Agriculture: Agriculture',
             'Department of Agriculture: Basic Education',
             'Department of Agriculture: Basic Health',
             'Department of Agriculture: Conflict, Peace, and Security',
             'Department of Agriculture: Developmental Food Aid/Food Security Assistance',
             'Department of Agriculture: Disaster Prevention and Preparedness',
             'Department of Agriculture: Emergency Response',
             'Department of Agriculture: General Environmental Protection',
             'Department of Agriculture: Government and Civil Society',
             'Department of Agriculture: Health, General',
             'Department of Agriculture: Other Multisector',
             'Department of Agriculture: Tourism',
             'Department of Agriculture: Trade Policy and Regulations',
             'Department of Agriculture: Water Supply and Sanitation',
             'Department of Commerce: Agriculture',
             'Department of Commerce: Business and Other Services',
             'Department of Commerce: Disaster Prevention and Preparedness',
             'Department of Commerce: Energy',
             'Department of Commerce: General Environmental Protection',
             'Department of Commerce: Mineral Resources and Mining',
             'Department of Commerce: Other Multisector',
             'Department of Commerce: Post-Secondary Education',
             'Department of Commerce: Trade Policy and Regulations',
             'Department of Defense: Agriculture',
             'Department of Defense: Banking and Financial Services',
             'Department of Defense: Basic Education',
             'Department of Defense: Basic Health',
             'Department of Defense: Business and Other Services',
             'Department of Defense: Communications',
             'Department of Defense: Conflict, Peace, and Security',
             'Department of Defense: Construction',
             'Department of Defense: Disaster Prevention and Preparedness',
             'Department of Defense: Education, Level Unspecified',
             'Department of Defense: Emergency Response',
             'Department of Defense: Energy',
             'Department of Defense: General Environmental Protection',
             'Department of Defense: Government and Civil Society',
             'Department of Defense: HIV/AIDS',
             'Department of Defense: Health, General',
             'Department of Defense: Industry',
             'Department of Defense: Mineral Resources and Mining',
             'Department of Defense: Other Multisector',
             'Department of Defense: Other Social Infrastructure and Services',
             'Department of Defense: Post-Secondary Education',
             'Department of Defense: Reconstruction Relief and Rehabilitation',
             'Department of Defense: Secondary Education',
             'Department of Defense: Transport and Storage',
             'Department of Defense: Water Supply and Sanitation',
             'Department of Energy: Conflict, Peace, and Security',
             'Department of Energy: Energy',
             'Department of Energy: General Environmental Protection',
             'Department of Energy: Trade Policy and Regulations',
             'Department of Health and Human Services: Basic Health',
             'Department of Health and Human Services: Conflict, Peace, and Security',
             'Department of Health and Human Services: Emergency Response',
             'Department of Health and Human Services: HIV/AIDS',
             'Department of Health and Human Services: Health, General',
             'Department of Health and Human Services: Maternal and Child Health, Family Planning',
             'Department of Health and Human Services: Operating Expenses',
             'Department of Health and Human Services: Water Supply and Sanitation', 
             'Department of Homeland Security: Conflict, Peace, and Security',
             'Department of Homeland Security: Government and Civil Society',
             'Department of Homeland Security: Other Social Infrastructure and Services',
             'Department of Homeland Security: Trade Policy and Regulations',
             'Department of Justice: Conflict, Peace, and Security',
             'Department of Justice: General Environmental Protection',
             'Department of Justice: Government and Civil Society',
             'Department of Justice: Operating Expenses',
             'Department of Justice: Other Social Infrastructure and Services',
             'Department of Labor: Basic Education',
             'Department of Labor: Conflict, Peace, and Security',
             'Department of Labor: Government and Civil Society',
             'Department of Labor: HIV/AIDS',
             'Department of Labor: Other Social Infrastructure and Services',
             'Department of State: Agriculture',
             'Department of State: Banking and Financial Services',
             'Department of State: Basic Education',
             'Department of State: Basic Health',
             'Department of State: Business and Other Services',
             'Department of State: Communications',
             'Department of State: Conflict, Peace, and Security',
             'Department of State: Disaster Prevention and Preparedness',
             'Department of State: Education, Level Unspecified',
             'Department of State: Emergency Response',
             'Department of State: Energy',
             'Department of State: General Environmental Protection',
             'Department of State: Government and Civil Society',
             'Department of State: HIV/AIDS',
             'Department of State: Health, General',
             'Department of State: Industry',
             'Department of State: Maternal and Child Health, Family Planning',
             'Department of State: Mineral Resources and Mining',
             'Department of State: Operating Expenses',
             'Department of State: Other Multisector',
             'Department of State: Other Social Infrastructure and Services',
             'Department of State: Post-Secondary Education',
             'Department of State: Reconstruction Relief and Rehabilitation',
             'Department of State: Secondary Education',
             'Department of State: Trade Policy and Regulations',
             'Department of State: Water Supply and Sanitation',
             'Department of Transportation: Transport and Storage',
             'Department of the Air Force: Conflict, Peace, and Security',
             'Department of the Army: Agriculture',
             'Department of the Army: Basic Education',
             'Department of the Army: Basic Health',
             'Department of the Army: Communications',
             'Department of the Army: Conflict, Peace, and Security',
             'Department of the Army: Developmental Food Aid/Food Security Assistance',
             'Department of the Army: Education, Level Unspecified',
             'Department of the Army: Emergency Response',
             'Department of the Army: Energy',
             'Department of the Army: Government and Civil Society',
             'Department of the Army: Health, General',
             'Department of the Army: Industry',
             'Department of the Army: Mineral Resources and Mining',
             'Department of the Army: Other Multisector',
             'Department of the Army: Other Social Infrastructure and Services',
             'Department of the Army: Reconstruction Relief and Rehabilitation',
             'Department of the Army: Transport and Storage',
             'Department of the Army: Water Supply and Sanitation',
             'Department of the Interior: Basic Education',
             'Department of the Interior: Business and Other Services',
             'Department of the Interior: Communications',
             'Department of the Interior: Conflict, Peace, and Security',
             'Department of the Interior: Disaster Prevention and Preparedness',
             'Department of the Interior: Education, Level Unspecified',
             'Department of the Interior: Emergency Response',
             'Department of the Interior: Energy',
             'Department of the Interior: General Budget Support',
             'Department of the Interior: General Environmental Protection',
             'Department of the Interior: Government and Civil Society',
             'Department of the Interior: Health, General',
             'Department of the Interior: Operating Expenses',
             'Department of the Interior: Other Multisector',
             'Department of the Interior: Other Social Infrastructure and Services',
             'Department of the Interior: Post-Secondary Education',
             'Department of the Interior: Tourism',
             'Department of the Interior: Water Supply and Sanitation',
             'Department of the Navy: Conflict, Peace, and Security',
             'Department of the Treasury: Action Relating To Debt',
             'Department of the Treasury: Banking and Financial Services', 
             'Department of the Treasury: Business and Other Services',
             'Department of the Treasury: Energy',
             'Department of the Treasury: Government and Civil Society',
             'Department of the Treasury: Unallocated/ Unspecified',
             'Environmental Protection Agency: Conflict, Peace, and Security',
             'Environmental Protection Agency: General Environmental Protection',
             'Environmental Protection Agency: Water Supply and Sanitation',
             'Federal Trade Commission: Business and Other Services',
             'Inter-American Foundation: Agriculture',
             'Inter-American Foundation: Banking and Financial Services',
             'Inter-American Foundation: Basic Education',
             'Inter-American Foundation: Basic Health',
             'Inter-American Foundation: Business and Other Services',
             'Inter-American Foundation: Communications',
             'Inter-American Foundation: Conflict, Peace, and Security',
             'Inter-American Foundation: Disaster Prevention and Preparedness',
             'Inter-American Foundation: Education, Level Unspecified',
             'Inter-American Foundation: Emergency Response',
             'Inter-American Foundation: Energy',
             'Inter-American Foundation: General Environmental Protection',
             'Inter-American Foundation: Government and Civil Society',
             'Inter-American Foundation: HIV/AIDS',
             'Inter-American Foundation: Health, General',
             'Inter-American Foundation: Industry',
             'Inter-American Foundation: Mineral Resources and Mining',
             'Inter-American Foundation: Operating Expenses',
             'Inter-American Foundation: Other Multisector',
             'Inter-American Foundation: Other Social Infrastructure and Services',
             'Inter-American Foundation: Post-Secondary Education',
             'Inter-American Foundation: Reconstruction Relief and Rehabilitation',
             'Inter-American Foundation: Secondary Education',
             'Inter-American Foundation: Tourism',
             'Inter-American Foundation: Trade Policy and Regulations',
             'Inter-American Foundation: Water Supply and Sanitation',
             'Millennium Challenge Corporation: Agriculture',
             'Millennium Challenge Corporation: Banking and Financial Services',
             'Millennium Challenge Corporation: Basic Education',
             'Millennium Challenge Corporation: Basic Health',
             'Millennium Challenge Corporation: Business and Other Services',
             'Millennium Challenge Corporation: Communications',
             'Millennium Challenge Corporation: Education, Level Unspecified',
             'Millennium Challenge Corporation: Energy',
             'Millennium Challenge Corporation: General Environmental Protection',
             'Millennium Challenge Corporation: Government and Civil Society',
             'Millennium Challenge Corporation: HIV/AIDS',
             'Millennium Challenge Corporation: Health, General',
             'Millennium Challenge Corporation: Industry',
             'Millennium Challenge Corporation: Operating Expenses',
             'Millennium Challenge Corporation: Other Multisector',
             'Millennium Challenge Corporation: Other Social Infrastructure and Services',
             'Millennium Challenge Corporation: Post-Secondary Education',
             'Millennium Challenge Corporation: Secondary Education',
             'Millennium Challenge Corporation: Tourism',
             'Millennium Challenge Corporation: Trade Policy and Regulations',
             'Millennium Challenge Corporation: Transport and Storage',
             'Millennium Challenge Corporation: Water Supply and Sanitation',
             'National Science Foundation: Conflict, Peace, and Security',
             'Open World Leadership Center: Government and Civil Society',
             'Open World Leadership Center: Other Multisector',
             'Overseas Private Investment Corporation: Banking and Financial Services',
             'Peace Corps: Basic Education',
             'Peace Corps: Basic Health',
             'Peace Corps: General Environmental Protection',
             'Peace Corps: HIV/AIDS',
             'Peace Corps: Operating Expenses',
             'Peace Corps: Other Multisector',
             'Peace Corps: Other Social Infrastructure and Services',
             'Trade and Development Agency: Agriculture',
             'Trade and Development Agency: Banking and Financial Services',
             'Trade and Development Agency: Basic Health',
             'Trade and Development Agency: Business and Other Services',
             'Trade and Development Agency: Communications',
             'Trade and Development Agency: Conflict, Peace, and Security',
             'Trade and Development Agency: Construction',
             'Trade and Development Agency: Disaster Prevention and Preparedness',
             'Trade and Development Agency: Education, Level Unspecified',
             'Trade and Development Agency: Emergency Response',
             'Trade and Development Agency: Energy',
             'Trade and Development Agency: General Environmental Protection',
             'Trade and Development Agency: Government and Civil Society',
             'Trade and Development Agency: Health, General',
             'Trade and Development Agency: Industry',
             'Trade and Development Agency: Maternal and Child Health, Family Planning',
             'Trade and Development Agency: Mineral Resources and Mining',
             'Trade and Development Agency: Operating Expenses',
             'Trade and Development Agency: Other Multisector',
             'Trade and Development Agency: Other Social Infrastructure and Services',
             'Trade and Development Agency: Post-Secondary Education',
             'Trade and Development Agency: Reconstruction Relief and Rehabilitation',
             'Trade and Development Agency: Secondary Education',
             'Trade and Development Agency: Tourism',
             'Trade and Development Agency: Trade Policy and Regulations',
             'Trade and Development Agency: Transport and Storage',
             'Trade and Development Agency: Water Supply and Sanitation',
             'U.S. Agency for International Development: Administration and Oversight',
             'U.S. Agency for International Development: Agriculture',
             'U.S. Agency for International Development: Banking and Financial Services',
             'U.S. Agency for International Development: Basic Education',
             'U.S. Agency for International Development: Basic Health',
             'U.S. Agency for International Development: Business and Other Services',
             'U.S. Agency for International Development: Communications',
             'U.S. Agency for International Development: Conflict, Peace, and Security',
             'U.S. Agency for International Development: Developmental Food Aid/Food Security Assistance',
             'U.S. Agency for International Development: Disaster Prevention and Preparedness',
             'U.S. Agency for International Development: Education, Level Unspecified',
             'U.S. Agency for International Development: Emergency Response',
             'U.S. Agency for International Development: Energy',
             'U.S. Agency for International Development: General Budget Support',
             'U.S. Agency for International Development: General Environmental Protection',
             'U.S. Agency for International Development: Government and Civil Society',
             'U.S. Agency for International Development: HIV/AIDS',
             'U.S. Agency for International Development: Health, General',
             'U.S. Agency for International Development: Industry',
             'U.S. Agency for International Development: Maternal and Child Health, Family Planning',
             'U.S. Agency for International Development: Mineral Resources and Mining',
             'U.S. Agency for International Development: Operating Expenses',
             'U.S. Agency for International Development: Other Commodity Assistance',
             'U.S. Agency for International Development: Other Multisector',
             'U.S. Agency for International Development: Other Social Infrastructure and Services', 
             'U.S. Agency for International Development: Post-Secondary Education',
             'U.S. Agency for International Development: Program Design and Learning',
             'U.S. Agency for International Development: Reconstruction Relief and Rehabilitation',
             'U.S. Agency for International Development: Secondary Education',
             'U.S. Agency for International Development: Trade Policy and Regulations',
             'U.S. Agency for International Development: Transport and Storage',
             'U.S. Agency for International Development: Unallocated/ Unspecified',
             'U.S. Agency for International Development: Water Supply and Sanitation',
             'United States Institute of Peace: Conflict, Peace, and Security',
             'United States Institute of Peace: Government and Civil Society']
features_train = scaled_train[col_names]
features_test = scaled_test[col_names]

scaler = StandardScaler()
train_amount = scaler.fit_transform(features_train)
test_amount = scaler.transform(features_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  del sys.path[0]


In [16]:
X_train['constant_amount'] = train_amount
X_test['constant_amount'] = test_amount

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
X_train.head()

Unnamed: 0,constant_amount,fiscal_year,region_name_East Asia and Oceania,region_name_Europe and Eurasia,region_name_Middle East and North Africa,region_name_South and Central Asia,region_name_Sub-Saharan Africa,region_name_Western Hemisphere,country_name_Afghanistan,country_name_Albania,...,USG_sector_name_Rule of Law and Human Rights,USG_sector_name_Social Assistance,USG_sector_name_Social Services,USG_sector_name_Stabilization Operations and Security Sector Reform,USG_sector_name_Trade and Investment,USG_sector_name_Transnational Crime,USG_sector_name_Tuberculosis,USG_sector_name_Water Supply and Sanitation,Economic_Assistance,Country_Income_Class
52566,-0.083203,2007,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
37595,-0.090869,2008,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,2
18259,-0.090986,2012,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
1604,-0.076017,2003,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,2
21395,-0.091449,2007,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Check distribution of target 

In [18]:
#histogram and normal probability plot
#from scipy.stats import norm
#sns.distplot(train['SalePrice'],fit=norm);
#fig = plt.figure()
#res = stats.probplot(train['SalePrice'], plot=plt)


## Initial OLS

In [19]:
# Create OLS model
#ols_model = sm.OLS(y_train, X_train)

# Fit OLS model to training set
#fit = ols_model.fit()

# Print summary statistics of the model's performance
#fit.summary()

## LassoCV

In [20]:
# Run cross validation, find the best alpha, refit the model on all the data with that alpha
#alphavec = 10**np.linspace(-4,4)

#lasso_model = LassoCV(alphas = alphavec, cv=5)
#lasso_model.fit(X_train, y_train)

# Best alpha value:
#lasso_model.alpha_

In [21]:
# These are the (standardized) coefficients found when it refit using that best alpha
#list(zip(X_train.columns, lasso_model.coef_))

In [22]:
# Make predictions on the test set using the model
#test_set_pred = lasso_model.predict(X_test)

# Evaluation:
#r2_score(y_test, test_set_pred)

## RidgeCV

In [23]:
#ridge_model = RidgeCV(alphas = alphavec, cv=5)
#ridge_model.fit(X_train, y_train)

#list(zip(X_train.columns, ridge_model.coef_))

In [24]:
# Make predictions on the test set using the model
#test_set_pred = ridge_model.predict(X_test)

# Evaluation:
#r2_score(y_test, test_set_pred)

## RandomForestRegressor

In [25]:
#from sklearn.model_selection import cross_val_score, GridSearchCV
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.preprocessing import MinMaxScaler

In [26]:
#gsc = GridSearchCV(estimator=RandomForestRegressor(), 
#                   param_grid={
#                       'max_depth': range(3,7), 
#                       'n_estimators': (10, 50, 100, 1000)}, 
#                   cv=5, scoring='neg_mean_squared_error', 
#                   verbose=0, n_jobs=-1)

#grid_result = gsc.fit(X_train, y_train)
#best_params = grid_result.best_params_

In [27]:
#rfr_model = RandomForestRegressor(max_depth=best_params["max_depth"], 
#                                  n_estimators=best_params["n_estimators"], 
#                                  random_state=42, verbose=False)

#rfr_model.fit(X_train, y_train)

In [28]:
# Make predictions on the test set using the model
#rfr_test_set_pred = rfr_model.predict(X_test)

# Evaluation:
#r2_score(y_test, rfr_test_set_pred)

## XGBoost

In [29]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [30]:
#Evaluate models with Root Mean Squared Error
#def rmse(actuals, preds):
#    return np.sqrt(((actuals - preds) ** 2).mean())

In [31]:
gbm = xgb.XGBRegressor(n_estimators=30000, #arbitrary large number
                       max_depth=3,
                       objective="reg:squarederror",
                       learning_rate=.1, 
                       subsample=1,
                       min_child_weight=1,
                       colsample_bytree=.8)

In [32]:
eval_set=[(X_train, y_train),(X_val, y_val)] #tracking train/validation error as we go
fit_model = gbm.fit(X_train, y_train)#, 
     #               eval_set=eval_set,
   #                 eval_metric='rmse',
    #                early_stopping_rounds=20,
    #                verbose=True 
    #               )


  if getattr(data, 'base', None) is not None and \


KeyboardInterrupt: 

In [None]:
fit_model_pred = fit_model.predict(X_test)

r2_score(y_test, fit_model_pred)
#Measure Accuracy instead

In [None]:
#To tune, we should use validation results and ignore test until 
#final verification. So here's the validation error benchmark we want to beat:
#rmse(gbm.predict(X_val, ntree_limit=gbm.best_ntree_limit),y_val) 

In [None]:
#### Standard Scaling

#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler

## This step fits the Standard Scaler to the training data
## Essentially it finds the mean and standard deviation of each variable in the training set

#std = StandardScaler()
#std.fit(X_train.values)

## This step applies the scaler to the train set.
## It subtracts the mean it learned in the previous step and then divides by the standard deviation

#X_tr = std.transform(X_train.values)

## Apply the scaler to the test set

#X_te = std.transform(X_test.values)