In [1]:
# Widen the notebook container to 95% of the page so wide tables and plots fit.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
# Data
import numpy as np
import pandas as pd

# Modeling
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
#from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
%config InlineBackend.figure_formats = ['retina']
sns.set_style("white")

In [3]:
# Import pickled df with all data
# NOTE(review): unpickling can execute arbitrary code, so only load
# './all_data.pkl' if it comes from a trusted source.
df = pd.read_pickle('./all_data.pkl')

In [4]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58567 entries, 0 to 58566
Data columns (total 9 columns):
country_name                58567 non-null object
region_name                 58567 non-null object
income_group_name           58567 non-null object
fiscal_year                 58567 non-null int64
HDI_Change                  58567 non-null float64
assistance_category_name    58567 non-null object
implementing_agency_name    58567 non-null object
USG_sector_name             58567 non-null object
constant_amount             58567 non-null int64
dtypes: float64(1), int64(2), object(6)
memory usage: 4.0+ MB


#### Split data into features (X) and target (y)

In [5]:
# Predictor columns used for modeling ('fiscal_year' is left out for now)
feature_columns = [
    'region_name',
    'country_name',
    'income_group_name',
    'assistance_category_name',
    'implementing_agency_name',
    'USG_sector_name',
    'constant_amount',
]
X = df[feature_columns]

# Regression target
y = df['HDI_Change']

#### Transform categorical variables

In [6]:
# One-hot encode 'region_name', 'country_name', 'implementing_agency_name'
# and 'USG_sector_name'. Passing `columns=` makes get_dummies replace each
# listed column with its indicator columns (appended after the untouched
# columns), so no separate concat/drop step is needed.
X = pd.get_dummies(X, columns=['region_name', 'country_name',
                               'implementing_agency_name',
                               'USG_sector_name'])

In [7]:
# Binary flag: 1 where the assistance category is 'Economic', otherwise 0
X['Economic_Assistance'] = (X['assistance_category_name'] == 'Economic').astype('int64')

# The source text column is no longer needed once the flag exists
X = X.drop(columns='assistance_category_name')

In [8]:
# Encode Income Group Name as an ordinal rank:
# 0 = Low Income Country
# 1 = Lower Middle Income Country
# 2 = Upper Middle Income Country
# 3 = any other value (intended for High Income Country)

def rankIncomes(incomeClass):
    """Return the ordinal rank (0-3) of an income-group label.

    The three explicitly named groups map to 0, 1 and 2 in ascending order
    of income; every other input falls through to 3.
    """
    ranked_labels = (
        'Low Income Country',
        'Lower Middle Income Country',
        'Upper Middle Income Country',
    )
    for rank, label in enumerate(ranked_labels):
        if incomeClass == label:
            return rank
    return 3

# Derive the ordinal income rank, then discard the source text column
X['Country_Income_Class'] = X['income_group_name'].map(rankIncomes)
X = X.drop(columns='income_group_name')

In [9]:
# Check the encoded feature frame: column count, dtypes and memory usage
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58567 entries, 0 to 58566
Columns: 273 entries, constant_amount to Country_Income_Class
dtypes: int64(3), uint8(270)
memory usage: 16.4 MB


In [10]:
# Look at feature collinearity 
#plt.figure(figsize=(200, 200))
#sns.set_context("paper")
#sns.heatmap(X.corr(), annot=False, cmap='coolwarm', vmin=-1, vmax=1)
#plt.savefig('feature_correlation.png', bbox_inches = 'tight');

### Train/Test Split

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

### Standardize

In [12]:
# After the train/test split, standardize numerical features (e.g. constant_amount)
from sklearn.preprocessing import StandardScaler

scaled_train = X_train.copy()
scaled_test = X_test.copy()

col_name = ['constant_amount']
# constant_amount is stored as int64. Cast it to float64 explicitly so the
# scaler does not have to convert the dtype itself (which emits a
# data-conversion warning); the scaled values are unchanged.
features_train = scaled_train[col_name].astype('float64')
features_test = scaled_test[col_name].astype('float64')

# Learn mean/std from the training rows only, then apply the same
# transformation to the test rows to avoid leaking test information.
scaler = StandardScaler()
train_amount = scaler.fit_transform(features_train)
test_amount = scaler.transform(features_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  del sys.path[0]


In [13]:
# train_test_split hands back slices of X, and writing a column into such a
# slice raises pandas' SettingWithCopyWarning. `assign` builds new frames
# instead, keeping the column in its original position. The scaled values are
# flattened to 1-D so they line up with the single column being replaced.
X_train = X_train.assign(constant_amount=np.ravel(train_amount))
X_test = X_test.assign(constant_amount=np.ravel(test_amount))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
# Preview the first training rows after constant_amount has been replaced by its scaled values
X_train.head()

Unnamed: 0,constant_amount,region_name_East Asia and Oceania,region_name_Europe and Eurasia,region_name_Middle East and North Africa,region_name_South and Central Asia,region_name_Sub-Saharan Africa,region_name_Western Hemisphere,country_name_Afghanistan,country_name_Albania,country_name_Algeria,...,USG_sector_name_Rule of Law and Human Rights,USG_sector_name_Social Assistance,USG_sector_name_Social Services,USG_sector_name_Stabilization Operations and Security Sector Reform,USG_sector_name_Trade and Investment,USG_sector_name_Transnational Crime,USG_sector_name_Tuberculosis,USG_sector_name_Water Supply and Sanitation,Economic_Assistance,Country_Income_Class
43392,-0.085284,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
33192,-0.072543,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
33563,-0.040364,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,3
41565,-0.081659,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,2
17470,-0.048877,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


## Check distribution of target 

In [None]:
#histogram and normal probability plot
#from scipy.stats import norm
#sns.distplot(train['SalePrice'],fit=norm);
#fig = plt.figure()
#res = stats.probplot(train['SalePrice'], plot=plt)


## Initial OLS

In [15]:
# Create OLS model
# NOTE(review): sm.OLS does not add an intercept on its own and no constant
# column is added here -- confirm that is intended.
ols_model = sm.OLS(y_train, X_train)

# Fit OLS model to training set
fit = ols_model.fit()

# Print summary statistics of the model's performance
# (these are in-sample statistics: the model is fitted and summarized on the
# training set only).
# NOTE(review): the recorded summary reports Cond. No. 2.18e+16, which
# suggests strongly collinear columns -- worth checking the dummy encoding.
fit.summary()

0,1,2,3
Dep. Variable:,HDI_Change,R-squared:,0.153
Model:,OLS,Adj. R-squared:,0.147
Method:,Least Squares,F-statistic:,28.13
Date:,"Sat, 15 Jun 2019",Prob (F-statistic):,0.0
Time:,23:21:14,Log-Likelihood:,160770.0
No. Observations:,40996,AIC:,-321000.0
Df Residuals:,40734,BIC:,-318700.0
Df Model:,261,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant_amount,5.933e-05,2.49e-05,2.383,0.017,1.05e-05,0.000
region_name_East Asia and Oceania,0.0037,0.000,15.407,0.000,0.003,0.004
region_name_Europe and Eurasia,0.0033,0.000,13.793,0.000,0.003,0.004
region_name_Middle East and North Africa,0.0016,0.000,6.647,0.000,0.001,0.002
region_name_South and Central Asia,0.0047,0.000,20.157,0.000,0.004,0.005
region_name_Sub-Saharan Africa,0.0042,0.000,18.028,0.000,0.004,0.005
region_name_Western Hemisphere,0.0024,0.000,10.190,0.000,0.002,0.003
country_name_Afghanistan,0.0017,0.000,7.341,0.000,0.001,0.002
country_name_Albania,0.0019,0.000,6.863,0.000,0.001,0.003

0,1,2,3
Omnibus:,9953.777,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,293530.25
Skew:,-0.538,Prob(JB):,0.0
Kurtosis:,16.065,Cond. No.,2.18e+16


## LassoCV

In [28]:
# Cross-validate over a log-spaced alpha grid (10^-4 .. 10^4); LassoCV then
# refits on the full training set using the best alpha.
alphavec = np.logspace(-4, 4)

lasso_model = LassoCV(alphas=alphavec, cv=5).fit(X_train, y_train)

# Best alpha value:
# NOTE(review): the recorded result (0.0001) is the smallest alpha in the
# grid, so the search range may need to extend lower.
lasso_model.alpha_

0.0001

In [29]:
# Coefficients of the model refit with the best alpha (only constant_amount was standardized)
#list(zip(X_train.columns, lasso_model.coef_))

[('constant_amount', 0.0),
 ('region_name_East Asia and Oceania', 0.0),
 ('region_name_Europe and Eurasia', -0.0),
 ('region_name_Middle East and North Africa', -0.0008605773760942398),
 ('region_name_South and Central Asia', 0.0),
 ('region_name_Sub-Saharan Africa', 0.0),
 ('region_name_Western Hemisphere', -0.0008105026948773543),
 ('country_name_Afghanistan', 0.0),
 ('country_name_Albania', 0.0),
 ('country_name_Algeria', 0.0),
 ('country_name_Angola', 0.0),
 ('country_name_Antigua and Barbuda', -0.0),
 ('country_name_Argentina', -0.0),
 ('country_name_Armenia', -0.0),
 ('country_name_Australia', -0.0),
 ('country_name_Austria', -0.0),
 ('country_name_Azerbaijan', 0.0),
 ('country_name_Bahamas', -0.0),
 ('country_name_Bahrain', -0.0),
 ('country_name_Bangladesh', 0.0),
 ('country_name_Barbados', -0.0),
 ('country_name_Belarus', 0.0),
 ('country_name_Belgium', -0.0),
 ('country_name_Belize', -0.0),
 ('country_name_Benin', 0.0),
 ('country_name_Bhutan', 0.0),
 ('country_name_Bolivia',

In [30]:
# Predict the held-out test rows with the fitted lasso model
test_set_pred = lasso_model.predict(X_test)

# Out-of-sample R^2
r2_score(y_true=y_test, y_pred=test_set_pred)

0.02197188436593389

## RidgeCV

In [31]:
# Cross-validated ridge regression over the same alpha grid used for the lasso
ridge_model = RidgeCV(alphas=alphavec, cv=5).fit(X_train, y_train)

# Pair each feature name with its fitted ridge coefficient
[(feature, coef) for feature, coef in zip(X_train.columns, ridge_model.coef_)]

[('constant_amount', 5.910309804234231e-05),
 ('region_name_East Asia and Oceania', 0.0004412263095932537),
 ('region_name_Europe and Eurasia', 0.0002682845115812343),
 ('region_name_Middle East and North Africa', -0.0016168505607801504),
 ('region_name_South and Central Asia', 0.0012852375944536996),
 ('region_name_Sub-Saharan Africa', 0.0005077210777827811),
 ('region_name_Western Hemisphere', -0.0008856189326052169),
 ('country_name_Afghanistan', 0.0011538436250922307),
 ('country_name_Albania', 0.0015524546244642002),
 ('country_name_Algeria', 0.002854553514100526),
 ('country_name_Angola', 0.005511734597696344),
 ('country_name_Antigua and Barbuda', -0.002387951347897094),
 ('country_name_Argentina', 0.0002538355936914008),
 ('country_name_Armenia', 0.000378513682764241),
 ('country_name_Australia', -0.001635821793515826),
 ('country_name_Austria', -0.0011209129889122001),
 ('country_name_Azerbaijan', 0.0017605948431935411),
 ('country_name_Bahamas', -0.0019597004766610977),
 ('co

In [32]:
# Predict the held-out test rows with the fitted ridge model
test_set_pred = ridge_model.predict(X_test)

# Out-of-sample R^2
r2_score(y_true=y_test, y_pred=test_set_pred)

0.15397214856263697

## ElasticNetCV

## RandomForestRegressor

In [36]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [37]:
# Grid-search tree depth and forest size with 5-fold CV, scored by negative MSE.
# The base estimator is seeded with the same random_state as the final model
# so that the search, and therefore best_params, is reproducible across runs.
gsc = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                   param_grid={
                       'max_depth': range(3,7),
                       'n_estimators': (10, 50, 100, 1000)},
                   cv=5, scoring='neg_mean_squared_error',
                   verbose=0, n_jobs=-1)

# Run the search on the training set and keep the winning hyper-parameters
grid_result = gsc.fit(X_train, y_train)
best_params = grid_result.best_params_

In [39]:
# Build the forest with the tuned depth and tree count, seeded for reproducibility
rfr_model = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    verbose=False,
    random_state=42
)

# Fit on the training set (the fitted estimator is the cell's displayed output)
rfr_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=False,
           warm_start=False)

In [40]:
# Predict the held-out test rows with the fitted random forest
rfr_test_set_pred = rfr_model.predict(X_test)

# Out-of-sample R^2
r2_score(y_true=y_test, y_pred=rfr_test_set_pred)

0.08864636838179085

## XGBoost

In [19]:
#Split the data 60 - 20 - 20 train/val/test

#X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
#X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=43)

In [23]:
#### Standard Scaling

#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler

## This step fits the Standard Scaler to the training data
## Essentially it finds the mean and standard deviation of each variable in the training set

#std = StandardScaler()
#std.fit(X_train.values)

## This step applies the scaler to the train set.
## It subtracts the mean it learned in the previous step and then divides by the standard deviation

#X_tr = std.transform(X_train.values)

## Apply the scaler to the test set

#X_te = std.transform(X_test.values)