## Estimate Continuous Measure of Poverty for Assets

This notebook predicts several asset indices (PCA-based and additive) by running a grid search over continuous (regression) models.

## Parameters

In [1]:
# PARAMETERS
# Fraction of rows held out as the test split; passed to train_test_split(test_size=...) below.
TEST_SIZE = 0.2

## Setup

In [2]:
import os, datetime
import numpy as np
import pandas as pd
from scipy import stats
#import geopandas as gpd
import json
#import rasterio
#from rasterio.plot import show
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (BaggingRegressor, AdaBoostRegressor, 
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
#from keras.models import load_model
#from imblearn.over_sampling import RandomOverSampler

import logging, os 

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# File-system style handle for S3; used below to open the input CSV.
s3 = S3FileSystem()
# SageMaker execution role for this notebook instance.
# NOTE(review): `role` is not referenced anywhere else in the visible notebook -- confirm it is still needed.
role = get_execution_role()

# Seed NumPy's global RNG. train_test_split is called below without random_state,
# so (presumably) this seed is what makes the split repeatable on a fresh Run All.
np.random.seed(42)

### User Defined Libraries ###
# grid_params supplies GRID_REGRESS, the regressor/hyperparameter grid used by the training loop.
import grid_params as grids
#import config as cf
#import feature_extraction as fe

# S3 bucket that holds the input CSV and receives the result uploads.
bucket = 'worldbank-pakistan-data'
# Local directory where result CSVs are written before being uploaded to S3.
LOCAL_DIR = '/home/ec2-user/SageMaker/'

## Functions

In [3]:
def train_models(params, x_train, x_test, y_train, y_test, verbose=False):
    '''
    Fit every regressor/hyperparameter combination in ``params`` on the
    training split and score it on the test split.

    Input:  params - dictionary of model parameters. params['regressors'] is
                     an iterable of regressor class names, and params[name]
                     is an iterable of keyword-argument dicts for that class
            x_train, x_test - feature matrices of the train / test split
            y_train, y_test - target values of the train / test split
            verbose - if True, print a timestamped line before each fit
    Output: results_df - one row per successfully trained model with the
                         columns regressor, params, R2, MSE, Correlation
                         and model_number
            y_df - column 'y' holding y_test, plus one 'y_pred_<model_number>'
                   column per successfully trained model

    A combination that raises while being built, fitted or scored is
    reported with print() and skipped. It still consumes a model_number, so
    model numbers stay aligned with the position in the grid.
    '''
    count = 0

    # Collect rows/columns in plain containers and build the frames once at
    # the end: DataFrame.append was removed in pandas 2.0, and growing a
    # frame inside the loop is quadratic and fragments memory.
    result_rows = []
    pred_columns = {}

    # Loop over regressors and their hyperparameter combinations
    for i in params['regressors']:
        for j in params[i]:

            count += 1
            if verbose:
                print(f'{datetime.datetime.now()} Model {count}: Training {i} with params {str(j)}')
            try:
                ### Initialize regressor and fit data
                # NOTE(review): eval() turns the class-name string from the
                # grid into a class. Only safe while the grid comes from
                # trusted project code, never from external input.
                regressor = eval(i)(**j)
                trained = regressor.fit(x_train, y_train)

                ### Results
                y_pred = trained.predict(x_test)
                pearson_coef, _ = stats.pearsonr(y_test, y_pred)

                pred_dict = {
                    'regressor': i,
                    'params': j,
                    'R2': r2_score(y_test, y_pred),
                    'MSE': mean_squared_error(y_test, y_pred),
                    'Correlation': pearson_coef,
                    'model_number': count
                }
            except Exception as e:
                # One failing combination must not abort the whole grid
                # search: report it and move on to the next one.
                print(f"{datetime.datetime.now()}    ERROR: {str(e)}")
                continue

            result_rows.append(pred_dict)
            pred_columns['y_pred_' + str(count)] = y_pred

    results_df = pd.DataFrame(result_rows)

    y_df = pd.DataFrame({'y': y_test})
    if pred_columns:
        # Re-use y_df's index so the predictions line up with the true values.
        preds_df = pd.DataFrame(pred_columns, index=y_df.index)
        y_df = pd.concat([y_df, preds_df], axis=1)

    return results_df, y_df

## Load/Prep Data and Run Models

In [4]:
# Read the merged CNN-features + asset-index CSV straight from S3.
# The handle is opened in a `with` block so it is closed once pandas has
# finished parsing, instead of being left open for the life of the kernel.
with s3.open('{}/{}'.format(bucket, os.path.join('OPM', 'FinalData', 'Merged Datasets', 'cnn_merge_w_assets.csv'))) as data_file:
    df = pd.read_csv(data_file)

In [5]:
# Preview the first rows of the loaded frame (cnn_* feature columns followed by the asset-index columns).
df.head()

Unnamed: 0,cnn_pc_0_Nbands3,cnn_pc_1_Nbands3,cnn_pc_2_Nbands3,cnn_pc_3_Nbands3,cnn_pc_4_Nbands3,cnn_pc_5_Nbands3,cnn_pc_6_Nbands3,cnn_pc_7_Nbands3,cnn_pc_8_Nbands3,cnn_pc_9_Nbands3,...,asset_pca,additive_of_main,ammenit_pca,additive_of_ammenit,applian_pca,additive_of_applian,transp_pca,additive_of_transp,entert_pca,additive_of_entert
0,39.657277,316.585818,123.730639,-99.734413,74.571333,-280.209761,-37.870327,-147.286224,110.369955,-76.42115,...,0.133066,3.0,-0.098347,2.0,0.517764,1.0,-0.267899,0.0,0.563201,1.0
1,-727.501455,22.666208,-32.15075,55.803616,-80.666177,121.495729,-48.543442,-129.536411,-51.163886,1.646401,...,-0.049947,3.0,0.046258,3.0,-0.314641,0.0,-0.200183,1.0,-0.433604,0.0
2,-582.583372,254.735452,-21.474623,-114.379673,-70.889502,120.352252,-112.789128,-249.261587,-56.150562,45.430507,...,-0.238062,2.0,-0.098347,2.0,-0.314641,0.0,-0.267899,0.0,0.563201,1.0
3,-698.695464,64.260335,-23.856801,51.578066,-83.343577,126.602706,-62.152917,-138.074619,-36.396963,-8.506154,...,-0.749674,1.0,-0.693127,1.0,-0.314641,0.0,-0.267899,0.0,-0.433604,0.0
4,-85.636413,243.27527,114.789374,-41.459777,117.812657,-240.293008,27.189732,-116.371592,114.46605,-94.985057,...,0.96874,6.0,0.454189,4.0,1.071359,2.0,-0.267899,0.0,0.636058,2.0


In [None]:
import time
start = time.time()
print("started")

# Asset-index columns of df to model; one full grid search is run per target.
targets = ['asset_pca',
           'additive_of_main',
           'ammenit_pca',
           'additive_of_ammenit',
           'applian_pca',
           'additive_of_applian',
           'transp_pca',
           'additive_of_transp',
           'entert_pca',
           'additive_of_entert']

# Loop-invariant inputs, built once instead of once per target.
x = df.filter(regex='^cnn_', axis=1)      # every column whose name starts with 'cnn_'
parameters = grids.GRID_REGRESS
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

for count, target in enumerate(targets, start=1):

    print(target) # Print Status

    y = df[target]

    # NOTE(review): no random_state is passed, so the split depends on the
    # state of NumPy's global RNG at the time this cell runs.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE)

    # Normalize: fit the scaler on the training split only, then apply it to both splits
    x_scaler = StandardScaler().fit(x_train)

    x_train = x_scaler.transform(x_train)
    x_test = x_scaler.transform(x_test)

    # Train/Evaluate -------------------------------------------
    # r_df: dataframe of results. Contains "model_number" variable to match with pred_df
    # pred_df: predicted values (also contains true value). In format of y_pred_[model_number]
    r_df, pred_df = train_models(parameters, x_train, x_test, y_train, y_test, verbose=False)

    # Name of the uploaded object; identical under both S3 prefixes.
    file_name = 'results_' + str(count) + '.csv'

    r_df['target'] = target
    results_path = os.path.join(LOCAL_DIR, file_name)
    r_df.to_csv(results_path)
    # S3 keys always use '/', so join explicitly rather than with os.path.join.
    s3_bucket.Object('/'.join(['Poverty Estimation Results', 'continuous', 'individual_files', target, file_name])).upload_file(results_path)

    pred_df['target'] = target
    # Use a distinct local file so the predictions no longer overwrite the
    # local copy of the results CSV; the S3 key is unchanged.
    predictions_path = os.path.join(LOCAL_DIR, 'predictions_' + str(count) + '.csv')
    pred_df.to_csv(predictions_path)
    s3_bucket.Object('/'.join(['Poverty Estimation Results', 'continuous', 'predicted_values', target, file_name])).upload_file(predictions_path)

end = time.time()
print('ended')
print("The time elapsed from start to end was", end - start)

started
asset_pca


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  po

In [None]:
# NOTE(review): looks like a leftover scratch cell; nothing else in the visible notebook reads `test`.
test = 2
2