## Estimate Binary Measure of Poverty

## Parameters

In [61]:
# PARAMETERS
# Fraction of the rows held out as the test set (passed to train_test_split as test_size)
TEST_SIZE = 0.2

## Setup

In [62]:
import os, datetime
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#import rasterio
#from rasterio.plot import show
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import Ridge

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report)
#from keras.models import load_model
#from imblearn.over_sampling import RandomOverSampler

import logging, os 

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# Filesystem-style S3 handle; used further down to open the merged CSV in the bucket
s3 = S3FileSystem()
# SageMaker execution role for this session (not referenced again in the visible cells)
role = get_execution_role()

# Seed NumPy's global RNG. train_test_split is later called without random_state,
# so it presumably draws from this global state -- the split is then repeatable
# only when the notebook is run top to bottom (TODO confirm).
np.random.seed(42)

### User Defined Libraries ###
# grid_params supplies the model/hyperparameter grid (read as grids.GRID_CLASS)
import grid_params as grids
#import config as cf
#import feature_extraction as fe

# S3 bucket the input CSV is read from and the result files are uploaded to
bucket = 'worldbank-pakistan-data'
# Local directory where result CSVs are written before being uploaded
LOCAL_DIR = '/home/ec2-user/SageMaker/'

## Functions

In [63]:
def train_models(params, x_train, x_test, y_train, y_test, verbose=False):
    '''
    Fits every model / hyperparameter combination in a grid on the training
    data and scores each fitted model on the test data.

    A combination that raises during construction, fitting, prediction or
    scoring is reported on stdout and skipped; the remaining combinations
    are still trained.

    Input:  params - dictionary describing the grid. params['regressors'] is
                     a list of estimator class names (strings) and
                     params[<name>] is a list of keyword-argument
                     dictionaries, one per model to train
            x_train - training features
            x_test - test features
            y_train - training labels
            y_test - test labels (compared against 1 and 0 for class counts)
            verbose - if True, print a timestamped line before each fit
    Output: results_df - one row per successfully trained model: regressor
                     name, params, accuracy/recall/precision, test-set class
                     counts and 'model_number'
            y_df - true test labels in column 'y' plus one
                     'y_pred_<model_number>' column per trained model
    '''
    count = 0

    # Rows are collected in a list and turned into a dataframe once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    result_rows = []
    y_df = pd.DataFrame({'y': y_test})

    # The class balance of the test labels is the same for every model
    y_true = np.asarray(y_test)
    n_truth_1 = int(np.sum(y_true == 1))
    n_truth_0 = int(np.sum(y_true == 0))

    # Loop over models and hyperparameter combinations
    for i in params['regressors']:
        for j in params[i]:

            # model_number counts every attempt, including failed ones
            count += 1
            if verbose:
                print(f'{datetime.datetime.now()} Model {count}: Training {i} with params {str(j)}')
            try:
                ### Initialize regressor and fit data
                # NOTE(review): eval() resolves the estimator class from its
                # name and would execute arbitrary expressions; only pass
                # grids that come from trusted project code.
                regressor = eval(i)(**j)
                trained = regressor.fit(x_train, y_train)

                ### Results
                y_pred = trained.predict(x_test)

                pred_dict = {
                    'regressor': i,
                    'params': j,
                    'accuracy_score': accuracy_score(y_test, y_pred),
                    'recall_score': recall_score(y_test, y_pred),
                    'precision_score': precision_score(y_test, y_pred),
                    'y_truth_1': n_truth_1,
                    'y_truth_0': n_truth_0,
                    'model_number': count
                }

            except Exception as e:
                # Best effort: report the failure and move on to the next
                # combination instead of aborting the whole grid.
                print(f"{datetime.datetime.now()}    ERROR: {str(e)}")

            else:
                result_rows.append(pred_dict)
                y_df['y_pred_' + str(count)] = y_pred

    results_df = pd.DataFrame(result_rows)

    return results_df, y_df

## Load/Prep Data and Run Models

In [64]:
# Load the merged survey + CNN-feature table from S3. The key is spelled with
# '/' because S3 keys are not OS paths, and the context manager closes the
# remote file handle as soon as the CSV has been parsed.
with s3.open('{}/{}'.format(bucket, 'OPM/FinalData/Merged Datasets/cnn_merge.csv')) as cnn_file:
    df = pd.read_csv(cnn_file)

In [65]:
# Preview the first rows of the loaded table as a sanity check
df.head()

Unnamed: 0,uid,period,year,province,psu,locality,treatment,panel,present11,present13,...,cnn_feat_90,cnn_feat_91,cnn_feat_92,cnn_feat_93,cnn_feat_94,cnn_feat_95,cnn_feat_96,cnn_feat_97,cnn_feat_98,cnn_feat_99
0,100389,2,2014,1,1,1,0,0,1,1,...,144.31438,220.20631,0.0,0.0,234.24104,309.0947,418.9084,0.0,0.0,35.905373
1,100401,2,2014,1,1,1,0,0,1,1,...,108.98802,0.0,31.65086,0.0,0.0,78.17691,69.51185,0.0,0.0,0.0
2,100581,2,2014,1,1,1,0,0,1,1,...,216.65073,0.0,0.0,0.0,0.0,70.73168,59.625446,0.0,0.0,0.0
3,101101,2,2014,1,1,1,0,1,1,1,...,129.46504,0.0,7.424894,0.0,0.0,77.71559,67.34999,0.0,0.0,0.0
4,101236,2,2014,1,1,1,0,0,1,0,...,125.643074,181.2686,0.0,0.0,200.46274,283.18677,425.39166,0.0,0.0,41.898094


In [66]:
# Binary targets derived from the 'pscores' column:
#   pscores_poor     - True when the score is at or below the fixed cutoff 16.17
#   pscores_poor_med - True when the score is at or below the sample median
scores = df['pscores']
df['pscores_poor'] = scores.le(16.17)
df['pscores_poor_med'] = scores.le(scores.median())

# Show the class balance of the fixed-cutoff target
df['pscores_poor'].value_counts()

False    2335
True     1037
Name: pscores_poor, dtype: int64

In [67]:
# Train and evaluate the full model grid once per binary poverty target, then
# save the metrics and the per-model predictions locally and upload both to S3.

# These do not depend on the target, so build them once outside the loop.
x = df.filter(regex='^cnn_', axis=1)   # every column whose name starts with 'cnn_'
parameters = grids.GRID_CLASS
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_root = 'Poverty Estimation Results/binary_classification'  # S3 keys use '/'

for count, target in enumerate(['pscores_poor', 'pscores_poor_med'], start=1):

    print(target) # Print Status

    y = df[target]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE)

    # Normalize: fit the scaler on the training rows only, then apply to both
    x_scaler = StandardScaler().fit(x_train)

    x_train = x_scaler.transform(x_train)
    x_test = x_scaler.transform(x_test)

    # Train/Evaluate -------------------------------------------

    # r_df: dataframe of results. Contains "model_number" variable to match with pred_df
    # pred_df: predicted values (also contains true value). In format of y_pred_[model_number]
    r_df, pred_df = train_models(parameters, x_train, x_test, y_train, y_test, verbose=False)

    # Both uploads keep the same object name under different S3 folders
    file_name = 'results_' + str(count) + '.csv'

    r_df['target'] = target
    results_path = os.path.join(LOCAL_DIR, file_name)
    r_df.to_csv(results_path)
    s3_bucket.Object(s3_root + '/individual_files/' + file_name).upload_file(results_path)

    # Predictions get their own local file so they no longer overwrite the
    # local copy of the metrics written just above.
    pred_df['target'] = target
    predictions_path = os.path.join(LOCAL_DIR, 'predictions_' + str(count) + '.csv')
    pred_df.to_csv(predictions_path)
    s3_bucket.Object(s3_root + '/predicted_values/' + file_name).upload_file(predictions_path)

pscores_poor


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


pscores_poor_med


  _warn_prf(average, modifier, msg_start, len(result))
