# Poverty Estimation: Within-Country Cross-Validated Regression

## Setup

In [30]:
## Libraries
import os, datetime
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#import rasterio
#from rasterio.plot import show
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import Ridge

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier,
                              AdaBoostRegressor,
                              GradientBoostingClassifier, RandomForestClassifier,
                              RandomForestRegressor,
                             BaggingRegressor, GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report,
                            r2_score, mean_absolute_error, mean_squared_error)

from joblib import dump, load

import logging, os 

import grid_params as grids

In [31]:
## Directories & Parameters
# Root folder of the project data (absolute, machine-specific path).
data_dir = "/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data"

# Survey whose sub-folder of data_dir is read from and written to.
SURVEY_NAME = "DHS"

# Folder receiving the result CSVs: <data_dir>/<SURVEY_NAME>/FinalData/results
OUT_DIR = os.path.join(os.path.join(data_dir, SURVEY_NAME, "FinalData"), "results")

## Load Data

In [32]:
# Read the merged survey dataset stored under
# <data_dir>/<SURVEY_NAME>/FinalData/Merged Datasets/.
df = pd.read_csv(
    os.path.join(
        data_dir, SURVEY_NAME, "FinalData", "Merged Datasets", "survey_alldata.csv"
    )
)

In [33]:
# Drop every row whose country_code is 'IA' or 'TL'.
df = df[~df.country_code.isin(['IA', 'TL'])]

In [43]:
#df_traintest.loc[:, (target)]
#df['country_code']

## Functions

In [50]:
# Regex selecting the feature columns (by column-name prefix) for each
# supported feature_type.
_FEATURE_REGEX = {
    'fb': '^fb_',
    'osm': '^osm_',
    'l8': '^l8_',
    'l8_viirs': '^l8_|^viirs_',
    'all': '^fb_|^l8_|^viirs_|^osm_|^worldpop_',
}


def trainmodel_valresult(df, country, est_type, target, parameters, feature_type):
    """Cross-validate regressors for one country, target and feature set.

    For every regressor / parameter combination in ``parameters`` the model is
    trained on all but one value of ``within_country_fold`` and evaluated on
    the held-out fold. Features are standardized with a scaler fitted on the
    training folds only. After every parameter set, the accumulated results
    and predictions are written (overwriting) to CSV files under
    ``OUT_DIR/country_withincv``.

    Args:
        df: DataFrame with columns ``uid``, ``country_code``,
            ``within_country_fold``, ``target`` and the feature columns.
        country: Value of ``country_code`` to keep.
        est_type: Estimation type; only ``'within_country_cv'`` is supported.
        target: Name of the dependent-variable column to predict.
        parameters: Dict where ``parameters['regressors']`` lists regressor
            class names and ``parameters[name]`` is a list of keyword-argument
            dicts to try for that regressor.
        feature_type: One of ``'fb'``, ``'osm'``, ``'l8'``, ``'l8_viirs'``,
            ``'all'``; selects feature columns by name prefix.

    Returns:
        Tuple ``(results_df, y_df)``: ``results_df`` has one row per
        regressor/parameter set with one ``r2_score_<fold>`` column per fold;
        ``y_df`` holds the true values plus one ``y_<model_i>`` column of
        out-of-fold predictions per model.

    Raises:
        ValueError: If ``est_type`` or ``feature_type`` is not supported.
    """
    # Fail with a clear message instead of a NameError further down.
    if est_type != 'within_country_cv':
        raise ValueError("Unsupported est_type: %r (expected 'within_country_cv')"
                         % (est_type,))
    if feature_type not in _FEATURE_REGEX:
        raise ValueError("Unsupported feature_type: %r (expected one of %s)"
                         % (feature_type, sorted(_FEATURE_REGEX)))

    feature_regex = _FEATURE_REGEX[feature_type]
    df_traintest = df[df.country_code == country].reset_index(drop=True)

    # Output locations. NOTE: the '_fbonly' suffix is kept for every
    # feature_type so that existing output file names do not change.
    out_dir = os.path.join(OUT_DIR, 'country_withincv')
    os.makedirs(out_dir, exist_ok=True)
    file_suffix = (country + '_' + est_type + '_' + feature_type + '_'
                   + target + '_fbonly.csv')
    pred_path = os.path.join(out_dir, 'withincv_predicted_values_' + file_suffix)
    results_path = os.path.join(out_dir, 'withincv_results_' + file_suffix)

    # True values; prediction columns are merged in per model below.
    y_df = df_traintest[['uid', 'country_code']].copy()
    y_df['y'] = df_traintest[target]
    y_df['target'] = target
    y_df['feature_type'] = feature_type

    # The splits and the scaling do not depend on the regressor, so prepare
    # them once instead of once per model.
    folds = []
    for split_id in df_traintest.within_country_fold.unique():
        is_test = df_traintest['within_country_fold'] == split_id
        df_train = df_traintest[~is_test]
        df_test = df_traintest[is_test]

        x_train = df_train.filter(regex=feature_regex, axis=1)
        x_test = df_test.filter(regex=feature_regex, axis=1)

        # Fit the scaler on the training folds only to avoid leaking
        # test-fold statistics.
        x_scaler = StandardScaler().fit(x_train)
        folds.append((x_scaler.transform(x_train), df_train[target],
                      x_scaler.transform(x_test), df_test[target],
                      df_test['uid'].to_numpy()))

    results_rows = []            # one dict per regressor/parameter set
    results_df = pd.DataFrame()  # returned as-is when there is nothing to fit

    model_i = 0
    for regressor_name in parameters['regressors']:
        for params in parameters[regressor_name]:

            pred_dict = {
                'regressor': regressor_name,
                'params': params,
                'country': country,
                'est_type': est_type,
                'target': target,
                'model_i': model_i,
                'feature_type': feature_type
            }
            pred_col = 'y_' + str(model_i)

            # SECURITY NOTE(review): eval() resolves the class name against
            # this module's namespace; `parameters` must come from a trusted
            # source (never from external input).
            regressor_cls = eval(regressor_name)

            fold_preds = []
            for fold, (x_train, y_train, x_test, y_test, uid_test) in enumerate(folds):
                trained = regressor_cls(**params).fit(x_train, y_train)
                y_pred = trained.predict(x_test)

                pred_dict['r2_score_' + str(fold)] = r2_score(y_test, y_pred)
                fold_preds.append(pd.DataFrame({'uid': uid_test, pred_col: y_pred}))

            # DataFrame.append was removed in pandas 2.0; concatenate instead.
            y_df_parami = pd.concat(fold_preds, ignore_index=True)

            # Checkpoint after every model so partial runs leave usable files.
            y_df = y_df.merge(y_df_parami, on='uid', how='right')
            y_df.to_csv(pred_path)

            results_rows.append(pred_dict)
            results_df = pd.DataFrame(results_rows)
            results_df.to_csv(results_path)

            model_i += 1

    return results_df, y_df

In [51]:
# Regressor / hyper-parameter grid passed to trainmodel_valresult, which reads
# parameters['regressors'] and parameters[<regressor name>].
parameters = grids.GRID_REGRESS

In [52]:
# Run the within-country cross-validation for every combination of feature
# set, estimation type, country and target, collecting the per-run outputs.
# Frames are gathered in lists and concatenated once at the end because
# DataFrame.append was removed in pandas 2.0 (and copied all rows each call).
results_frames = []
y_frames = []

for feature_type_i in ['fb', 'osm', 'l8', 'l8_viirs', 'all']:
    for est_type_i in ['within_country_cv']:
        for cc_i in df.country_code.unique():
            for target in ['asset_pca_1', 'wealth_index_score']:
                print(feature_type_i + ' // ' + est_type_i + ' // ' + cc_i + ' // ' + target)

                results_df_i, y_df_i = trainmodel_valresult(df, cc_i, est_type_i, target, parameters, feature_type_i)

                results_frames.append(results_df_i)
                y_frames.append(y_df_i)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_all_df = pd.concat(results_frames, ignore_index=True) if results_frames else pd.DataFrame()
y_all_df = pd.concat(y_frames, ignore_index=True) if y_frames else pd.DataFrame()

fb // within_country_cv // BD // asset_pca_1
fb // within_country_cv // BD // wealth_index_score
fb // within_country_cv // KH // asset_pca_1
fb // within_country_cv // KH // wealth_index_score
fb // within_country_cv // KY // asset_pca_1
fb // within_country_cv // KY // wealth_index_score
fb // within_country_cv // MM // asset_pca_1
fb // within_country_cv // MM // wealth_index_score
fb // within_country_cv // NP // asset_pca_1
fb // within_country_cv // NP // wealth_index_score
fb // within_country_cv // PH // asset_pca_1
fb // within_country_cv // PH // wealth_index_score
fb // within_country_cv // PK // asset_pca_1
fb // within_country_cv // PK // wealth_index_score
fb // within_country_cv // TJ // asset_pca_1
fb // within_country_cv // TJ // wealth_index_score
osm // within_country_cv // BD // asset_pca_1
osm // within_country_cv // BD // wealth_index_score
osm // within_country_cv // KH // asset_pca_1
osm // within_country_cv // KH // wealth_index_score
osm // within_country_cv /

In [53]:
# Write the combined results and the combined predictions to OUT_DIR.
for _frame, _fname in ((results_all_df, 'results_fbonly_withincv.csv'),
                       (y_all_df, 'ypred_fbonly_withincv.csv')):
    _frame.to_csv(os.path.join(OUT_DIR, _fname))