# Predict household income from satellite imagery data

First pass.

General ML pipeline steps:
1. Import data
2. Split data into test/train sets
3. Preprocess test/train sets separately
4. Generate features from data
5. For each regressor-hyperparameter combination:
    - Train regressor with given hyperparameters and training data and labels
    - Generate predicted labels for test data with trained regressor
    - Evaluate regressor-hyperparameter performance against actual test labels and get $R^2$
6. Explore best-performing models

In [None]:
import os
import math
import pickle
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt

# Import configuration file
import config as cf

# Display options 
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = -1

# Turn off big pink warnings
import warnings
warnings.filterwarnings('ignore')

# Data file path 
final_data_file_path = "/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/FinalData"
#


In [None]:
# Test grid to make sure everything works - limited models and parameters
# 'BaggingClassifier'
GRID_TEST_CLASS = {
    'regressors': ['LinearSVC','SVC', 'DecisionTreeClassifier',
                    'GradientBoostingClassifier', 'RandomForestClassifier'],
    'LinearSVC': [
        {'penalty': penalty, 'C': C, 'loss': loss, 'max_iter': max_iter,
        'random_state': 0} \
        for penalty in ('l2', ) \
        for C in (1e-2,1,2) \
        for loss in ('epsilon_insensitive','squared_hinge', ) \
        for max_iter in (1e1, )
    ],
    'SVC': [
        {'kernel': kernel, 'C': C, 'class_weight': class_weight,         
        'random_state': 0} \
        for C in (1e-2,1,2) \
        for class_weight in c(None, 'balanced',)
        for kernel in ('linear','poly','rbf','sigmoid', ) \
    ],
    'DecisionTreeClassifier': [
        {'criterion': criterion, 'splitter': splitter, 'max_depth': max_depth,
        'max_features': max_features, 'random_state': 0} \
        for criterion in ('gini', ) \
        for splitter in ('best', ) \
        for max_depth in (1,2,3,4, 5, 10, 20, 30, 50, 70, 100, ) \
        for max_features in ('sqrt', ) \
    ],
    #'BaggingClassifier': [
    #    {'n_estimators': n_estimators, 'max_features': max_features,
    #    'random_state': 0, 'n_jobs': -1} \
    #    for n_estimators in (10, 100, 1000,) \
    #    for max_features in (0.1, 0.2, 0.3,0.4, 0.5, 1.0,)
    #],
    'RandomForestClassifier': [
        {'n_estimators': n_estimators, 'criterion': criterion,
        'max_depth': max_depth, 'max_features': max_features, 'n_jobs': -1,
        'random_state': 0} \
        for n_estimators in (5, 10, 100, 1000, 5000) \
        for criterion in ('gini', ) \
        for max_depth in (1,2,3,4,5,6,7,8,9,10, ) \
        for max_features in ('sqrt','log2',None, )
    ],
    'GradientBoostingClassifier': [
        {'loss': loss, 'learning_rate': rate, 'n_estimators': n_estimators,
        'criterion': criterion, 'max_features': max_features,
        'random_state': 0} \
        for loss in ('deviance', ) \
        for rate in (1e-4, )
        for n_estimators in (100, ) \
        for criterion in ('friedman_mse', ) \
        for max_features in ('sqrt', ) \
    ]
}

## 1. Import data and drop "future" rows

In [None]:
#### Predict Changes
#DATA_PATH = os.path.join(final_data_file_path, 'BISP','Merged Datasets', 'bisp_socioeconomic_satellite_firstdiff_r13.csv')
#DATA_PATH = os.path.join('/Users/robmarty/Desktop/', 'bisp_socioeconomic_satellite_firstdiff_r13.csv')

#### Predict Levels
DATA_PATH = os.path.join(final_data_file_path, 'BISP','Merged Datasets', 'bisp_socioeconomic_satellite_panel_full_satPovNAsRemoved.csv')

df = pd.read_csv(DATA_PATH)
df.shape

In [None]:
df.head()

In [None]:
#df['pscores_poor']

In [None]:
#### Changes
#df['pscores_bin'] = df['pscores'] < 0

#### Levels
#df = df.loc[df['survey_round'] != 1]
#df['pscores_bin'] = df['pscores'] <= 16.17
df['pscores_bin'] = df['pscores_poor']

# DV as Quantiles
#df['pscores_2011'] = pd.qcut(df['pscores_2011'], 3, labels=False)
#df['pscores_2011'].value_counts()
#df['pscores_bin'] = df['pscores'] < 0

df.pscores_bin.value_counts()

In [None]:
#### Restrict to Year
df = df[df['year'] == 2013]
df.pscores_bin.value_counts()

In [None]:
# Keep Select Columns
df_viirs = df.filter(regex='viirs').filter(regex='_2km')
df_landsat = df.filter(regex='^b').filter(regex='_1km')
df_osm = df.filter(regex='fclass').filter(regex='meters')
df_facebook = df.filter(regex='au$')

df_y = df.filter(regex='^pscores_bin$')

df_all = df_y.join(df_viirs).join(df_landsat).join(df_osm).join(df_facebook)
df_all.head()

In [171]:
# Drop columns where the label is missing
#df = df.loc[~pd.isnull(df['hhinc_2011'])]

#df.shape

## 2. Split data into test/train

In [172]:
LABEL = 'pscores_bin'
TEST_SIZE = 0.2

# Separate feature sets from label sets
x_df = df_all.drop(labels=[LABEL], axis=1)
y_df = df_all[LABEL]

# Split into test and train sets for features and labels
x_train, x_test, y_train, y_test =  train_test_split(x_df, y_df, test_size=TEST_SIZE)

In [None]:
x_train.head()
x_test.head()

# check that lengths match
print(len(x_train) == len(y_train))
print(len(x_test) == len(y_test))

print(y_train.value_counts())
print(y_test.value_counts())

True
True
False    2300
True     1194
Name: pscores_bin, dtype: int64
False    615
True     259
Name: pscores_bin, dtype: int64


### Define Training Variables

In [None]:
DAY_FEATURES = df_all.filter(regex='^b', axis=1).columns.tolist()
NIGHT_FEATURES = df_all.filter(regex='viirs', axis=1).columns.tolist()
SATELLITE_FEATURES = df_all.filter(regex='^b|viirs').columns.tolist()
NONSATELLITE_FEATURES = df_all.filter(regex='dist_osm|estimate_').columns.tolist()
ALL_FEATURES = x_df.columns.tolist()

## 5. Train and Evaluate Regressors

### 5.1 Training

In [None]:
x_all = x_test.append(x_train)
y_all = y_test.append(y_train)

In [None]:
# Define a TrainedRegressor object to hold key results information
class TrainedRegressor:
    
    def __init__(self, method, params, features, regressor):
        self.method = method
        self.params = params
        self.regressor = regressor
        self.features = features
    
    def __repr__(self):
        return f'Trained {self.method} on feature set {self.features} with params {self.params}'

In [None]:
# Use GRID_MAIN for full grid search
# parameters = cf.GRID_TEST_CLASS
parameters = GRID_TEST_CLASS

results_df = pd.DataFrame()
results_df_all = pd.DataFrame()
results_df_trainedonly_all = pd.DataFrame()

x_trainedonly_all = x_all.copy()

trained_list = []
trained_list_all = []
count = 0
# print('Training model ', end='')
for i in parameters['regressors']:
    for j in parameters[i]:
        for k in ('DAY_FEATURES', 'NIGHT_FEATURES', 'ALL_FEATURES', 'SATELLITE_FEATURES', 'NONSATELLITE_FEATURES'):
        
            print(f'Model {count}: Training {i} on {k} with params {str(j)}')

            # A. Train Models --------------------------
            regressor = eval(i)(**j)
            
            trained = regressor.fit(x_train[eval(k)], y_train)
            trained_list.append(TrainedRegressor(i, str(j), k, trained))
            
            # B. Results -------------------------------------
            pred_labels = trained_list[count].regressor.predict(x_test[eval(k)])

            pred_dict = {
                'regressor': trained_list[count].method,
                'features': trained_list[count].features,
                'params': trained_list[count].params,
                'accuracy_score': accuracy_score(y_true=y_test, y_pred=pred_labels),
                'average_precision_score': average_precision_score(y_test, pred_labels),
                'recall_score': recall_score(y_test, pred_labels)
            }
    
            results_df = results_df.append(pred_dict, ignore_index=True) \
                .sort_values(by='accuracy_score', ascending=False, axis=0) \
                [['regressor', 'params', 'features', 'accuracy_score','average_precision_score',
                 'recall_score']]
        
            results_df.to_csv("/Users/robmarty/Desktop/pov_results_r13.csv")
            
            x_test['y_true'] = y_test
            x_test['y_predict_' + str(count)] = pred_labels
            #x_test.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'pov_opm_data_with_predictions_traineddatamodel_testdatapredict_r13.csv'))
            x_test.to_csv(os.path.join('/Users/robmarty/Desktop', 'pov_opm_data_with_predictions.csv'))

            
            
            
            
            
            
            
  
            # A. Train ------------------------------------
            # Initialize regressor, fit data, then append TrainedRegressor object to list
            # 1. Train Data
            #regressor = eval(i)(**j)
            #trained = regressor.fit(x_train[eval(k)], y_train)
            #trained_list.append(TrainedRegressor(i, str(j), k, trained))

            # 2. All Data
            #trained_all = trained
            #trained_list_all = trained_list

            
            #trained_all = regressor.fit(x_all[eval(k)], y_all)
            #trained_list_all.append(TrainedRegressor(i, str(j), k, trained_all))
            
            
            
            
            
            
            
            # B. Results -------------------------------------
            # 1. Trained Model on Test Data - - - - - - - - - -
            #pred_labels = trained_list[count].regressor.predict(x_test[eval(k)])

            #pred_dict = {
            #    'regressor': trained_list[count].method,
            #    'features': trained_list[count].features,
            #    'params': trained_list[count].params,
            #    'accuracy_score': accuracy_score(y_true=y_test, y_pred=pred_labels)        
            #}
    
            #results_df = results_df.append(pred_dict, ignore_index=True) \
            #    .sort_values(by='accuracy_score', ascending=False, axis=0) \
            #    [['regressor', 'params', 'features', 'accuracy_score']]
        
            #results_df.to_csv("/Users/robmarty/Desktop/pov_results_r13.csv")
            
            #x_test['y_true'] = y_test
            #x_test['y_predict_' + str(count)] = pred_labels
            #x_test.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'pov_opm_data_with_predictions_traineddatamodel_testdatapredict_r13.csv'))
            
            
            
            
            
            
            
            # 2. Trained All Model on All Data - - - - - - - - - -
            #pred_labels_all = trained_list_all[count].regressor.predict(x_all[eval(k)])

            # Append results to dataframe and sort by R^2
            #pred_dict = {
            #    'regressor': trained_list_all[count].method,
            #    'features': trained_list_all[count].features,
            #    'params': trained_list_all[count].params,
            #    'accuracy_score': accuracy_score(y_true=y_all, y_pred=pred_labels_all)        
            #}
    
            #results_df_all = results_df_all.append(pred_dict, ignore_index=True) \
            #    .sort_values(by='accuracy_score', ascending=False, axis=0) \
            #    [['regressor', 'params', 'features', 'accuracy_score']]
        
            #results_df_all.to_csv("/Users/robmarty/Desktop/pov_results_all_r13.csv")

            # ALL
            #x_trainedonly_all['y_true'] = y_all
            #x_trainedonly_all['y_predict_' + str(count)] = trained_list_all[count].regressor.predict(x_all[eval(k)])
            #x_trainedonly_all.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'pov_opm_data_with_predictions_alldatamodel_alldatapredict_r13.csv'))
            
            
            
            
            
            
            # 3. Trained Model on All Data - - - - - - - - - -
            #pred_labels_trainedonly_all = trained_list[count].regressor.predict(x_all[eval(k)])

            # Append results to dataframe and sort by R^2
            #pred_dict = {
            #    'regressor': trained_list[count].method,
            #    'features': trained_list[count].features,
            #    'params': trained_list[count].params,
            #    'accuracy_score': accuracy_score(y_true=y_all, y_pred=pred_labels_trainedonly_all)        
            #}
    
            #results_df_trainedonly_all = results_df_trainedonly_all.append(pred_dict, ignore_index=True) \
            #    .sort_values(by='accuracy_score', ascending=False, axis=0) \
            #    [['regressor', 'params', 'features', 'accuracy_score']]
        
            #results_df_trainedonly_all.to_csv("/Users/robmarty/Desktop/pov_results_trainedonly_all_r13.csv")

            # ALL
            #x_all['y_true'] = y_all
            #x_all['y_predict_' + str(count)] = trained_list[count].regressor.predict(x_all[eval(k)])
            #x_all.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'pov_opm_data_with_predictions_testdatamodel_alldatapredict_r13.csv'))

            ####
            count += 1


Model 0: Training LinearSVC on DAY_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 10.0, 'random_state': 0}
Model 1: Training LinearSVC on NIGHT_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 10.0, 'random_state': 0}
Model 2: Training LinearSVC on ALL_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 10.0, 'random_state': 0}
Model 3: Training LinearSVC on SATELLITE_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 10.0, 'random_state': 0}
Model 4: Training LinearSVC on NONSATELLITE_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 10.0, 'random_state': 0}
Model 5: Training LinearSVC on DAY_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'squared_hinge', 'max_iter': 10.0, 'random_state': 0}
Model 6: Training LinearSVC on NIGHT_FEATURES with params {'penalty': '