# Predict household income from satellite imagery data

First pass.

General ML pipeline steps:
1. Import data
2. Split data into test/train sets
3. Preprocess test/train sets separately
4. Generate features from data
5. For each regressor-hyperparameter combination:
    - Train regressor with given hyperparameters and training data and labels
    - Generate predicted labels for test data with trained regressor
    - Evaluate regressor-hyperparameter performance against actual test labels and get $R^2$
6. Explore best-performing models

In [1]:
import os
import math
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score

# Import configuration file
import config as cf

# Display options 
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = -1

# Turn off big pink warnings
import warnings
warnings.filterwarnings('ignore')

## 1. Import data and drop "future" rows

In [2]:
DATA_PATH = os.path.join('..', '..', '..', 'Data', 'FinalData', 'BISP', 'bisp_sat_inc_data.csv')
df = pd.read_csv(DATA_PATH)
df.shape

(5416, 39)

In [3]:
df.head()

Unnamed: 0,uid,viirs_2012,viirs_2013,viirs_2014,viirs_2015,viirs_2016,viirs_2017,viirs_2018,dmspols_1992,dmspols_1993,dmspols_1994,dmspols_1995,dmspols_1996,dmspols_1997,dmspols_1998,dmspols_1999,dmspols_2000,dmspols_2001,dmspols_2002,dmspols_2003,dmspols_2004,dmspols_2005,dmspols_2006,dmspols_2007,dmspols_2008,dmspols_2009,dmspols_2010,dmspols_2011,dmspols_2012,dmspols_2013,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,hhinc_2011,hhinc_2013
0,100389,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,9000.0,73000.0
1,100401,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,75000.0,159000.0
2,100581,1.824753,1.937131,1.875487,2.04754,2.557241,3.198625,3.286,43.0,32.5,34.25,43.0,38.0,31.75,38.25,38.75,36.0,38.25,37.75,32.0,32.75,33.75,40.0,43.75,42.5,30.0,45.5,30.5,47.5,44.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,48000.0,0.0
3,101101,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,31200.0,219000.0
4,101236,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,14000.0,


In [4]:
# Keep only 2011 columns, but include viirs_2012
df = df.filter(regex='_2011', axis=1).join(df['viirs_2012'])
df.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,hhinc_2011,viirs_2012
0,32.666667,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,9000.0,2.052018
1,32.666667,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,75000.0,1.964332
2,30.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,48000.0,1.824753
3,32.666667,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,31200.0,1.964332
4,32.666667,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,14000.0,2.052018


In [5]:
# Drop columns where the label is missing
df = df.loc[~pd.isnull(df['hhinc_2011'])]

df.shape

(4875, 10)

## 2. Split data into test/train

In [6]:
LABEL = 'hhinc_2011'
TEST_SIZE = 0.3

# Separate feature sets from label sets
x_df = df.drop(labels=[LABEL], axis=1)
y_df = df[LABEL]

# Split into test and train sets for features and labels
x_train, x_test, y_train, y_test =  train_test_split(x_df, y_df, test_size=TEST_SIZE)

## 3. Preprocess data

All vars are numeric - impute missing data with mean

In [7]:
# Check how many rows are missing across columns
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011    41
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      41
dtype: int64

TEST FEATURES MISSING:
dmspols_2011    11
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      11
dtype: int64


In [8]:
for i in (x_train, x_test):
    for j in i.columns:
        
        if i[j].isnull().sum():
            # Create imputed flag
            new_name = i[j].name + '_imputed'
            i[new_name] = pd.isnull(i[j]).astype('int')
            # Fill with mean
            i[j] = i[j].fillna(i[j].mean())
        else:
            continue

In [9]:
# All missing values were imputed
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64

TEST FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64


## 4. Feature Generation

[Landsat 7 specs](https://landsat.usgs.gov/sites/default/files/documents/si_product_guide.pdf#page=14)

Create indices from every possible pair of Landsat 7 band.
- Normalized Difference Vegetation Index, NDVI = $\frac{NIR - Red}{NIR + Red}$ is formed from the (NIR, Red) pair.
- Normalized Difference Built-up Index, NDBI = $\frac{SWIR1 - NIR}{SWIR1 + NIR}$ is formed from the (NIR, SWIR1) pair.
- Normalized Difference Water Index, NDWO = $\frac{NIR - SWIR1}{NIR + SWIR1}$ is also formed from the (NIR, SWIR1) pair.
- Modified NDWI, MNDWI = $\frac{Green - SWIR1}{Green + SWIR1}$ is formed from the (NIR, Green) pair. And so on.


| Band | 1 | 2 | 3 | 4 | 5 | 6 | 7
| ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- 
| 1 | NA 
| 2 | ? | NA 
| 3 | ? | ? | NA 
| 4 | ? | ? | NDVI | NA
| 5 | ? | MNDWI | ? | NDBI, NDWI | NA 
| 6 | ? | ? | ? | ? | ? | NA 
| 7 | ? | ? | ? | ? | ? | ? | NA



In [10]:
# Create ratios 
# Note that ratio of Band A to Band B is the same as ratio of Band B to Band A
# Solution: only create ratios where A < B
for df in (x_train, x_test):
    for i in range(1, 8):
        for j in range(1, 8):

            if i >= j:
                continue
            else:
                band1 = f'l7_2011_{i}'
                band2 = f'l7_2011_{j}'
                new_var = f'ratio_{i}_{j}'
                df[new_var] = abs((df[band1] - df[band2]) / (df[band1] + df[band2]))

In [11]:
x_train.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,dmspols_2011_imputed,viirs_2012_imputed,ratio_1_2,ratio_1_3,ratio_1_4,ratio_1_5,ratio_1_6,ratio_1_7,ratio_2_3,ratio_2_4,ratio_2_5,ratio_2_6,ratio_2_7,ratio_3_4,ratio_3_5,ratio_3_6,ratio_3_7,ratio_4_5,ratio_4_6,ratio_4_7,ratio_5_6,ratio_5_7,ratio_6_7
3179,8.25,909.742472,1177.802837,1311.173133,2281.040388,1898.027215,3015.763897,1340.814129,0.345797,0,0,0.128409,0.18075,0.429768,0.351982,0.536497,0.19154,0.053584,0.318961,0.234156,0.438281,0.064723,0.269992,0.182866,0.393949,0.011177,0.09165,0.138711,0.259598,0.227469,0.172041,0.384465
4258,0.0,1324.690862,1839.935512,2243.79627,2840.146183,3263.665269,3031.526605,2855.312608,0.363278,0,0,0.162814,0.257562,0.363869,0.422586,0.391816,0.366177,0.098895,0.213716,0.278966,0.244606,0.216256,0.117301,0.18518,0.149324,0.119926,0.069386,0.032594,0.002663,0.036876,0.066735,0.029934
5230,38.666667,1280.289901,1676.507835,1975.053976,2627.297156,2910.663233,3049.879861,2404.171793,4.791777,0,0,0.134002,0.213423,0.344716,0.389022,0.408665,0.305033,0.081758,0.220918,0.269045,0.290575,0.178319,0.14172,0.191499,0.213899,0.097989,0.051168,0.074435,0.044346,0.023356,0.095298,0.118391
3322,15.333333,1184.763776,1567.776102,1862.128625,2940.840487,2513.584252,3057.37558,1998.246375,0.912215,0,0,0.139149,0.222313,0.425653,0.359301,0.441431,0.25557,0.085819,0.304542,0.231738,0.322065,0.120714,0.224593,0.14888,0.242961,0.03526,0.078332,0.019428,0.190844,0.097612,0.114219,0.209495
1112,7.666667,840.145283,1135.340493,1331.007547,2592.282293,2281.683454,3031.58418,1776.47402,0.276674,0,0,0.149429,0.226084,0.510466,0.461761,0.56601,0.357839,0.079335,0.39085,0.33548,0.45507,0.220184,0.321484,0.263149,0.389809,0.143353,0.063726,0.078114,0.186737,0.141137,0.124492,0.261043


In [12]:
x_test.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,dmspols_2011_imputed,viirs_2012_imputed,ratio_1_2,ratio_1_3,ratio_1_4,ratio_1_5,ratio_1_6,ratio_1_7,ratio_2_3,ratio_2_4,ratio_2_5,ratio_2_6,ratio_2_7,ratio_3_4,ratio_3_5,ratio_3_6,ratio_3_7,ratio_4_5,ratio_4_6,ratio_4_7,ratio_5_6,ratio_5_7,ratio_6_7
4613,19.5,843.244564,1117.562192,1188.872427,2491.434474,2205.066541,2998.034938,1598.18991,2.455506,0,0,0.1399,0.170083,0.494257,0.446746,0.560956,0.309222,0.030918,0.38068,0.327302,0.456914,0.176978,0.353928,0.299414,0.4321,0.146863,0.060975,0.092286,0.218417,0.152403,0.159568,0.304564
4169,31.333333,891.490983,1163.404305,1387.957824,2443.310646,2269.944154,3011.784177,1785.646306,7.009732,0,0,0.132325,0.217801,0.465341,0.436021,0.543209,0.333997,0.088013,0.354868,0.322292,0.442706,0.210997,0.275458,0.241118,0.369073,0.125311,0.036783,0.10421,0.155515,0.140454,0.119415,0.255582
2191,4.666667,1154.946506,1723.827486,2071.125254,2482.552769,2804.201653,3087.878081,2512.388953,0.21482,0,0,0.197612,0.283992,0.364978,0.416568,0.455577,0.370144,0.091516,0.180375,0.238597,0.283486,0.186148,0.090351,0.150365,0.197083,0.096272,0.060841,0.108668,0.005973,0.048145,0.054887,0.102761
1180,29.5,813.534897,1076.892557,1210.209528,2470.776281,2124.422531,3007.845641,1587.72227,1.600409,0,0,0.139311,0.19601,0.504593,0.44619,0.574219,0.322409,0.058291,0.392901,0.327219,0.472724,0.191709,0.342454,0.274157,0.426177,0.134926,0.075373,0.09803,0.217581,0.172131,0.14458,0.30902
2045,12.5,731.582077,992.264211,1079.259716,2687.091357,1941.664298,2999.478683,1195.915168,0.318954,0,0,0.151221,0.191998,0.572008,0.452664,0.607842,0.240899,0.041996,0.460632,0.323593,0.502842,0.093069,0.426894,0.285477,0.470787,0.051273,0.161043,0.054934,0.384026,0.214083,0.237683,0.429891


In [13]:
# check that lengths match
print(len(x_train) == len(y_train))
print(len(x_test) == len(y_test))

True
True


### 4.1 Define feature groups

1. Daytime-only: Landsat 7 band data and computed indices
2. Nighttime-only: DMSP and VIIRS data + imputed flags
3. All features

In [27]:
DAY_FEATURES = df.filter(regex='l7|ratio', axis=1).columns.tolist()
NIGHT_FEATURES = ['dmspols_2011', 'viirs_2012', 'dmspols_2011_imputed', 'viirs_2012_imputed']
ALL_FEATURES = df.columns.tolist()

print("Day-only:", DAY_FEATURES)
print("-----")
print("Night-only:", NIGHT_FEATURES)

Day-only: ['l7_2011_1', 'l7_2011_2', 'l7_2011_3', 'l7_2011_4', 'l7_2011_5', 'l7_2011_6', 'l7_2011_7', 'ratio_1_2', 'ratio_1_3', 'ratio_1_4', 'ratio_1_5', 'ratio_1_6', 'ratio_1_7', 'ratio_2_3', 'ratio_2_4', 'ratio_2_5', 'ratio_2_6', 'ratio_2_7', 'ratio_3_4', 'ratio_3_5', 'ratio_3_6', 'ratio_3_7', 'ratio_4_5', 'ratio_4_6', 'ratio_4_7', 'ratio_5_6', 'ratio_5_7', 'ratio_6_7']
-----
Night-only: ['dmspols_2011', 'viirs_2012', 'dmspols_2011_imputed', 'viirs_2012_imputed']


## 5. Train and Evaluate Regressors

### 5.1 Training

In [25]:
# Define a TrainedRegressor object to hold key results information
class TrainedRegressor:
    
    def __init__(self, method, params, features, regressor):
        self.method = method
        self.params = params
        self.regressor = regressor
        self.features = features
    
    def __repr__(self):
        return f'Trained {self.method} on feature set {self.features} with params {self.params}'

In [39]:
# Use GRID_MAIN for full grid search
parameters = cf.GRID_TEST

trained_list = []
count = 0
# print('Training model ', end='')
for i in parameters['regressors']:
    for j in parameters[i]:
        for k in ('DAY_FEATURES', 'NIGHT_FEATURES', 'ALL_FEATURES'):
        
            #print(str(count), end=' ')
            count += 1
            print(f'Model {count}: Training {i} on {k} with params {str(j)}')

            # Initialize regressor, fit data, then append TrainedRegressor object to list
            regressor = eval(i)(**j)
            trained = regressor.fit(x_train[eval(k)], y_train)
            trained_list.append(TrainedRegressor(i, str(j), k, trained))


Model 1: Training LinearRegression on DAY_FEATURES with params {'n_jobs': -1}
Model 2: Training LinearRegression on NIGHT_FEATURES with params {'n_jobs': -1}
Model 3: Training LinearRegression on ALL_FEATURES with params {'n_jobs': -1}
Model 4: Training Lasso on DAY_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'selection': 'random', 'random_state': 0}
Model 5: Training Lasso on NIGHT_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'selection': 'random', 'random_state': 0}
Model 6: Training Lasso on ALL_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'selection': 'random', 'random_state': 0}
Model 7: Training Ridge on DAY_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'solver': 'cholesky', 'random_state': 0}
Model 8: Training Ridge on NIGHT_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'solver': 'cholesky', 'random_state': 0}
Model 9: Training Ridge on ALL_FEATURES with params {'alpha': 0.01, 'max_iter': 1000.0, 'solver': 'cholesky', 'r

In [30]:
len(trained_list)

24

### 5.2 Prediction and Evaluation

In [37]:
results_df = pd.DataFrame()
for i in trained_list:
    
    # Get predicted results from test data
    features = eval(i.features)
    pred_labels = i.regressor.predict(x_test[features])
    
    # Append results to dataframe and sort by R^2
    pred_dict = {
        'regressor': i.method,
        'features': i.features,
        'params': i.params,
        'r2': r2_score(y_true=y_test, y_pred=pred_labels)        
    }
    
    results_df = results_df.append(pred_dict, ignore_index=True) \
        .sort_values(by='r2', ascending=False, axis=0) \
        [['regressor', 'params', 'features', 'r2']]

results_df.shape

(24, 4)

In [38]:
results_df

Unnamed: 0,regressor,params,features,r2
0,DecisionTreeRegressor,"{'criterion': 'mse', 'splitter': 'best', 'max_depth': 1, 'max_features': 'sqrt', 'random_state': 0}",ALL_FEATURES,-0.001394
1,GradientBoostingRegressor,"{'loss': 'ls', 'learning_rate': 0.0001, 'n_estimators': 100, 'criterion': 'mse', 'max_features': 'sqrt', 'random_state': 0}",NIGHT_FEATURES,-0.00155
2,GradientBoostingRegressor,"{'loss': 'ls', 'learning_rate': 0.0001, 'n_estimators': 100, 'criterion': 'mse', 'max_features': 'sqrt', 'random_state': 0}",ALL_FEATURES,-0.001606
3,GradientBoostingRegressor,"{'loss': 'ls', 'learning_rate': 0.0001, 'n_estimators': 100, 'criterion': 'mse', 'max_features': 'sqrt', 'random_state': 0}",DAY_FEATURES,-0.001608
4,Ridge,"{'alpha': 0.01, 'max_iter': 1000.0, 'solver': 'cholesky', 'random_state': 0}",NIGHT_FEATURES,-0.003138
5,Lasso,"{'alpha': 0.01, 'max_iter': 1000.0, 'selection': 'random', 'random_state': 0}",NIGHT_FEATURES,-0.003138
6,LinearRegression,{'n_jobs': -1},NIGHT_FEATURES,-0.003138
7,RandomForestRegressor,"{'n_estimators': 100, 'criterion': 'mse', 'max_depth': 1, 'max_features': 'sqrt', 'n_jobs': -1, 'random_state': 0}",NIGHT_FEATURES,-0.003434
23,RandomForestRegressor,"{'n_estimators': 100, 'criterion': 'mse', 'max_depth': 1, 'max_features': 'sqrt', 'n_jobs': -1, 'random_state': 0}",ALL_FEATURES,-0.00441
8,DecisionTreeRegressor,"{'criterion': 'mse', 'splitter': 'best', 'max_depth': 1, 'max_features': 'sqrt', 'random_state': 0}",NIGHT_FEATURES,-0.004975
