# Predict household income from satellite imagery data

First pass.

In [1]:
import os
import math
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

# Display options 
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = -1

## 1. Import data and drop "future" rows

In [2]:
DATA_PATH = os.path.join('..', '..', '..', 'Data', 'FinalData', 'BISP', 'bisp_sat_inc_data.csv')
df = pd.read_csv(DATA_PATH)
df.shape

(5416, 39)

In [3]:
df.head()

Unnamed: 0,uid,viirs_2012,viirs_2013,viirs_2014,viirs_2015,viirs_2016,viirs_2017,viirs_2018,dmspols_1992,dmspols_1993,dmspols_1994,dmspols_1995,dmspols_1996,dmspols_1997,dmspols_1998,dmspols_1999,dmspols_2000,dmspols_2001,dmspols_2002,dmspols_2003,dmspols_2004,dmspols_2005,dmspols_2006,dmspols_2007,dmspols_2008,dmspols_2009,dmspols_2010,dmspols_2011,dmspols_2012,dmspols_2013,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,hhinc_2011,hhinc_2013
0,100389,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,9000.0,73000.0
1,100401,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,75000.0,159000.0
2,100581,1.824753,1.937131,1.875487,2.04754,2.557241,3.198625,3.286,43.0,32.5,34.25,43.0,38.0,31.75,38.25,38.75,36.0,38.25,37.75,32.0,32.75,33.75,40.0,43.75,42.5,30.0,45.5,30.5,47.5,44.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,48000.0,0.0
3,101101,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,31200.0,219000.0
4,101236,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,14000.0,


In [4]:
# Keep only 2011 columns, but include viirs_2012
df = df.filter(regex='_2011', axis=1).join(df['viirs_2012'])
df.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,hhinc_2011,viirs_2012
0,32.666667,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,9000.0,2.052018
1,32.666667,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,75000.0,1.964332
2,30.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,48000.0,1.824753
3,32.666667,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,31200.0,1.964332
4,32.666667,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,14000.0,2.052018


In [5]:
# Drop columns where the label is missing
df = df.loc[~pd.isnull(df['hhinc_2011'])]

df.shape

(4875, 10)

## 2. Split data into test/train

In [6]:
LABEL = 'hhinc_2011'
TEST_SIZE = 0.3

# Separate feature sets from label sets
x_df = df.drop(labels=[LABEL], axis=1)
y_df = df[LABEL]

# Split into test and train sets for features and labels
x_train, x_test, y_train, y_test =  train_test_split(x_df, y_df, test_size=TEST_SIZE)

## 3. Preprocess data

All vars are numeric - impute missing data with mean

In [7]:
# Check how many rows are missing across columns
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011    34
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      34
dtype: int64

TEST FEATURES MISSING:
dmspols_2011    18
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      18
dtype: int64


In [8]:
for i in (x_train, x_test):
    for j in i.columns:
        
        if i[j].isnull().sum():
            # Create imputed flag
            new_name = i[j].name + '_imputed'
            i[new_name] = pd.isnull(i[j]).astype('int')
            # Fill with mean
            i[j] = i[j].fillna(i[j].mean())
        else:
            continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
# All missing values were imputed
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64

TEST FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64


## 4. Feature Generation

[Landsat 7 specs](https://landsat.usgs.gov/sites/default/files/documents/si_product_guide.pdf#page=14)

Create indices from every possible pair of Landsat 7 band.
- Normalized Difference Vegetation Index, NDVI = $\frac{NIR - Red}{NIR + Red}$ is formed from the (NIR, Red) pair.
- Normalized Difference Built-up Index, NDBI = $\frac{SWIR1 - NIR}{SWIR1 + NIR}$ is formed from the (NIR, SWIR1) pair.
- Normalized Difference Water Index, NDWO = $\frac{NIR - SWIR1}{NIR + SWIR1}$ is also formed from the (NIR, SWIR1) pair.
- Modified NDWI, MNDWI = $\frac{Green - SWIR1}{Green + SWIR1}$ is formed from the (NIR, Green) pair. And so on.


| Band | 1 | 2 | 3 | 4 | 5 | 6 | 7
| ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- 
| 1 | NA 
| 2 | ? | NA 
| 3 | ? | ? | NA 
| 4 | ? | ? | NDVI | NA
| 5 | ? | MNDWI | ? | NDBI, NDWI | NA 
| 6 | ? | ? | ? | ? | ? | NA 
| 7 | ? | ? | ? | ? | ? | ? | NA



In [10]:
# Create ratios 
# Note that ratio of Band A to Band B is the same as ratio of Band B to Band A
# Solution: only create ratios where A < B
for df in (x_train, x_test):
    for i in range(1, 8):
        for j in range(1, 8):

            if i >= j:
                continue
            else:
                band1 = f'l7_2011_{i}'
                band2 = f'l7_2011_{j}'
                new_var = f'ratio_{i}_{j}'
                df[new_var] = abs((df[band1] - df[band2]) / (df[band1] + df[band2]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
x_train.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,dmspols_2011_imputed,viirs_2012_imputed,ratio_2_1,ratio_3_1,ratio_3_2,ratio_4_1,ratio_4_2,ratio_4_3,ratio_5_1,ratio_5_2,ratio_5_3,ratio_5_4,ratio_6_1,ratio_6_2,ratio_6_3,ratio_6_4,ratio_6_5,ratio_7_1,ratio_7_2,ratio_7_3,ratio_7_4,ratio_7_5,ratio_7_6
3739,34.0,933.867169,1233.648347,1391.880945,2380.166618,2351.975783,2999.888486,1807.197506,4.83112,0,0,0.138306,0.196932,0.060267,0.436417,0.31726,0.262002,0.431581,0.311892,0.256445,0.005957,0.525203,0.417202,0.366141,0.115189,0.121063,0.31861,0.188615,0.129824,0.136833,0.130982,0.248111
980,48.666667,996.723983,1277.002616,1520.621657,2275.935174,2355.01032,3043.332122,2034.937209,5.04023,0,0,0.123268,0.208115,0.087081,0.390878,0.281157,0.198947,0.405249,0.296807,0.215291,0.017075,0.506579,0.408841,0.333638,0.144267,0.127506,0.342457,0.228849,0.144651,0.055905,0.07291,0.198571
2826,8.333333,763.526461,1034.873364,1054.89401,2645.278715,1866.595377,3012.90695,1206.142483,0.379795,0,0,0.150882,0.160231,0.00958,0.552027,0.437592,0.429814,0.419398,0.286656,0.277838,0.172585,0.595636,0.488671,0.481344,0.064973,0.234924,0.224716,0.076425,0.066893,0.373664,0.21494,0.42824
819,61.666667,1148.804563,1390.461203,1593.157948,2047.266492,2190.735397,3041.03531,1947.28887,4.308349,0,0,0.095168,0.162057,0.067937,0.281115,0.191058,0.124741,0.311998,0.223466,0.157927,0.033853,0.451624,0.372464,0.312434,0.195305,0.162526,0.257901,0.166827,0.100024,0.025028,0.058832,0.219261
4570,1.0,689.395568,940.003621,1025.494786,2146.420046,1930.442207,2995.626883,1336.126304,0.199283,0,0,0.153804,0.195989,0.043496,0.513794,0.390878,0.353391,0.473711,0.345047,0.306146,0.052976,0.625839,0.522311,0.489946,0.16515,0.216234,0.319291,0.174033,0.131533,0.232673,0.181939,0.383101


In [12]:
x_test.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,dmspols_2011_imputed,viirs_2012_imputed,ratio_2_1,ratio_3_1,ratio_3_2,ratio_4_1,ratio_4_2,ratio_4_3,ratio_5_1,ratio_5_2,ratio_5_3,ratio_5_4,ratio_6_1,ratio_6_2,ratio_6_3,ratio_6_4,ratio_6_5,ratio_7_1,ratio_7_2,ratio_7_3,ratio_7_4,ratio_7_5,ratio_7_6
2536,7.333333,855.459745,1143.410223,1243.046481,2774.439039,2169.429482,3019.695048,1488.166522,0.658477,0,0,0.144057,0.184696,0.041751,0.528659,0.416307,0.381182,0.434386,0.30971,0.271469,0.122376,0.55849,0.450694,0.416785,0.042328,0.163855,0.269969,0.131008,0.089748,0.301757,0.18626,0.339746
3957,22.0,873.818788,1131.933459,1223.534793,2389.562047,2195.877646,3006.230647,1661.158597,2.013705,0,0,0.128687,0.166742,0.038889,0.46447,0.357129,0.322722,0.430681,0.319713,0.28436,0.042239,0.549584,0.45293,0.421464,0.114287,0.155774,0.31059,0.189476,0.151705,0.179821,0.138635,0.288185
1656,5.0,1162.755146,1625.067121,1893.677443,2680.252972,3002.015222,3034.748188,2388.531168,0.279252,0,0,0.165833,0.239142,0.076337,0.394872,0.245089,0.171969,0.441623,0.297585,0.22639,0.056626,0.445978,0.302519,0.231528,0.062029,0.005422,0.345164,0.190219,0.11556,0.057553,0.113807,0.119156
4451,6.0,478.082707,609.211828,627.612637,1936.649942,1447.59413,2940.616397,907.512724,0.383096,0,0,0.120601,0.135236,0.014877,0.604028,0.52141,0.510493,0.503465,0.407614,0.395132,0.14451,0.720313,0.656765,0.648222,0.205846,0.340235,0.309925,0.196674,0.18233,0.361842,0.229324,0.528336
3772,31.0,988.906477,1287.293494,1411.081905,2389.237293,2181.551844,3006.194307,1714.057072,3.142325,0,0,0.13109,0.175907,0.045875,0.414527,0.299724,0.257388,0.376174,0.257797,0.214458,0.045438,0.50494,0.400351,0.361108,0.114348,0.15896,0.26828,0.142191,0.096948,0.164546,0.120006,0.273743


In [16]:
# check that lengths match
print(len(x_train) == len(y_train))
print(len(x_test) == len(y_test))

True
True


## 5. Train classifiers