# Predict household income from satellite imagery data

First pass.

General ML pipeline steps:
1. Import data
2. Split data into test/train sets
3. Preprocess test/train sets separately
4. Generate features from data
5. For each regressor-hyperparameter combination:
    - Train regressor with given hyperparameters and training data and labels
    - Generate predicted labels for test data with trained regressor
    - Evaluate regressor-hyperparameter performance against actual test labels and get $R^2$
6. Explore best-performing models

In [506]:
import os
import math
import pickle
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

# Import configuration file
import config as cf

# Display options 
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = -1

# Turn off big pink warnings
import warnings
warnings.filterwarnings('ignore')

# Data file path 
final_data_file_path = "/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/FinalData"
#


In [507]:
# Test grid to make sure everything works - limited models and parameters
GRID_TEST_CLASS = {
    'regressors': ['LinearSVC','DecisionTreeClassifier', 'BaggingClassifier',
                    'GradientBoostingClassifier', 'RandomForestClassifier'],
    'LinearSVC': [
        {'penalty': penalty, 'C': C, 'loss': loss, 'max_iter': max_iter,
        'random_state': 0} \
        for penalty in ('l2', ) \
        for C in (1e-2, 1e2) \
        for loss in ('epsilon_insensitive', ) \
        for max_iter in (1e3, 1e5)
    ],
    'DecisionTreeClassifier': [
        {'criterion': criterion, 'splitter': splitter, 'max_depth': max_depth,
        'max_features': max_features, 'random_state': 0} \
        for criterion in ('gini', ) \
        for splitter in ('best', ) \
        for max_depth in (1, 5, 10, 20, 30) \
        for max_features in ('sqrt', ) \
    ],
    'BaggingClassifier': [
        {'n_estimators': n_estimators, 'max_features': max_features,
        'random_state': 0, 'n_jobs': -1} \
        for n_estimators in (100, 1000, 10000) \
        for max_features in (0.3, 0.5, 1.0)
    ],
    'GradientBoostingClassifier': [
        {'loss': loss, 'learning_rate': rate, 'n_estimators': n_estimators,
        'criterion': criterion, 'max_features': max_features,
        'random_state': 0} \
        for loss in ('deviance', ) \
        for rate in (1e-4, )
        for n_estimators in (100, ) \
        for criterion in ('friedman_mse', ) \
        for max_features in ('sqrt', ) \
    ],
    'RandomForestClassifier': [
        {'n_estimators': n_estimators, 'criterion': criterion,
        'max_depth': max_depth, 'max_features': max_features, 'n_jobs': -1,
        'random_state': 0} \
        for n_estimators in (10, 100, 1000) \
        for criterion in ('gini', ) \
        for max_depth in (1, ) \
        for max_features in ('sqrt', )
    ]
}

## 1. Import data and drop "future" rows

In [508]:
DATA_PATH = os.path.join(final_data_file_path, 'Outputs for Analysis TEMP', 'bisp_sat_inc_data.csv')
df = pd.read_csv(DATA_PATH)
df.shape

(5416, 46)

In [509]:
df.head()

Unnamed: 0,uid,viirs_2012,viirs_2013,viirs_2014,viirs_2015,viirs_2016,viirs_2017,viirs_2018,dmspols_1992,dmspols_1993,dmspols_1994,dmspols_1995,dmspols_1996,dmspols_1997,dmspols_1998,dmspols_1999,dmspols_2000,dmspols_2001,dmspols_2002,dmspols_2003,dmspols_2004,dmspols_2005,dmspols_2006,dmspols_2007,dmspols_2008,dmspols_2009,dmspols_2010,dmspols_2011,dmspols_2012,dmspols_2013,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,l7_2013_1,l7_2013_2,l7_2013_3,l7_2013_4,l7_2013_5,l7_2013_6,l7_2013_7,hhinc_2011,hhinc_2013
0,100389,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,951.897734,1282.748257,1417.251598,2574.000436,2469.137711,3000.408919,1889.609384,9000.0,73000.0
1,100401,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,941.063694,1268.392009,1402.77707,2547.212362,2463.117111,2998.70194,1876.871453,75000.0,159000.0
2,100581,1.824753,1.937131,1.875487,2.04754,2.557241,3.198625,3.286,43.0,32.5,34.25,43.0,38.0,31.75,38.25,38.75,36.0,38.25,37.75,32.0,32.75,33.75,40.0,43.75,42.5,30.0,45.5,30.5,47.5,44.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,935.16206,1263.157696,1398.079866,2572.847832,2458.750073,2999.056008,1880.909223,48000.0,0.0
3,101101,1.964332,2.133366,2.052437,2.296554,2.76996,3.702374,3.488333,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,940.979913,1268.248763,1401.871616,2547.740466,2459.946143,2998.786463,1874.074672,31200.0,219000.0
4,101236,2.052018,2.141392,2.089507,2.307763,2.850603,3.653005,3.75,43.0,33.666667,35.5,45.333333,40.0,33.166667,39.5,40.333333,37.333333,39.666667,38.833333,33.666667,34.0,34.5,40.666667,45.0,43.0,30.333333,46.0,32.666667,47.666667,45.333333,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,943.113959,1271.824645,1403.386186,2563.689111,2453.279467,2999.725311,1869.047929,14000.0,


In [510]:
# Remove NAs
df = df.loc[(df['hhinc_2011'] >= 0)]
df = df.loc[(df['hhinc_2011'] <= 20000000)]

In [511]:
# DV as Quantiles
df['hhinc_2011'] = pd.qcut(df['hhinc_2011'], 3, labels=False)
df['hhinc_2011'].value_counts()

0    1663
1    1613
2    1590
Name: hhinc_2011, dtype: int64

In [512]:
# Keep only 2011 columns, but include viirs_2012
df = df.filter(regex='_2011', axis=1).join(df[['viirs_2012','uid']])
df.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,hhinc_2011,viirs_2012,uid
0,32.666667,902.331348,1224.739396,1393.123911,2555.792708,2474.174317,3005.856769,1922.539802,0,2.052018,100389
1,32.666667,885.841488,1200.54835,1366.253764,2512.672843,2451.849595,3004.616242,1890.566155,1,1.964332,100401
2,30.5,886.021385,1206.745127,1373.031277,2550.999418,2462.90966,3006.164678,1900.64984,1,1.824753,100581
3,32.666667,886.196798,1201.037263,1366.468559,2514.479913,2450.865939,3004.699563,1890.108734,0,1.964332,101101
4,32.666667,891.264553,1209.61309,1374.709528,2535.919345,2453.881552,3005.134086,1897.493484,0,2.052018,101236


In [513]:
# Drop columns where the label is missing
df = df.loc[~pd.isnull(df['hhinc_2011'])]

df.shape

(4866, 11)

## 2. Split data into test/train

In [514]:
LABEL = 'hhinc_2011'
TEST_SIZE = 0.3

# Separate feature sets from label sets
x_df = df.drop(labels=[LABEL], axis=1)
y_df = df[LABEL]

# Split into test and train sets for features and labels
x_train, x_test, y_train, y_test =  train_test_split(x_df, y_df, test_size=TEST_SIZE)

## 3. Preprocess data

All vars are numeric - impute missing data with mean

In [515]:
# Check how many rows are missing across columns
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011    34
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      34
uid             0 
dtype: int64

TEST FEATURES MISSING:
dmspols_2011    18
l7_2011_1       0 
l7_2011_2       0 
l7_2011_3       0 
l7_2011_4       0 
l7_2011_5       0 
l7_2011_6       0 
l7_2011_7       0 
viirs_2012      18
uid             0 
dtype: int64


In [516]:
for i in (x_train, x_test):
    for j in i.columns:
        
        if i[j].isnull().sum():
            # Create imputed flag
            new_name = i[j].name + '_imputed'
            i[new_name] = pd.isnull(i[j]).astype('int')
            # Fill with mean
            i[j] = i[j].fillna(i[j].mean())
        else:
            continue

In [517]:
# All missing values were imputed
print("TRAINING FEATURES MISSING:")
print(pd.isnull(x_train).sum())
print("")
print("TEST FEATURES MISSING:")
print(pd.isnull(x_test).sum())

TRAINING FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
uid                     0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64

TEST FEATURES MISSING:
dmspols_2011            0
l7_2011_1               0
l7_2011_2               0
l7_2011_3               0
l7_2011_4               0
l7_2011_5               0
l7_2011_6               0
l7_2011_7               0
viirs_2012              0
uid                     0
dmspols_2011_imputed    0
viirs_2012_imputed      0
dtype: int64


## 4. Feature Generation

[Landsat 7 specs](https://landsat.usgs.gov/sites/default/files/documents/si_product_guide.pdf#page=14)

Create indices from every possible pair of Landsat 7 band.
- Normalized Difference Vegetation Index, NDVI = $\frac{NIR - Red}{NIR + Red}$ is formed from the (NIR, Red) pair.
- Normalized Difference Built-up Index, NDBI = $\frac{SWIR1 - NIR}{SWIR1 + NIR}$ is formed from the (NIR, SWIR1) pair.
- Normalized Difference Water Index, NDWO = $\frac{NIR - SWIR1}{NIR + SWIR1}$ is also formed from the (NIR, SWIR1) pair.
- Modified NDWI, MNDWI = $\frac{Green - SWIR1}{Green + SWIR1}$ is formed from the (NIR, Green) pair. And so on.


| Band | 1 | 2 | 3 | 4 | 5 | 6 | 7
| ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- 
| 1 | NA 
| 2 | ? | NA 
| 3 | ? | ? | NA 
| 4 | ? | ? | NDVI | NA
| 5 | ? | MNDWI | ? | NDBI, NDWI | NA 
| 6 | ? | ? | ? | ? | ? | NA 
| 7 | ? | ? | ? | ? | ? | ? | NA



In [518]:
# Create ratios 
# Note that ratio of Band A to Band B is the same as ratio of Band B to Band A
# Solution: only create ratios where A < B
for df in (x_train, x_test):
    for i in range(1, 8):
        for j in range(1, 8):

            if i >= j:
                continue
            else:
                band1 = f'l7_2011_{i}'
                band2 = f'l7_2011_{j}'
                new_var = f'ratio_{i}_{j}'
                df[new_var] = abs((df[band1] - df[band2]) / (df[band1] + df[band2]))

In [519]:
x_train.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,uid,dmspols_2011_imputed,viirs_2012_imputed,ratio_1_2,ratio_1_3,ratio_1_4,ratio_1_5,ratio_1_6,ratio_1_7,ratio_2_3,ratio_2_4,ratio_2_5,ratio_2_6,ratio_2_7,ratio_3_4,ratio_3_5,ratio_3_6,ratio_3_7,ratio_4_5,ratio_4_6,ratio_4_7,ratio_5_6,ratio_5_7,ratio_6_7
3432,8.25,1163.895851,1550.294314,1817.927038,2933.406005,2526.418045,3052.574993,2039.724398,0.493734,37500574,0,0,0.142362,0.219339,0.431872,0.369216,0.447929,0.273387,0.079458,0.308475,0.239439,0.326379,0.136331,0.234772,0.163084,0.253495,0.057495,0.074542,0.019908,0.179702,0.09431,0.106587,0.198898
3006,4.0,1042.715843,1380.166134,1518.158721,2497.521366,2146.843169,3010.179651,1475.236047,0.212836,33502329,0,0,0.139276,0.185656,0.410934,0.346169,0.485446,0.171775,0.047611,0.28815,0.217373,0.371272,0.033295,0.243885,0.171537,0.329485,0.014339,0.075506,0.09308,0.257324,0.16741,0.18542,0.342208
5279,5.0,1658.599069,2283.994472,2757.122636,3403.197847,4255.314664,3038.856125,3459.023276,0.431078,61500427,0,0,0.158625,0.248776,0.34466,0.439086,0.293831,0.351809,0.093854,0.196794,0.301457,0.141815,0.204601,0.104877,0.213648,0.048608,0.112916,0.111264,0.056557,0.008135,0.166771,0.103222,0.064662
700,8.0,723.232666,989.416884,1095.922251,2654.296055,2021.74369,3015.491732,1458.030316,0.44672,9301283,0,0,0.155422,0.20487,0.571739,0.47305,0.613113,0.336868,0.051073,0.456918,0.342834,0.505898,0.19147,0.415542,0.29696,0.466888,0.141783,0.135275,0.063705,0.290898,0.19728,0.161997,0.348151
3641,4.666667,636.219512,884.598432,941.108159,2344.991144,2054.88676,2979.426829,1397.264228,0.298406,39503329,0,0,0.163319,0.193294,0.57318,0.527169,0.648074,0.374257,0.030952,0.452191,0.398127,0.542136,0.22467,0.427219,0.371756,0.519908,0.195074,0.065935,0.119156,0.25325,0.183648,0.190496,0.361497


In [520]:
x_test.head()

Unnamed: 0,dmspols_2011,l7_2011_1,l7_2011_2,l7_2011_3,l7_2011_4,l7_2011_5,l7_2011_6,l7_2011_7,viirs_2012,uid,dmspols_2011_imputed,viirs_2012_imputed,ratio_1_2,ratio_1_3,ratio_1_4,ratio_1_5,ratio_1_6,ratio_1_7,ratio_2_3,ratio_2_4,ratio_2_5,ratio_2_6,ratio_2_7,ratio_3_4,ratio_3_5,ratio_3_6,ratio_3_7,ratio_4_5,ratio_4_6,ratio_4_7,ratio_5_6,ratio_5_7,ratio_6_7
634,13.0,864.398108,1131.496652,1245.548763,2627.650946,2213.787773,3023.319505,1643.098399,0.496819,7905808,0,0,0.133824,0.180645,0.504934,0.438372,0.555318,0.310549,0.047981,0.398004,0.323527,0.455333,0.184388,0.356837,0.279892,0.41645,0.137625,0.085484,0.070018,0.230534,0.154576,0.147966,0.295777
261,6.333333,799.658946,1100.635495,1403.190504,2282.755067,2646.810799,3025.422264,2280.678344,0.325722,3501957,0,0,0.158384,0.273978,0.481148,0.535955,0.581887,0.480798,0.120837,0.349389,0.412594,0.466495,0.348989,0.238627,0.307067,0.366307,0.238197,0.073851,0.13991,0.000455,0.066748,0.074304,0.140356
3389,11.5,957.33818,1270.395755,1382.035766,2840.356935,2284.70064,3021.468741,1675.123582,0.683521,37202289,0,0,0.140527,0.181543,0.495832,0.409422,0.518781,0.272667,0.04209,0.381916,0.28531,0.407998,0.137405,0.345378,0.246177,0.372302,0.095869,0.10842,0.030897,0.258053,0.138851,0.15394,0.286664
3273,53.5,1017.534107,1423.928737,1702.397823,2622.532511,2620.549927,3040.629318,2038.411176,8.751615,36102201,0,0,0.166455,0.251794,0.440926,0.440621,0.498525,0.334063,0.089072,0.29621,0.295865,0.362119,0.177476,0.212751,0.21239,0.282147,0.089824,0.000378,0.073827,0.125323,0.074204,0.12495,0.197324
389,15.666667,889.260093,1208.297851,1352.438426,2646.46747,2353.721173,3019.848243,1806.44743,0.623078,5101034,0,0,0.1521,0.206619,0.496986,0.451579,0.545032,0.34024,0.056289,0.373089,0.321566,0.42845,0.198408,0.323596,0.270167,0.381359,0.143724,0.058547,0.065895,0.188645,0.123964,0.131551,0.251415


In [521]:
# check that lengths match
print(len(x_train) == len(y_train))
print(len(x_test) == len(y_test))

True
True


### 4.1 Define feature groups

1. Daytime-only: Landsat 7 band data and computed indices
2. Nighttime-only: DMSP and VIIRS data + imputed flags
3. All features

In [522]:
DAY_FEATURES = df.filter(regex='l7|ratio', axis=1).columns.tolist()
NIGHT_FEATURES = ['dmspols_2011', 'viirs_2012', 'dmspols_2011_imputed', 'viirs_2012_imputed']
ALL_FEATURES = df.columns.tolist()

print("Day-only:", DAY_FEATURES)
print("-----")
print("Night-only:", NIGHT_FEATURES)

Day-only: ['l7_2011_1', 'l7_2011_2', 'l7_2011_3', 'l7_2011_4', 'l7_2011_5', 'l7_2011_6', 'l7_2011_7', 'ratio_1_2', 'ratio_1_3', 'ratio_1_4', 'ratio_1_5', 'ratio_1_6', 'ratio_1_7', 'ratio_2_3', 'ratio_2_4', 'ratio_2_5', 'ratio_2_6', 'ratio_2_7', 'ratio_3_4', 'ratio_3_5', 'ratio_3_6', 'ratio_3_7', 'ratio_4_5', 'ratio_4_6', 'ratio_4_7', 'ratio_5_6', 'ratio_5_7', 'ratio_6_7']
-----
Night-only: ['dmspols_2011', 'viirs_2012', 'dmspols_2011_imputed', 'viirs_2012_imputed']


### 4.2 Pickle cleaned data for future use

In [523]:
clean_data = [x_train, x_test, y_train, y_test]

output_path = os.path.join('output', 'final_data.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(obj=clean_data,
                file=f,
                protocol=pickle.HIGHEST_PROTOCOL)

## 5. Train and Evaluate Regressors

### 5.1 Training

In [524]:
x_all = x_test.append(x_train)
y_all = y_test.append(y_train)

In [525]:
# Define a TrainedRegressor object to hold key results information
class TrainedRegressor:
    
    def __init__(self, method, params, features, regressor):
        self.method = method
        self.params = params
        self.regressor = regressor
        self.features = features
    
    def __repr__(self):
        return f'Trained {self.method} on feature set {self.features} with params {self.params}'

In [526]:
# Use GRID_MAIN for full grid search
# parameters = cf.GRID_TEST_CLASS
parameters = GRID_TEST_CLASS

results_df = pd.DataFrame()
results_df_all = pd.DataFrame()
results_df_trainedonly_all = pd.DataFrame()

x_trainedonly_all = x_all.copy()

trained_list = []
trained_list_all = []
count = 0
# print('Training model ', end='')
for i in parameters['regressors']:
    for j in parameters[i]:
        for k in ('DAY_FEATURES', 'NIGHT_FEATURES', 'ALL_FEATURES'):
        
            print(f'Model {count}: Training {i} on {k} with params {str(j)}')

            # A. Train ------------------------------------
            # Initialize regressor, fit data, then append TrainedRegressor object to list
            # 1. Train Data
            regressor = eval(i)(**j)
            trained = regressor.fit(x_train[eval(k)], y_train)
            trained_list.append(TrainedRegressor(i, str(j), k, trained))

            # 2. All Data
            trained_all = regressor.fit(x_all[eval(k)], y_all)
            trained_list_all.append(TrainedRegressor(i, str(j), k, trained_all))
            
            
            
            
            
            
            
            # B. Results -------------------------------------
            # 1. Trained Model on Test Data - - - - - - - - - -
            pred_labels = trained_list[count].regressor.predict(x_test[eval(k)])

            pred_dict = {
                'regressor': trained_list[count].method,
                'features': trained_list[count].features,
                'params': trained_list[count].params,
                'accuracy_score': accuracy_score(y_true=y_test, y_pred=pred_labels)        
            }
    
            results_df = results_df.append(pred_dict, ignore_index=True) \
                .sort_values(by='accuracy_score', ascending=False, axis=0) \
                [['regressor', 'params', 'features', 'accuracy_score']]
        
            results_df.to_csv("/Users/robmarty/Desktop/results.csv")
            
            x_test['y_true'] = y_test
            x_test['y_predict_' + str(count)] = pred_labels
            x_test.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'opm_data_with_predictions_traineddatamodel_testdatapredict.csv'))
            
            
            
            
            
            
            
            # 2. Trained All Model on All Data - - - - - - - - - -
            pred_labels_all = trained_list_all[count].regressor.predict(x_all[eval(k)])

            # Append results to dataframe and sort by R^2
            pred_dict = {
                'regressor': trained_list_all[count].method,
                'features': trained_list_all[count].features,
                'params': trained_list_all[count].params,
                'accuracy_score': accuracy_score(y_true=y_all, y_pred=pred_labels_all)        
            }
    
            results_df_all = results_df_all.append(pred_dict, ignore_index=True) \
                .sort_values(by='accuracy_score', ascending=False, axis=0) \
                [['regressor', 'params', 'features', 'accuracy_score']]
        
            results_df_all.to_csv("/Users/robmarty/Desktop/results_all.csv")

            # ALL
            x_trainedonly_all['y_true'] = y_all
            x_trainedonly_all['y_predict_' + str(count)] = trained_list_all[count].regressor.predict(x_all[eval(k)])
            x_trainedonly_all.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'opm_data_with_predictions_alldatamodel_alldatapredict.csv'))
            
            
            
            
            
            
            # 3. Trained Model on All Data - - - - - - - - - -
            pred_labels_trainedonly_all = trained_list[count].regressor.predict(x_all[eval(k)])

            # Append results to dataframe and sort by R^2
            pred_dict = {
                'regressor': trained_list[count].method,
                'features': trained_list[count].features,
                'params': trained_list[count].params,
                'accuracy_score': accuracy_score(y_true=y_all, y_pred=pred_labels_trainedonly_all)        
            }
    
            results_df_trainedonly_all = results_df_trainedonly_all.append(pred_dict, ignore_index=True) \
                .sort_values(by='accuracy_score', ascending=False, axis=0) \
                [['regressor', 'params', 'features', 'accuracy_score']]
        
            results_df_trainedonly_all.to_csv("/Users/robmarty/Desktop/results_trainedonly_all.csv")

            # ALL
            x_all['y_true'] = y_all
            x_all['y_predict_' + str(count)] = trained_list[count].regressor.predict(x_all[eval(k)])
            x_all.to_csv(os.path.join(final_data_file_path, 'Data with Predicted Income', 'opm_data_with_predictions_testdatamodel_alldatapredict.csv'))

            ####
            count += 1


Model 0: Training LinearSVC on DAY_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 1000.0, 'random_state': 0}
Model 1: Training LinearSVC on NIGHT_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 1000.0, 'random_state': 0}
Model 2: Training LinearSVC on ALL_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 1000.0, 'random_state': 0}
Model 3: Training LinearSVC on DAY_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 100000.0, 'random_state': 0}
Model 4: Training LinearSVC on NIGHT_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 100000.0, 'random_state': 0}
Model 5: Training LinearSVC on ALL_FEATURES with params {'penalty': 'l2', 'C': 0.01, 'loss': 'epsilon_insensitive', 'max_iter': 100000.0, 'random_state': 0}
Model 6: Training LinearSVC on DAY_FEATURES with params {'pe

KeyboardInterrupt: 

In [None]:
#y_test
#pred_labels
#parameters

### 5.2 Prediction and Evaluation

In [None]:
results_df = pd.DataFrame()
for i in trained_list:
    
    # Get predicted results from test data
    features = eval(i.features)
    pred_labels = i.regressor.predict(x_test[features])
    
    # Append results to dataframe and sort by R^2
    pred_dict = {
        'regressor': i.method,
        'features': i.features,
        'params': i.params,
        'r2': r2_score(y_true=y_test, y_pred=pred_labels)        
    }
    
    results_df = results_df.append(pred_dict, ignore_index=True) \
        .sort_values(by='r2', ascending=False, axis=0) \
        [['regressor', 'params', 'features', 'r2']]

results_df.shape

In [None]:
results_df