In [4]:
import warnings

import numpy as np
import pandas as pd

from predictiveImputer_mod import PredictiveImputer

In [5]:
### load only the leading rows of the large assembled dataset
# `nrows` caps how many rows are parsed from the CSV; head() previews the frame.
data = pd.read_csv(
    '../data/assembled_data.csv',
    nrows=1200000,
)
data.head()

Unnamed: 0,site,year,date,MonitorData,GFEDFireCarbon,USElevation_dsc10000,USElevation_max100,USElevation_max10000,USElevation_mea100,USElevation_mea10000,...,Nearby_Peak2Lag3_MeanTemperature,Nearby_Peak2Lag3_MinTemperature,OMAEROe_UVAerosolIndex_Mean,OMAEROe_VISAerosolIndex_Mean,OMAERUVd_UVAerosolIndex_Mean,OMNO2d_ColumnAmountNO2StratoCloudScreened_Mean,OMO3PR,OMSO2e_ColumnAmountSO2_PBL_Mean,OMTO3e_ColumnAmountO3,OMUVBd_UVindex_Mean
0,1,2000,2000-01-01,,0.001167,26.790501,43,30.143499,36.0,26.504299,...,286.112711,280.293551,,,,,,,,
1,1,2000,2000-01-02,,0.001236,26.790501,43,30.143499,36.0,26.504299,...,286.112711,280.293551,,,,,,,,
2,1,2000,2000-01-03,,0.001305,26.790501,43,30.143499,36.0,26.504299,...,286.112711,280.293551,,,,,,,,
3,1,2000,2000-01-04,,0.001373,26.790501,43,30.143499,36.0,26.504299,...,286.112711,280.293551,,,,,,,,
4,1,2000,2000-01-05,,0.001442,26.790501,43,30.143499,36.0,26.504299,...,290.424271,286.541158,,,,,,,,


In [7]:
### read in census data as example static variables
census = pd.read_csv('../data/sensor_locations_with_census.csv')

# Site identifier and coordinates. A list (not a tuple) is the documented way to
# select several columns by label; a tuple is the notation for one MultiIndex key.
site_lat_long = census.loc[:, ['Site_ID', 'Lat', 'Lon']]

# NOTE(review): the saved head() output shows Site_ID as NaN for the first rows.
# Confirm the column is populated -- it is the key used to join onto `data`.

# Census attributes are the columns from position 8 up to (excluding) the last
# one; they are placed to the right of the identifier/coordinate columns.
census_reduced = pd.concat([site_lat_long, census.iloc[:, 8:-1]], axis=1)
census_reduced.head()

Unnamed: 0,Site_ID,Lat,Lon,Continental_indicator,Population,Land_Sq_Mi,Population_Density,White,Black,Native,...,Income_150to200k_p,Income_over200k_p,Age_0to9_p,Age_10to19_p,Age_20to29_p,Age_30to39_p,Age_40to49_p,Age_50to59_p,Age_60to69_p,Age_over70_p
0,,30.49748,-87.88026,1,32285.0,73.738,437.8,28378.0,1901.0,43.0,...,0.022,0.03,0.123,0.141,0.077,0.095,0.146,0.135,0.142,0.142
1,,33.28493,-85.80361,1,5195.0,144.453,36.0,4457.0,504.0,42.0,...,0.002,0.002,0.124,0.125,0.104,0.089,0.173,0.135,0.114,0.136
2,,34.76262,-87.6381,1,16861.0,55.916,301.5,13570.0,2666.0,110.0,...,0.018,0.005,0.124,0.11,0.119,0.121,0.13,0.155,0.107,0.134
3,,34.28857,-85.96986,1,9691.0,78.702,123.1,6789.0,4.0,52.0,...,0.002,0.002,0.151,0.17,0.137,0.134,0.139,0.104,0.054,0.111
4,,33.99149,-85.99265,1,17106.0,67.416,253.7,10507.0,5395.0,9.0,...,0.007,0.008,0.107,0.112,0.156,0.12,0.117,0.152,0.134,0.102


In [8]:
# Attach the static census attributes to every monitor row. The left join keeps
# all rows of `data`; indicator=True adds a temporary `_merge` column so rows
# without a census match can be counted instead of silently turning into NaN.
data_final = data.merge(right=census_reduced, left_on='site', right_on='Site_ID',
                        how='left', indicator=True)

# Surface unmatched rows: the saved output of this cell shows the census columns
# as NaN, which is what a join that matched nothing looks like.
n_unmatched = int((data_final['_merge'] == 'left_only').sum())
if n_unmatched:
    warnings.warn('%d of %d rows have no census match on site/Site_ID; '
                  'their census columns are NaN' % (n_unmatched, len(data_final)))

# Drop the redundant join key and the temporary indicator column.
data_final = data_final.drop(['Site_ID', '_merge'], axis=1)
data_final.head()

Unnamed: 0,site,year,date,MonitorData,GFEDFireCarbon,USElevation_dsc10000,USElevation_max100,USElevation_max10000,USElevation_mea100,USElevation_mea10000,...,Income_150to200k_p,Income_over200k_p,Age_0to9_p,Age_10to19_p,Age_20to29_p,Age_30to39_p,Age_40to49_p,Age_50to59_p,Age_60to69_p,Age_over70_p
0,1,2000,2000-01-01,,0.001167,26.790501,43,30.143499,36.0,26.504299,...,,,,,,,,,,
1,1,2000,2000-01-02,,0.001236,26.790501,43,30.143499,36.0,26.504299,...,,,,,,,,,,
2,1,2000,2000-01-03,,0.001305,26.790501,43,30.143499,36.0,26.504299,...,,,,,,,,,,
3,1,2000,2000-01-04,,0.001373,26.790501,43,30.143499,36.0,26.504299,...,,,,,,,,,,
4,1,2000,2000-01-05,,0.001442,26.790501,43,30.143499,36.0,26.504299,...,,,,,,,,,,


In [9]:
### train/val/test split by site id
np.random.seed(1)

# Positional slice of the columns used as imputer input (positions 4..19).
FEATURE_COLS = slice(4, 20)


def _split_xy(rows):
    """Return ``(x, y, sites)`` for one partition of ``data_final``.

    x     -- the columns of ``rows`` at positions ``FEATURE_COLS``
    y     -- the 'MonitorData' column
    sites -- the 'site' column (one entry per row)
    """
    x = rows.iloc[:, FEATURE_COLS]
    # (disabled in the original) x = x.drop('REANALYSIS_windspeed_10m_1Day', axis=1)
    y = rows.loc[:, 'MonitorData']
    sites = rows.loc[:, 'site']
    return x, y, sites


# get sites for val/test data: a quarter of all distinct sites
all_site_ids = np.unique(data_final['site'].values)
val_test_sites = np.random.choice(all_site_ids, round(len(all_site_ids)/4), replace = False)

# get sites for test data: half of the held-out sites.
# Kept under its own name so it is not confused with the per-row `test_sites`
# Series assigned below (the original reused one name for both).
held_out_ids = np.unique(val_test_sites)
test_site_ids = np.random.choice(held_out_ids, round(len(held_out_ids)/2), replace = False)

# row masks; test sites are a subset of the held-out sites
in_held_out = data_final['site'].isin(val_test_sites)
in_test = data_final['site'].isin(test_site_ids)

train = data_final[~in_held_out]
val = data_final[in_held_out & ~in_test]
test = data_final[in_test]

# the three partitions must cover every row exactly once
assert len(train) + len(val) + len(test) == len(data_final)

# x/y/site split for each partition
train_x, train_y, train_sites = _split_xy(train)
val_x, val_y, val_sites = _split_xy(val)
test_x, test_y, test_sites = _split_xy(test)

In [10]:
# Build an imputer configured with f_model='Ridge' and fit it on the training
# features only (no target is passed).
# alpha / normalize are extra keyword arguments to fit -- presumably forwarded
# to the underlying Ridge model; confirm in predictiveImputer_mod.
pred_imputer = PredictiveImputer(
    f_model='Ridge',
    initial_strategy='mean',
    max_iter=10,
)
pred_imputer.fit(
    X=train_x,
    y=None,
    alpha=1,
    normalize=True,
)

PredictiveImputer(f_model='Ridge', initial_strategy='mean', max_iter=10)

In [72]:
# Rebuild the imputer with f_model='RandomForest' and fit it on the training
# features. This rebinds `pred_imputer`, replacing the Ridge-based one.
# The extra keyword arguments are presumably forwarded to the random-forest
# model (verbose=1 would then be the source of the [Parallel(...)] progress
# lines in the saved output) -- confirm in predictiveImputer_mod.
pred_imputer = PredictiveImputer(
    f_model='RandomForest',
    initial_strategy='mean',
    max_iter=10,
)
pred_imputer.fit(
    X=train_x,
    y=None,
    n_estimators=50,
    max_features='sqrt',
    n_jobs=-1,
    oob_score=False,
    verbose=1,
)

0


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.9s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.9s
[Parall

0.000444359709239
1


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.8s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.0s
[Parall

3.91347104188e-05
2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.8s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.0s
[Parall

1.50965147013e-06
3


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parall

2.76042814637e-08
4


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.9s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.9s
[Parall

9.52889243633e-10
5


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.9s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.6s
[Parall

2.97509075185e-09


PredictiveImputer(f_model='RandomForest', initial_strategy='mean',
         max_iter=10)