In [9]:
import pandas as pd
import datetime

# Prepare the sensor/census table for imputation: keep continental rows,
# derive month / interaction / year features, drop unused columns, save to CSV.

# read in data
sensor_census_raw = pd.read_csv('../data/subset_with_sensor_locations_and_census.csv')

# keep rows where Continental_ind == 1
# Filtering happens first so that the year baseline and the month dummy columns
# below are computed only from the rows that are actually saved. `.copy()` gives
# an independent frame so the column assignments below do not act on a slice.
sensor_census = sensor_census_raw[sensor_census_raw['Continental_ind'] == 1].copy()

# extract month from dates and add to dataframe (vectorised parse of YYYY-MM-DD strings)
sensor_census['month'] = pd.to_datetime(sensor_census['date'], format='%Y-%m-%d').dt.month

# add interaction between lat and long
sensor_census['lat_long_int'] = sensor_census['Lat'] * sensor_census['Lon']

# convert months to dummy cols
sensor_census = pd.get_dummies(sensor_census, columns=['month'])

# have year start at year zero (relative to the earliest year among retained rows)
sensor_census['year'] = sensor_census['year'] - sensor_census['year'].min()

# drop census count data and keep proportions; drop site, city, state, county, zip, country, date;
# drop REANALYSIS_windspeed_10m_1Day because all missing
cols_to_drop = [
    # census counts by race / ethnicity
    'White', 'Black', 'Native', 'Asian', 'Islander', 'Other', 'Two', 'Hispanic',
    # census counts by age band
    'Age_0_to_9', 'Age_10_to_19', 'Age_20_to_29', 'Age_30_to_39',
    'Age_40_to_49', 'Age_50_to_59', 'Age_60_to_69', 'Age_70_plus',
    # census counts by income band
    'Income_less_than_25k', 'Income_25k_to_50k', 'Income_50k_to_75k', 'Income_75k_to_100k',
    'Income_100k_to_150k', 'Income_150k_to_200k', 'Income_200k_or_more',
    # household counts
    'Households', 'Family_Households',
    # identifiers / location labels and the raw date string
    'site', 'City', 'State', 'County', 'Zip', 'Country', 'date',
    # all values missing
    'REANALYSIS_windspeed_10m_1Day',
]
sensor_census = sensor_census.drop(columns=cols_to_drop)

# going to try to impute in R, so saving and reloading in R
sensor_census.to_csv('../data/subet_sensor_census_toImpute.csv', index=False)

Now switching to the R kernel; the remaining cells are R code.

In [1]:
# Load the missForest package. library() stops with an error if the package is
# not installed, whereas require() only returns FALSE with a warning.
# One-time setup if it is missing: install.packages('missForest')
library('missForest')

Loading required package: mice
Loading required package: ImputeRobust
Loading required package: gamlss
Loading required package: splines
Loading required package: gamlss.data
Loading required package: gamlss.dist
Loading required package: MASS
Loading required package: nlme
Loading required package: parallel
 **********   GAMLSS Version 5.0-1  ********** 
For more on GAMLSS look at http://www.gamlss.org/
Type gamlssNews() to see new features/changes/bug fixes.

Loading required package: lattice


In [74]:
# Load the table to be screened for highly correlated columns.
# NOTE(review): this is the same file name that the final write.csv() call in
# this notebook writes to, while the Python step saves
# '../data/subet_sensor_census_toImpute.csv' -- confirm which file is the
# intended input here.
sensor_census2 <- read.csv(file = 'subet_sensor_census_readyToImpute.csv')

In [78]:
# get correlation matrix for all variables (pairwise-complete observations)
corr.mat <- as.matrix(cor(sensor_census2, use = 'pairwise.complete.obs'))

# zero the diagonal so a variable's correlation with itself never counts as
# "high"; direct assignment also clears a diagonal entry that came back as NA
diag(corr.mat) <- 0

# get variable names for variables with high correlations
# abs() catches strong negative correlations as well as positive ones, and
# na.rm = TRUE keeps undefined (NA) correlations from turning the logical
# index -- and therefore the selected names -- into NA
row.max.abs.corr <- apply(abs(corr.mat), 1, max, na.rm = TRUE)
high.corr.colnames <- colnames(corr.mat)[row.max.abs.corr >= 0.9]

# columns to delete
to.delete <- c(
  'USElevation_dsc10000', 'USElevation_max100', 'USElevation_max10000',
  'USElevation_mea10000', 'USElevation_med100', 'USElevation_med10000',
  'USElevation_min100', 'USElevation_min10000', 'USElevation_bln100',
  'USElevation_bln10000',
  'NLCD_Developed10000', 'NLCD_Impervious10000',
  'MAIACUS_Optical_Depth_055_Aqua_Nearest4',
  'MAIACUS_Optical_Depth_055_Terra_Nearest4',
  'REANALYSIS_shum_2m_DailyMax', 'REANALYSIS_prate_DailyMax',
  'REANALYSIS_prate_DailyMean', 'REANALYSIS_dlwrf_DailyMean',
  'REANALYSIS_shum_2m_DailyMin', 'REANALYSIS_shum_2m_1Day',
  'REANALYSIS_air_sfc_DailyMin', 'REANALYSIS_air_sfc_DailyMax',
  'REANALYSIS_air_sfc_DailyMean',
  'Nearby_Peak2_MaxTemperature', 'Nearby_Peak2_MinTemperature',
  'Nearby_Peak2Lag1_MaxTemperature', 'Nearby_Peak2Lag1_MeanTemperature',
  'Nearby_Peak2Lag1_MinTemperature',
  'Nearby_Peak2Lag3_MaxTemperature', 'Nearby_Peak2Lag3_MinTemperature'
)

# highly correlated columns that are NOT in the delete list (i.e. kept anyway)
high.corr.colnames.del <- setdiff(high.corr.colnames, to.delete)

# to inspect which variables have high correlations
#corr.mat[high.corr.colnames.del, high.corr.colnames.del]

# all columns to keep
cols.to.keep <- names(sensor_census2)[!(names(sensor_census2) %in% to.delete)]

# final data for imputation; drop = FALSE keeps a data frame even if only a
# single column survives the filter
sensor_census2.high.corr.drop <- sensor_census2[, cols.to.keep, drop = FALSE]

In [81]:
# Save the reduced table (highly correlated columns removed) without row names.
write.csv(
  x = sensor_census2.high.corr.drop,
  file = 'subet_sensor_census_readyToImpute.csv',
  row.names = FALSE
)