# Prepare CNN Data

## Setup

In [11]:
# USE ACCOUNT: robmarty3@gmail.com
import ee
ee.Authenticate()
ee.Initialize()

Enter verification code:  4/1AX4XfWjlFwhBsIV7lyCBTrEGb4Ko9wHfnWacAh6DR7ZmqFYcE5vnMBaaV1M



Successfully saved authorization token.


In [12]:
#image = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')\
#    .filterDate("2019-01-01", "2019-02-01")\
#    .map(cloud_mask_landsatSR)\
#    .median()
#
#print(image.getInfo())

In [13]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert

#import geetools
#from geetools import ui, cloud_mask
#cloud_mask_landsatSR = cloud_mask.landsatSR()
#cloud_mask_sentinel2 = cloud_mask.sentinel2()

## Parameters

In [14]:
#224
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'
SKIP_IF_SCRAPED = True
CHUNK_SIZE = 10 # Number of observtaions to scrape in GEE at any given time
DROPBOX_DIR = cf.DROPBOX_DIRECTORY
GOOGLEDRIVE_DIR = cf.GOOGLEDRIVE_DIRECTORY

if SATELLITE == 's2':
    KERNEL_SIZE = 500
elif SATELLITE == 'l8':
    KERNEL_SIZE = 167

print(KERNEL_SIZE)

167


In [15]:
# Directory to store tfrecords
out_path = os.path.join(GOOGLEDRIVE_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE,
            'tfrecords')

## Load Data

In [16]:
# Relies on VIIRs Values
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_socioeconomic.csv'))

viirs_2_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_2_5km.csv'))
viirs_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_5km.csv'))

viirs_2_5km_df.viirs_avg_rad = np.log(viirs_2_5km_df.viirs_avg_rad + 1)
viirs_5km_df.viirs_avg_rad = np.log(viirs_5km_df.viirs_avg_rad + 1)

viirs_2_5km_df = viirs_2_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_2_5km"})
viirs_5km_df = viirs_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_5km"})

survey_df = survey_df.merge(viirs_2_5km_df, on = 'uid')
survey_df = survey_df.merge(viirs_5km_df, on = 'uid')

survey_df.shape

(64249, 44)

In [17]:
# Sentinel doesn't capture Lakshadweep (island off coast of mainland India)
#survey_df = survey_df[survey_df['GID_2'] != "IND.18.1_1"]

# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

In [18]:
# Remove if issues extracting
#survey_df = survey_df[survey_df['uid'] != "IA201400180079"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180052"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180112"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180081"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180011"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180048"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180058"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180028"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180072"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180047"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180012"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180040"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180055"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180140"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180030"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180104"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180123"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180062"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180080"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180050"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180116"]

In [19]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]

In [None]:
### Loop through all tfrecords
for tfr_i in tf_record_list:
    
    survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
    year_i = survey_df_yeari['year'].iloc[0]

    ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
    survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

    print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)
    
    proto_examples_all = []
    for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
        
        print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))
        
        survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]
      
        proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
        proto_examples_all.extend(proto_examples_i)

    ### Save data as tf record
    out_path_i = os.path.join(out_path, tfr_i)
    print(out_path_i)
    with tf.io.TFRecordWriter(out_path_i) as writer:
        for tf_example in proto_examples_all:
            writer.write(tf_example.SerializeToString())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Putting 122 observations into AL_1_1.tfrecord
Observation: 0/122
Observation: 10/122
Observation: 20/122
Observation: 30/122
Observation: 40/122
Observation: 50/122
Observation: 60/122
Observation: 70/122
Observation: 80/122
Observation: 90/122
Observation: 100/122
Observation: 110/122
Observation: 120/122
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/AL_1_1.tfrecord
Putting 126 observations into AL_2_1.tfrecord
Observation: 0/126
Observation: 10/126
Observation: 20/126
Observation: 30/126
Observation: 40/126
Observation: 50/126
Observation: 60/126
Observation: 70/126
Observation: 80/126
Observation: 90/126


In [None]:
for row_i in range(0,10):
    print(row_i)
    proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki.iloc[[row_i]], SATELLITE, KERNEL_SIZE, year_i)

In [None]:
survey_df_yeari_chunki.iloc[[1]]