# Prepare CNN Data

## Setup

In [9]:
# USE ACCOUNT: robmarty3@gmail.com
import ee
ee.Authenticate()
ee.Initialize()

Enter verification code:  4/1AX4XfWjn821CvfRvzyX7FvTw25Vvn0m0_ge1lNoeBqCwRsYuZuhSSYUng0c



Successfully saved authorization token.


In [10]:
#image = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')\
#    .filterDate("2019-01-01", "2019-02-01")\
#    .map(cloud_mask_landsatSR)\
#    .median()
#
#print(image.getInfo())

In [11]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time

#import geetools
#from geetools import ui, cloud_mask
#cloud_mask_landsatSR = cloud_mask.landsatSR()
#cloud_mask_sentinel2 = cloud_mask.sentinel2()

## Parameters

In [12]:
#224
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'
SKIP_IF_SCRAPED = True
CHUNK_SIZE = 10 # Number of observtaions to scrape in GEE at any given time
DROPBOX_DIR = cf.DROPBOX_DIRECTORY
GOOGLEDRIVE_DIR = cf.GOOGLEDRIVE_DIRECTORY

if SATELLITE == 's2':
    KERNEL_SIZE = 500
elif SATELLITE == 'l8':
    KERNEL_SIZE = 167

print(KERNEL_SIZE)

167


In [13]:
# Directory to store tfrecords
out_path = os.path.join(GOOGLEDRIVE_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE,
            'tfrecords')

## Load Data

In [14]:
# Relies on VIIRs Values
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_socioeconomic.csv'))

viirs_2_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_2_5km.csv'))
viirs_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_5km.csv'))

viirs_2_5km_df.viirs_avg_rad = np.log(viirs_2_5km_df.viirs_avg_rad + 1)
viirs_5km_df.viirs_avg_rad = np.log(viirs_5km_df.viirs_avg_rad + 1)

viirs_2_5km_df = viirs_2_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_2_5km"})
viirs_5km_df = viirs_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_5km"})

survey_df = survey_df.merge(viirs_2_5km_df, on = 'uid')
survey_df = survey_df.merge(viirs_5km_df, on = 'uid')

survey_df.shape

(64249, 44)

In [15]:
# Sentinel doesn't capture Lakshadweep (island off coast of mainland India)
#survey_df = survey_df[survey_df['GID_2'] != "IND.18.1_1"]

# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

In [16]:
# Remove if issues extracting
#survey_df = survey_df[survey_df['uid'] != "IA201400180079"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180052"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180112"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180081"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180011"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180048"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180058"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180028"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180072"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180047"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180012"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180040"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180055"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180140"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180030"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180104"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180123"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180062"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180080"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180050"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180116"]

In [17]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]

In [None]:
### Loop through all tfrecords
for tfr_i in tf_record_list:
    
    # Sometimes we get computational time out errors. If occurs, just skip and go to next.
    # We can then go back and rescrape missed ones.
    try:
    
        survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
        year_i = survey_df_yeari['year'].iloc[0]

        ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
        survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

        print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)

        proto_examples_all = []
        for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
            time.sleep(6)
            print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))

            survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]

            proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
            proto_examples_all.extend(proto_examples_i)

        ### Save data as tf record
        out_path_i = os.path.join(out_path, tfr_i)
        print(out_path_i)
        with tf.io.TFRecordWriter(out_path_i) as writer:
            for tf_example in proto_examples_all:
                writer.write(tf_example.SerializeToString())
                
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Putting 203 observations into BO_5_1.tfrecord
Observation: 0/203
Observation: 10/203
Observation: 20/203
Observation: 30/203


  return (values2 - values1)/(values2 + values1)


Observation: 40/203
Observation: 50/203
Putting 63 observations into BO_5_2.tfrecord
Observation: 0/63
Observation: 10/63
Putting 99 observations into BU_4_1.tfrecord
Observation: 0/99
Observation: 10/99
Observation: 20/99
Observation: 30/99
Observation: 40/99
Observation: 50/99
Observation: 60/99
Observation: 70/99
Observation: 80/99
Observation: 90/99
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BU_4_1.tfrecord
Putting 72 observations into CI_1_1.tfrecord
Observation: 0/72
Observation: 10/72
Observation: 20/72
Observation: 30/72
Observation: 40/72
Observation: 50/72
Observation: 60/72
Observation: 70/72
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CI_1_1.tfrecord
Putting 44 observations into CI_2_1.tfrecord
Observation: 0/44
Observation: 10/44
Observation: 20/44
Observation: 30/44
Observation: 40/44
/Users/robmarty/Googl

  return (values2 - values1)/(values2 + values1)


Putting 200 observations into CO_1_2.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_1_2.tfrecord
Putting 200 observations into CO_1_3.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Putting 200 observations into CO_1_4.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation

W0911 03:44:36.413815 4697423296 http.py:171] Sleeping 0.41 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_2_5.tfrecord
Putting 149 observations into CO_2_6.tfrecord
Observation: 0/149
Observation: 10/149
Observation: 20/149
Observation: 30/149
Observation: 40/149


W0911 03:49:29.080321 4697423296 http.py:171] Sleeping 1.91 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 50/149
Observation: 60/149
Observation: 70/149
Observation: 80/149
Observation: 90/149
Observation: 100/149
Observation: 110/149
Observation: 120/149
Observation: 130/149
Observation: 140/149
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_2_6.tfrecord
Putting 200 observations into CO_3_1.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Putting 200 observations into CO_3_2.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 1

W0911 04:29:02.969713 4697423296 http.py:171] Sleeping 1.52 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Putting 200 observations into CO_4_3.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_4_3.tfrecord
Putting 200 observations into CO_4_4.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 11

W0911 04:54:29.974401 4697423296 http.py:171] Sleeping 1.04 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:54:34.991994 4697423296 http.py:171] Sleeping 1.87 seconds before retry 2 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 90/200


W0911 04:55:20.562443 4697423296 http.py:171] Sleeping 0.08 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:55:20.757349 4697423296 http.py:171] Sleeping 1.99 seconds before retry 2 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 500
W0911 04:55:48.211127 4697423296 http.py:171] Sleeping 0.61 seconds before retry 3 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:55:54.354794 4697423296 http.py:171] Sleeping 14.00 seconds before retry 4 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:56:09.683632 4697423296 http.py:171] Sleeping 5.68 seconds before retry 5 

Putting 86 observations into CO_4_6.tfrecord
Observation: 0/86


W0911 04:57:03.477098 4697423296 http.py:171] Sleeping 0.64 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:57:24.163892 4697423296 http.py:171] Sleeping 0.48 seconds before retry 2 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:57:44.541125 4697423296 http.py:171] Sleeping 4.77 seconds before retry 3 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:58:25.196431 4697423296 http.py:171] Sleeping 14.57 seconds before retry 4 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502
W0911 04:59:02.060926 4697423296 http.py:171] Sleeping 0.02 seconds before retry 1 

Observation: 10/86
Observation: 20/86
Observation: 30/86
Observation: 40/86
Observation: 50/86
Observation: 60/86
Observation: 70/86
Observation: 80/86
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_4_6.tfrecord
Putting 200 observations into CO_5_1.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200


W0911 05:07:02.640877 4697423296 http.py:171] Sleeping 1.71 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_5_1.tfrecord
Putting 200 observations into CO_5_2.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/CO_5_2.tfrecord
Putting 200 observations into CO_5_3.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observ

In [None]:
for row_i in range(0,10):
    print(row_i)
    proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki.iloc[[row_i]], SATELLITE, KERNEL_SIZE, year_i)

In [None]:
survey_df_yeari_chunki.iloc[[1]]