# Prepare CNN Data

## Setup

In [11]:
# USE ACCOUNT: ieconnectlagosproject10@gmail.com
import ee
ee.Authenticate()
ee.Initialize()

Enter verification code:  4/1AX4XfWjXRkq7jQigSBDqHeuMc7R8xUh4S4x_jAsvrUUr2vulqWw2bUbVCoU



Successfully saved authorization token.


In [21]:
import numpy as np
#import geetools
#from geetools import ui, cloud_mask
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time

#cloud_mask_landsatSR = cloud_mask.landsatSR()
#cloud_mask_sentinel2 = cloud_mask.sentinel2()

## Parameters

In [22]:
#224
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'
SKIP_IF_SCRAPED = True
CHUNK_SIZE = 10 # Number of observtaions to scrape in GEE at any given time
DROPBOX_DIR = cf.DROPBOX_DIRECTORY
GOOGLEDRIVE_DIR = cf.GOOGLEDRIVE_DIRECTORY

if SATELLITE == 's2':
    KERNEL_SIZE = 500
elif SATELLITE == 'l8':
    KERNEL_SIZE = 167

print(KERNEL_SIZE)

167


In [23]:
# Directory to store tfrecords
out_path = os.path.join(GOOGLEDRIVE_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE,
            'tfrecords')

## Load Data

In [24]:
# Relies on VIIRs Values
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_socioeconomic.csv'))

viirs_2_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_2_5km.csv'))
viirs_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_5km.csv'))

viirs_2_5km_df.viirs_avg_rad = np.log(viirs_2_5km_df.viirs_avg_rad + 1)
viirs_5km_df.viirs_avg_rad = np.log(viirs_5km_df.viirs_avg_rad + 1)

viirs_2_5km_df = viirs_2_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_2_5km"})
viirs_5km_df = viirs_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_5km"})

survey_df = survey_df.merge(viirs_2_5km_df, on = 'uid')
survey_df = survey_df.merge(viirs_5km_df, on = 'uid')

In [25]:
# Sentinel doesn't capture Lakshadweep (island off coast of mainland India)
#survey_df = survey_df[survey_df['GID_2'] != "IND.18.1_1"]

# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

In [26]:
# Remove if issues extracting
#survey_df = survey_df[survey_df['uid'] != "IA201400180079"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180052"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180112"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180081"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180011"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180048"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180058"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180028"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180072"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180047"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180012"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180040"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180055"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180140"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180030"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180104"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180123"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180062"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180080"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180050"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180116"]

In [27]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]

In [28]:
index = np.int(np.floor(len(tf_record_list)/2))
tf_record_list = tf_record_list[1:index]
tf_record_list.reverse()

In [29]:
### Loop through all tfrecords
for tfr_i in tf_record_list:
    
    survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
    year_i = survey_df_yeari['year'].iloc[0]

    ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
    survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

    print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)
    
    proto_examples_all = []
    for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
        time.sleep(5)
        print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))
        
        survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]
      
        proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
        proto_examples_all.extend(proto_examples_i)

    ### Save data as tf record
    out_path_i = os.path.join(out_path, tfr_i)
    print(out_path_i)
    with tf.io.TFRecordWriter(out_path_i) as writer:
        for tf_example in proto_examples_all:
            writer.write(tf_example.SerializeToString())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Putting 200 observations into IA_3_3.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200


W0910 10:17:34.927026 4436827584 http.py:171] Sleeping 0.21 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 503


Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/IA_3_3.tfrecord
Putting 174 observations into IA_3_27.tfrecord
Observation: 0/174
Observation: 10/174
Observation: 20/174
Observation: 30/174
Observation: 40/174
Observation: 50/174
Observation: 60/174
Observation: 70/174
Observation: 80/174
Observation: 90/174
Observation: 100/174
Observation: 110/174
Observation: 120/174
Observation: 130/174
Observation: 140/174
Observation: 150/174
Observation: 160/174
Observation: 170/174
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/IA_3_27.tfrecord
Putting 200 observations into IA_3_26.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200


  return (values2 - values1)/(values2 + values1)


Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/IA_3_26.tfrecord
Putting 200 observations into IA_3_25.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200


KeyboardInterrupt: 

In [None]:
for row_i in range(0,10):
    print(row_i)
    proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki.iloc[[row_i]], SATELLITE, KERNEL_SIZE, year_i)

In [None]:
survey_df_yeari_chunki.iloc[[1]]