# Prepare CNN Data

## Setup

In [None]:
import ee
#ee.Authenticate()
ee.Initialize()

In [None]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time
from datetime import datetime
import glob

DROPBOX_DIR = cf.DROPBOX_DIRECTORY

In [None]:
def decode_fn_uid(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,

        # Schema
        {"uid": tf.io.FixedLenFeature([], dtype=tf.string)}
    )

def extract_uid(TF_FILES):
    actual_values = []
    for batch in tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_uid):
        value = batch['uid'].numpy()
        actual_values.append(value)

    return actual_values

## Parameters

In [None]:
# Datasets -------------------------------------
SURVEY_NAME = 'LAGOS_POINTS'

VERSION = 2
if VERSION == 1:

    SATELLITE         = 's2' 
    OUTCOME_VAR       = "viirs" 
    UNDERSAMPLE_INDIA = True
    
if VERSION == 2:

    SATELLITE         = 'landsat' 
    OUTCOME_VAR       = "ntlharmon" 
    UNDERSAMPLE_INDIA = True

# Processing data ------------------------------
SKIP_IF_SCRAPED = True ## Skip if filename has already been created
CHECK_IF_UID_SCRAPED = False ## Load data already scraped and skip if scraped; add date to filename
IGNORE_ERRORS = False ## Load dataset of errors and remove from ones to scrape

CHUNK_SIZE = 1 # Number of observtaions to scrape in GEE at any given time

# Parameters based on dataset ---------------------
if SATELLITE == 's2':
    KERNEL_SIZE = 224
elif SATELLITE == 'landsat':
    KERNEL_SIZE = 224 #167
elif SATELLITE == 'landsat_7':
    KERNEL_SIZE = 224 #167

print(KERNEL_SIZE)

In [None]:
# Directory to store tfrecords
out_path = os.path.join(DROPBOX_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE + '_' + OUTCOME_VAR + '_underia' + str(UNDERSAMPLE_INDIA),
            'tfrecords')

out_path_errors = os.path.join(DROPBOX_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE + '_' + OUTCOME_VAR + '_underia' + str(UNDERSAMPLE_INDIA))

## Load Data

In [None]:
### Load data
if UNDERSAMPLE_INDIA == True:
    UNDERSAMPLE_INDIA_str = "TRUE"
else:
    UNDERSAMPLE_INDIA_str = "FALSE"
    
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 
                                     'data_for_cnn_' + OUTCOME_VAR + '_iaunder' + UNDERSAMPLE_INDIA_str + '_' + SATELLITE + '.csv'))

### If sentinel, only use most recent
if SATELLITE == 's2':
    survey_df = survey_df[survey_df.most_recent_survey == True]
        
### N Observations      
print(survey_df.shape)
print(survey_df.ntl_group.value_counts())

### Check if UID Already Scraped

(1) Filter surveys to those that have been scraped, and (2) add date/time to filename (so process of checking if file has been scraped doesnt skip it)

In [38]:
if IGNORE_ERRORS:
    error_files = glob.glob(out_path_errors + '/*.csv')
    error_df = pd.concat([pd.read_csv(f) for f in error_files])

    survey_df = survey_df[~survey_df['uid'].isin(error_df['uid'])]

In [39]:
## Check if everything processed, ignoring errors
if False:
    tf_paths = glob.glob(out_path + '/*.tfrecord')
    processed_uids = extract_uid(tf_paths)

    ## List of IDs already processed
    processed_uids = [x.decode('utf-8') for x in processed_uids]

    ## Subset survey to uids not scraped
    survey_df = survey_df[~survey_df['uid'].isin(processed_uids)]

In [40]:
if CHECK_IF_UID_SCRAPED:

    tf_paths = glob.glob(out_path + '/*.tfrecord')
    processed_uids = extract_uid(tf_paths)
    
    ## List of IDs already processed
    processed_uids = [x.decode('utf-8') for x in processed_uids]
    
    ## Subset survey to uids not scraped
    survey_df = survey_df[~survey_df['uid'].isin(processed_uids)]
    
    ## Change name of tfrecords
    txt_to_add = datetime.now().strftime("%Y%m%d%H%M%S")
    
    survey_df['tfrecord_name'] = survey_df.apply(lambda row: row['tfrecord_name'].replace('.tfrecord', "_" + txt_to_add + '.tfrecord'), axis='columns')

### Skip tfrecords already processed

In [41]:
survey_df.head()

Unnamed: 0,uid,GID_2,year,most_recent_survey,ntl_group,longitude,latitude,tfrecord_name,use_for_cnn
0,id95,NGA.25.10_1,2021.0,True,4.0,3.302937,6.686937,forcnn_test_fold1_1_all.tfrecord,yes
1,id79,NGA.25.12_1,2021.0,True,3.0,3.591917,6.661515,forcnn_test_fold1_1_all.tfrecord,yes
2,id83,NGA.25.7_1,2021.0,True,1.0,3.794574,6.655686,forcnn_test_fold1_1_all.tfrecord,yes
3,,,,,,,,forcnn_test_NA_1_all.tfrecord,yes
4,id60,NGA.25.14_1,2021.0,True,4.0,3.386748,6.462572,forcnn_train_fold1_1_all.tfrecord,yes


In [42]:
survey_df.shape

(106, 9)

In [43]:
# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

len(tf_record_list)

tf_record_list

['forcnn_test_NA_1_all.tfrecord',
 'forcnn_test_fold1_1_all.tfrecord',
 'forcnn_train_fold1_1_all.tfrecord',
 'nocnn_fold1_1_all.tfrecord']

In [44]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]
    
print(len(tf_record_list))

4


In [45]:
# https://gist.github.com/erdemarslan/3ec02009f38f8df84c8e4807e7954af3
if False:
    import urllib3

    def check_internet_conn():
        http = urllib3.PoolManager(timeout=3.0)
        r = http.request('GET', 'google.com', preload_content=False)
        code = r.status
        r.release_conn()
        if code == 200:
            return True
        else:
            return False

## Query Data

In [46]:
## Blank error dataframe
errors_df = pd.DataFrame()

## Error file name
now = datetime.now()
current_time = now.strftime("%d_%m_%y_%H_%M_%S")
error_file_name = 'errors_' + current_time + '.csv'

if True:
    ### Loop through all tfrecords
    for tfr_i in tf_record_list:

        # Sometimes we get computational time out errors. If occurs, just skip and go to next.
        # We can then go back and rescrape missed ones.

        survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
        year_i = survey_df_yeari['year'].iloc[0]

        ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
        survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

        print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)

        proto_examples_all = []
        for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
            

            try:

                time.sleep(3)
                print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))

                survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]

                proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
                proto_examples_all.extend(proto_examples_i)

            except:

                print("Error ---")
                print(survey_df_yeari_chunki['uid'])

                errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)
                errors_df.to_csv(os.path.join(out_path_errors, error_file_name))

                time.sleep(15)
                pass

        ### Save data as tf record
        out_path_i = os.path.join(out_path, tfr_i)
        print(out_path_i)
        with tf.io.TFRecordWriter(out_path_i) as writer:
            for tf_example in proto_examples_all:
                writer.write(tf_example.SerializeToString())

        print("Success \o/")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Putting 1 observations into forcnn_test_NA_1_all.tfrecord
Observation: 0/1
Error ---
3    NaN
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/LAGOS_POINTS/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/forcnn_test_NA_1_all.tfrecord
Success \o/
Putting 3 observations into forcnn_test_fold1_1_all.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/3
Error ---
0    id95
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
1    id79
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
2    id83
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/LAGOS_POINTS/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/forcnn_test_fold1_1_all.tfrecord
Success \o/
Putting 8 observations into forcnn_train_fold1_1_all.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/8
Error ---
4    id60
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Error ---
4    id60
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
6    id2
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
7    id14
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
8    id82
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
9    id80
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
10    id84
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/8
Error ---
11    id77
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/LAGOS_POINTS/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/forcnn_train_fold1_1_all.tfrecord
Success \o/
Putting 94 observations into nocnn_fold1_1_all.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/94
Error ---
12    id26
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/94
Error ---
13    id15
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/94
Error ---
14    id99
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/94
Error ---
15    id71
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/94
Error ---
16    id17
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/94
Error ---
17    id35
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


KeyboardInterrupt: 