# Estimate CNN Models

In [1]:
#TODO:
#* Lets have forcnn be training and nocnn be validation. No need for separate test set. Hmmmm no., wont have all classes.
#*In prepping, may want to ensure balance within (a) train and (b) validation

# Functions up top, then parameters / for loop below (some stuff doesn't need to be repeated for the for loop)

Adapted from:

https://codelabs.developers.google.com/codelabs/keras-flowers-transfer-learning#0

https://colab.research.google.com/github/GoogleCloudPlatform/training-data-analyst/blob/master/courses/fast-and-lean-data-science/04_Keras_Flowers_transfer_learning_solution.ipynb#scrollTo=M3G-2aUBQJ-H

## Setup

In [2]:
## Satellite and survey params
SURVEY_NAME = 'DHS_nga_policy_experiment'

# Parameters ------------------------------
VERSION = 2

if VERSION == 1:

    SATELLITE         = 's2' 
    OUTCOME_VAR       = "viirs" 
    UNDERSAMPLE_INDIA = True
    
if VERSION == 2:

    SATELLITE         = 'landsat' 
    OUTCOME_VAR       = "ntlharmon" 
    UNDERSAMPLE_INDIA = True

# Objects based on parameters ------------
OUT_NAME_SUFFIX   = SATELLITE + '_' + OUTCOME_VAR + '_underia' + str(UNDERSAMPLE_INDIA)

## CNN params
if SATELLITE == 's2':
    IMAGE_SIZE = [224, 224]
elif SATELLITE == 'landsat':
    IMAGE_SIZE = [224, 224]

if OUTCOME_VAR == 'viirs':
    NUM_GROUPS = 5
elif OUTCOME_VAR == 'ntlharmon':
    NUM_GROUPS = 5

EPOCHS           = 200
BATCH_SIZE       = 16 #16, 32
PATIENCE         = 10

In [3]:
import os, sys, math
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import random
from skimage import exposure
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import config as cf

import tensorflow as tf
print("Tensorflow version " + tf.__version__)
AUTOTUNE = tf.data.AUTOTUNE

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model, Model

Tensorflow version 2.4.0


In [4]:
# Authenticate Google Drive
#from google.colab import drive
#drive.mount('/content/gdrive')

In [5]:
# Authenticate Google Cloud
#from google.colab import auth
#auth.authenticate_user()

In [6]:
#GOOGLEDRIVE_DIRECTORY = os.path.join('/Volumes/GoogleDrive/My Drive/World Bank/IEs/Pakistan Poverty Estimation')
#GOOGLEDRIVE_DIRECTORY = os.path.join('/content/gdrive/My Drive/World Bank/IEs/Pakistan Poverty Estimation')

## Functions

### Utility Functions

In [7]:
# Get actual values function
# https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset

#### NTL Group
def decode_fn_ntl_group(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,

        # Schema
        {"viirs_ntl_group": tf.io.FixedLenFeature([], dtype=tf.int64)}
    )

def extract_ntl_group(TF_FILES):
    actual_values = []
    for batch in tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_ntl_group):
        value = batch['viirs_ntl_group'].numpy()
        actual_values.append(value)

    return actual_values

#### UID
def decode_fn_uid(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,

        # Schema
        {"uid": tf.io.FixedLenFeature([], dtype=tf.string)}
    )

def extract_uid(TF_FILES):
    actual_values = []
    for batch in tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_uid):
        value = batch['uid'].numpy()
        actual_values.append(value)

    return actual_values

In [8]:
def dataset_to_numpy_util(dataset, N, process_image = True):
    dataset = dataset.batch(N)
    
    for images, labels in dataset:
        numpy_images = images.numpy()
        numpy_labels = labels.numpy()

        if process_image:
            p2, p98 = np.percentile(numpy_images, (2,98))
            numpy_images = exposure.rescale_intensity(numpy_images, in_range=(p2, p98)) 
        break;

    return numpy_images, numpy_labels

def display_one_image(image, title, subplot, red=False):
    plt.subplot(subplot)
    plt.axis('off')
    plt.imshow(image)
    plt.title(title, fontsize=16, color='red' if red else 'black')
    return subplot+1

def display_9_images_from_dataset(dataset):
    subplot=331
    plt.figure(figsize=(13,13))
    images, labels = dataset_to_numpy_util(dataset, 9)
    for i, image in enumerate(images):
        title = labels[i] # CLASSES[labels[i]]
        subplot = display_one_image(image, title, subplot)
        if i >= 8:
            break;
              
    #plt.tight_layout()
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

def display_training_curves(training, validation, title, subplot):
    if subplot%10==1: # set up the subplots on the first call
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        #plt.tight_layout()
    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    ax.plot(training)
    ax.plot(validation)
    ax.set_title('model '+ title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])

In [9]:
## To extract uid & ntl_group
def dataset_to_numpy_util_single_val(dataset, N):
    dataset = dataset.batch(N)
    
    for val in dataset:
        val = val.numpy()
        break;

    return val

In [10]:
# https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
def divide_chunks(l, n):
        
    # looping till length l
    for i in range(0, len(l), n): 
        yield l[i:i + n]

### Functions for reading images and labels from TFRecords

In [11]:
def load_dataset(filenames, sat_var, exp_det, train):
    # read from TFRecords. For optimal performance, read from multiple
    # TFRecord files at once and set the option experimental_deterministic = False
    # to allow order-altering optimizations.

    #### Define read_tfrcord
    # Define here. Later map over this function, and not sure how to
    # enter sat_var into the mapping
    def read_tfrecord(example, sat_var = sat_var):
        features = {'viirs_ntl_group': tf.io.FixedLenFeature([], tf.int64),
                    sat_var: tf.io.FixedLenFeature([], tf.string)}
        parsed_features = tf.io.parse_single_example(example, features)

        image = tf.io.decode_png(parsed_features[sat_var], dtype=tf.dtypes.uint16)
        image = image / 10000 # within 0 and 1

        if sat_var != 'b_rgb':
            image = tf.repeat(image, repeats = 3, axis=2)

        # If training sample, augment the data
        if train:
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_flip_up_down(image)
            image = tf.image.random_brightness(image, 0.025)

            if sat_var == 'b_rgb':
                image = tf.image.random_contrast(image, 0.5, 1.5)

        label = tf.one_hot(parsed_features["viirs_ntl_group"], NUM_GROUPS)

        return image, label

    #### load_dataset function
    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = exp_det

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    dataset = dataset.with_options(option_no_order)

    dataset = dataset.map(read_tfrecord, num_parallel_calls=AUTOTUNE)

    return dataset

In [12]:
def read_tfrecord_uid(example):
    features = {'uid': tf.io.FixedLenFeature([], tf.string)}

    parsed_features = tf.io.parse_single_example(example, features)

    return parsed_features['uid']

def load_dataset_uid(filenames, exp_det):
    # read from TFRecords. For optimal performance, read from multiple
    # TFRecord files at once and set the option experimental_deterministic = False
    # to allow order-altering optimizations.

    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = exp_det

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    dataset = dataset.with_options(option_no_order)
    dataset = dataset.map(read_tfrecord_uid, num_parallel_calls=AUTOTUNE)
    return dataset

In [13]:
def read_tfrecord_ntl_group(example):
    features = {'viirs_ntl_group': tf.io.FixedLenFeature([], tf.int64)}
    parsed_features = tf.io.parse_single_example(example, features)

    label = tf.one_hot(parsed_features["viirs_ntl_group"], NUM_GROUPS)

    return label

def load_dataset_ntl_group(filenames, exp_det):
    # read from TFRecords. For optimal performance, read from multiple
    # TFRecord files at once and set the option experimental_deterministic = False
    # to allow order-altering optimizations.

    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = exp_det

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    dataset = dataset.with_options(option_no_order)
    dataset = dataset.map(read_tfrecord_ntl_group, num_parallel_calls=AUTOTUNE)
    return dataset

### Functions to create batched datasets

In [14]:
def get_batched_dataset(filenames, sat_var, exp_det, train=False):
    dataset = load_dataset(filenames, sat_var, exp_det = exp_det, train = train)
    dataset = dataset.cache() # This dataset fits in RAM
    if train:
        # Best practices for Keras:
        # Training dataset: repeat then batch
        # Evaluation dataset: do not repeat
        dataset = dataset.repeat()

    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE) # prefetch next batch while training (autotune prefetch buffer size)
    # should shuffle too but this dataset was well shuffled on disk already
    return dataset
    # source: Dataset performance guide: https://www.tensorflow.org/guide/performance/datasets

def get_batched_dataset_uid(filenames, exp_det, train=False):
    dataset = load_dataset_uid(filenames, exp_det = exp_det)
    dataset = dataset.cache() # This dataset fits in RAM
    if train:
        # Best practices for Keras:
        # Training dataset: repeat then batch
        # Evaluation dataset: do not repeat
        dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE) # prefetch next batch while training (autotune prefetch buffer size)
    # should shuffle too but this dataset was well shuffled on disk already
    return dataset
    # source: Dataset performance guide: https://www.tensorflow.org/guide/performance/datasets

## Load TFRecords and divide into train/test Set

In [15]:
TF_PATH = os.path.join(cf.DROPBOX_DIRECTORY, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets',
                       'cnn_' + OUT_NAME_SUFFIX, 'tfrecords')
GCS_PATTERN = os.path.join(TF_PATH, '*.tfrecord')

#GCS_PATTERN = 'gs://ieconnectpovest/cnn_' + OUT_NAME_SUFFIX + '/tfrecords/*.tfrecord'
all_filenames = tf.io.gfile.glob(GCS_PATTERN)
len(all_filenames)

25

In [16]:
all_filenames

['/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/forcnn_test_NG_5_1_all.tfrecord',
 '/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/nocnn_NG_3_1_all.tfrecord',
 '/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/forcnn_test_NG_2_1_all.tfrecord',
 '/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/FinalData/Individual Datasets/cnn_landsat_ntlharmon_underiaTrue/tfrecords/nocnn_NG_4_1_all.tfrecord',
 '/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/

In [17]:
#forcnn_filenames = [x for x in all_filenames if 'forcnn_' in x]
#split = int(len(forcnn_filenames) * VALIDATION_SPLIT)
#training_filenames = forcnn_filenames[split:]
#validation_filenames = forcnn_filenames[:split]

# forcnn_filenames = [x for x in all_filenames if 'forcnn_' in x] # TODO: Not sure need?

training_filenames = [x for x in all_filenames if 'forcnn_train_' in x]
validation_filenames = [x for x in all_filenames if 'forcnn_test_' in x]

In [18]:
#TOTAL_OBS = len(extract_uid(all_filenames))
#print(TOTAL_OBS)

TOTAL_OBS_VALIDATION = len(extract_uid(validation_filenames))
print(TOTAL_OBS_VALIDATION)

TOTAL_OBS_TRAINING = len(extract_uid(training_filenames))
print(TOTAL_OBS_TRAINING)

2023-06-27 09:27:31.344092: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 09:27:31.395050: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


215
435


In [19]:
validation_steps = TOTAL_OBS_VALIDATION // BATCH_SIZE
steps_per_epoch  = TOTAL_OBS_TRAINING   // BATCH_SIZE

## Example Images

In [20]:
for sat_var in ['b_rgb', 'b_ndvi', 'b_bu']: # 'b_rgb', 'b_ndvi', 'b_bu'
    
    print(sat_var)
    
    # File names -----------------------------------------------------------------
    # Paths for saving model, predictions (on test) and featurs (on training)
    name_suffix = OUT_NAME_SUFFIX + "_" + sat_var

    CNN_MODEL_PATH = os.path.join(cf.DROPBOX_DIRECTORY, 'Data', 'DHS', 'FinalData', "Individual Datasets",
                                    'cnn_models', 
                                    'model_' + name_suffix + '.h5')

    FEATURES_PATH = os.path.join(cf.DROPBOX_DIRECTORY, 'Data', SURVEY_NAME, 'FinalData', "Individual Datasets",
                                    'cnn_features', "split_into_data_subsets",
                                    'features_' + name_suffix)
    
    ## Load model ------------------------------------------------------------------
    model = load_model(CNN_MODEL_PATH)

    ## Load data -------------------------------------------------------------------
    all_dataset_exdtT = get_batched_dataset(all_filenames, sat_var, exp_det = True, train=False)

    ## Grab features ---------------------------------------------------------------
    feature_extractor = Model(inputs=model.inputs,
                    outputs=model.get_layer(name='fc1').output,)

    def extract_features(all_dataset_i_exdtT, all_filenames_i, i, FEATURES_PATH):
        print(i)

        features_i = feature_extractor.predict(all_dataset_i_exdtT)
        features_i_df = pd.DataFrame(features_i).add_prefix('cnn_feat_')
        features_i_df['uid'] = dataset_to_numpy_util_single_val(load_dataset_uid(all_filenames_i, exp_det = True),features_i_df.shape[0])
        features_i_df.to_csv(FEATURES_PATH + '_' + str(i) + '.csv', index=False)


    extract_features(all_dataset_exdtT, all_filenames, 0, FEATURES_PATH)
    
    


b_rgb
0


OSError: Cannot save file into a non-existent directory: '/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS_nga_policy_experiment/FinalData/Individual Datasets/cnn_features/split_into_data_subsets'