# __Predicting NTL using DTL__

CNN Model for predicting nighttime lights using daytime images.

## Parameters

In [3]:
#PARAM_NAME = "Nbands3_nNtlBins3_minNTLbinCount16861"
PARAM_NAME = "Nbands3_nNtlBins3_minNTLbinCount100"
YEAR = 2015

## Setup

In [4]:
from numpy.random import seed

import os, datetime
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import preprocess_input

import logging, os 
import random
import tensorflow as tf

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
s3 = S3FileSystem()
role = get_execution_role()

# Set seeds. Note that using a GPU can still introduce randomness.
# (also not taking into account tensorflow randomness)
seed(42)

In [5]:
bucket = 'worldbank-pakistan-data'

#### CNN directry in s3 bucket with data
CNN_DIR = os.path.join('CNN', PARAM_NAME)
LOCAL_DIR = '/home/ec2-user/SageMaker/'

#### FILES ALREADY CREATED

# JSON file with parameters for CNN
CNN_PARAMS_FILENAME = os.path.join(CNN_DIR, 'CNN_parameters.json')

# Nighttime lights and daytime imagery path (numpy files prepped for CNN)
NTL_FILENAME = os.path.join(CNN_DIR, f'ntl_{str(YEAR)}.npy')
DTL_FILENAME = os.path.join(CNN_DIR, f'dtl_{str(YEAR)}.npy')

#### FILES TO CREATE

# CNN model (h5 file) 
# -- 1. Name
# -- 2. Checkpoint (where to store model locally; cnn model saves best model durig training)
# -- 3. s3 path, where to upload back to s3 bucket
CNN_MODEL_NAME = f'script_CNN_{str(YEAR)}.h5'
CNN_MODEL_CHECKPOINT = os.path.join(LOCAL_DIR, CNN_MODEL_NAME)
CNN_MODEL_S3_PATH = os.path.join(CNN_DIR, CNN_MODEL_NAME)

# CSV file with predicted NTL values from CNN (upload to s3)
PREDICTION_FILENAME = f'cnn_predictions_truth_values_{str(YEAR)}.csv'
#PREDICTION_FILENAME = os.path.join(CNN_DIR, f'cnn_predictions_truth_values_{str(YEAR)}.csv')

## Functions

In [6]:
def normalize(X):
    '''
    Normalizes features.
    '''
    return X.astype('float32') / 255.0

def define_model_imagenet(height, width, channels, num_classes):
    '''
    Defines and compiles CNN model.
    
    Inputs:
        height, width, channels, num_classes (int)
    Returns:
        model (keras.Model object)
    '''

    # https://medium.com/abraia/first-steps-with-transfer-learning-for-custom-image-classification-with-keras-b941601fcad5
    # https://towardsdatascience.com/cnn-transfer-learning-fine-tuning-9f3e7c5806b2

    #### Base model
    input_shape = (height, width, channels)
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    for layer in base_model.layers:
        layer.trainable = False

    #### Model Customization
    # We take the last layer of our the model and add it to our classifier
    last = base_model.layers[-1].output
    x = Flatten()(last)
    x = Dense(100, activation='relu', name='fc1')(x)
    x = Dropout(0.3)(x)
    x = Dense(num_classes, activation='softmax', name='predictions')(x)
    model = Model(base_model.input, x)
    # We compile the model
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    return model

def evaluate_model(model, trainX, trainY, testX, testY, CNN_MODEL_CHECKPOINT):
    '''
    Fits model, evaluates model, saves best model over epochs and cross-validations.
    
    Inputs:
        model (CNN model) keras.Model object
        trainX, trainY (numpy.ndarray) 4D array of DTL features and 2D array of targets for training
        testX, testY (numpy.ndarray) 4D array of DTL features and 2D array of targets for testing
        current_kfold (int) iteration in kfold cross-val, default=None for no cross-val
        display_metrics (bool) Default=False
    Returns:
        None
    # https://towardsdatascience.com/step-by-step-guide-to-using-pretrained-models-in-keras-c9097b647b29
    '''

    # Use early stopping to help with overfitting
    es = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=False)

    # Save best model based on accuracy
    mc = ModelCheckpoint(CNN_MODEL_CHECKPOINT, monitor='val_loss', mode='min', 
                         verbose=True, save_best_only=True)

    # Fit model
    model.fit(trainX, trainY, 
            epochs=100, 
            batch_size=500, 
            validation_data=(testX, testY), 
            callbacks=[es, mc], 
            verbose=False)

    # Show accuracy
    loss, accuracy = model.evaluate(testX, testY, verbose=False)
    print(f'                              Accuracy: {accuracy}')

    #return model
        
def evaluate_with_crossval(model, dataX, dataY, k=2):
    '''
    Performs evaulation with K-fold cross validation.
    
    Inputs:
        model (keras.Model object)
        dataX, dataY (numpy.ndarray) 4D array of DTL features and 2D array of targets 
                                     for training
        k (int)
    Returns:
        None
    '''
    # Define k-fold cross-val
    kfold = KFold(k, shuffle=True, random_state=1)
    # Loop through folds
    count = 1
    for train_idx, test_idx in kfold.split(dataX):
        print(f'{datetime.datetime.now()}    --- Current K-fold: {count} ---')
        # Select subsets for training and testing
        trainX, trainY, testX, testY = dataX[train_idx], dataY[train_idx], \
                                       dataX[test_idx], dataY[test_idx]
        # Pass to evaluate_model function
        evaluate_model(model, trainX, trainY, testX, testY)
        count += 1

def display_eval_metrics(model, testX, testY, n_ntl_bins):
    '''
    Displays evaluation metrics for a given trained model.
    '''
    # Get predictions
    predY = model.predict(testX)
    predY = np.argmax(predY, axis = 1)
    testY_bins = np.argmax(testY, axis = 1)
    # Generate classification report
    classes = ['Radiance Level %01d' %i for i in range(1,n_ntl_bins+1)]
    print(classification_report(testY_bins, predY, target_names=classes))

## Load Parameters

In [7]:
# load json from s3
#s3 = boto3.resource('s3')
content_object = boto3.resource('s3').Object(bucket, CNN_PARAMS_FILENAME)
file_content = content_object.get()['Body'].read().decode('utf-8')
cnn_param_dict = json.loads(file_content)

# grab parameters
N_bands = cnn_param_dict['N_bands']
n_ntl_bins = cnn_param_dict['n_ntl_bins']
image_height = cnn_param_dict['image_height']
image_width = cnn_param_dict['image_width']
bands = cnn_param_dict['bands']
min_ntl_bin_count = cnn_param_dict['bands']

## Load and Prep Data

In [8]:
# Load Data
NTL = np.load(s3.open('{}/{}'.format(bucket, NTL_FILENAME)))
DTL = np.load(s3.open('{}/{}'.format(bucket, DTL_FILENAME)))

# SPLIT DATA INTO TRAINING AND TESTING
trainX, testX, raw_trainY, raw_testY = train_test_split(DTL, NTL, 
                                                        test_size=0.2)

# PREP TRAINING AND TESTING DATA
trainY = to_categorical(raw_trainY)
testY = to_categorical(raw_testY)

# PREP PIXELS IN FEATURES
trainX, testX = normalize(trainX), normalize(testX)

## Run Model

In [9]:
model = define_model_imagenet(image_height, image_width, N_bands, n_ntl_bins)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [10]:
evaluate_model(model, trainX, trainY, testX, testY, CNN_MODEL_CHECKPOINT)



Epoch 00001: val_loss improved from inf to 1.02153, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00002: val_loss did not improve from 1.02153

Epoch 00003: val_loss did not improve from 1.02153

Epoch 00004: val_loss improved from 1.02153 to 0.97990, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00005: val_loss improved from 0.97990 to 0.93317, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00006: val_loss improved from 0.93317 to 0.91217, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00007: val_loss improved from 0.91217 to 0.90201, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00008: val_loss improved from 0.90201 to 0.89584, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00009: val_loss improved from 0.89584 to 0.89406, saving model to /home/ec2-user/SageMaker/script_CNN_2014.h5

Epoch 00010: val_loss improved from 0.89406 to 0.89263, saving model to /home/e

In [11]:
# DISPLAY IN-DEPTH EVALUTAION METRICS
best_model = load_model(CNN_MODEL_CHECKPOINT)
display_eval_metrics(model, testX, testY, n_ntl_bins)

                  precision    recall  f1-score   support

Radiance Level 1       0.65      0.59      0.62        22
Radiance Level 2       0.58      0.69      0.63        16
Radiance Level 3       0.71      0.68      0.70        22

        accuracy                           0.65        60
       macro avg       0.65      0.65      0.65        60
    weighted avg       0.65      0.65      0.65        60



## Save Best Model and Predicted Values to s3

In [12]:
## Save predicted values to s3
# TODO: I save file locally then send to s3; might be a way to send df directly to s3

# Predict Values
predY = best_model.predict(testX) # model.predict(testX)
predY = np.argmax(predY, axis = 1)
testY_bins = np.argmax(testY, axis = 1)

# Make Dataframe
results_df = pd.DataFrame({'predY': predY, 'testY': testY_bins})

# Save locally
results_df.to_csv(os.path.join(LOCAL_DIR, PREDICTION_FILENAME), index=False) 

# Send to s3
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(CNN_DIR, PREDICTION_FILENAME)).upload_file(os.path.join(LOCAL_DIR, PREDICTION_FILENAME))

In [13]:
## Save best model to s3
boto3.Session().resource('s3').Bucket(bucket).Object(CNN_MODEL_S3_PATH).upload_file(CNN_MODEL_CHECKPOINT)