# Prepare Data for CNN

Prepares the input data for the CNN:
1. Outputs numpy arrays of DTL (daytime imagery) values and NTL (nighttime lights) labels
2. Creates a parameter dictionary (e.g., number of NTL labels)


## Setup

In [1]:
### Libraries ###
import os, datetime
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

import logging, os 

### User Defined Libraries ###
import config as cf
import feature_extraction as fe

### Set Seeds ###
# Fix the seeds of the random number generators used by this notebook.
seed_value = 42
# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here presumably only affects child processes — confirm this is intended.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set the `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

### Parameters / Paths ###
# Name of the column that holds the binned NTL class label
# (written by transform_target, read by prep_cnn_data).
FINAL_TARGET_NAME = 'ntl_bins'
#VIIRS_GDF_FILEPATH = cf.VIIRS_GDF_FILEPATH
#DTL_DIRECTORY = cf.DTL_DIRECTORY

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# File-system style handle used to open objects stored on S3.
s3 = S3FileSystem()
# NOTE(review): `role` is not referenced anywhere else in the visible cells.
role = get_execution_role()

# S3 bucket that inputs are read from and outputs are uploaded to.
bucket = 'worldbank-pakistan-data'
# Local directory where files are written before being uploaded.
LOCAL_DIR = '/home/ec2-user/SageMaker/'

## Functions

In [None]:
def transform_target(gdf, orig_target_name, n_bins):
    '''
    Adds a log-transformed NTL column and a binned class-label column to gdf.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    gdf : DataFrame / GeoDataFrame holding the raw target column.
    orig_target_name : name of the raw target column.
    n_bins : number of classes the log-transformed target is binned into.

    Columns written
    ---------------
    log_<orig_target_name> : log(x + 1) of the raw target.
    FINAL_TARGET_NAME : ordinal bin index assigned by k-means binning
        (KBinsDiscretizer with strategy='kmeans').
    '''
    # Perform log(x+1) so that a raw value of zero stays inside the log's domain
    transformed_target_name = f'log_{orig_target_name}'
    gdf[transformed_target_name] = np.log(gdf[orig_target_name] + 1)
    # Bin target: the discretizer expects a 2-D (n_samples, 1) array
    target = gdf[transformed_target_name].to_numpy().reshape(-1, 1)
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')
    # fit_transform returns shape (n_samples, 1); flatten it so the column
    # assignment receives an unambiguous 1-D array.
    gdf[FINAL_TARGET_NAME] = discretizer.fit_transform(target).ravel()

def sample_by_target(input_gdf, target_col_name, n):
    '''
    Create a sample dataframe containing n observations from each target bin.

    Parameters
    ----------
    input_gdf : frame to sample from.
    target_col_name : column whose distinct values define the bins.
    n : number of rows drawn, without replacement, from every bin.

    Returns
    -------
    Frame with the sampled rows of every bin stacked in the order the bins
    first appear in input_gdf; original index labels are kept. An empty
    GeoDataFrame is returned when input_gdf contains no bins.

    Raises
    ------
    ValueError : raised by DataFrame.sample when a bin holds fewer than n rows.
    '''
    # A fixed random_state keeps the drawn rows identical across runs.
    samples = [
        input_gdf[input_gdf[target_col_name] == bin_value].sample(n=n, random_state=1)
        for bin_value in input_gdf[target_col_name].unique()
    ]

    if not samples:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return gpd.GeoDataFrame()

    # DataFrame.append was removed in pandas 2.0; one concat also avoids
    # re-copying the accumulated frame for every bin.
    return pd.concat(samples)

def normalize(X):
    '''
    Scales features by casting them to float32 and dividing by 255.

    X itself is left untouched; the scaled copy is returned.
    '''
    as_float = X.astype('float32')
    return as_float / 255.0

def prep_cnn_data(bands, n_ntl_bins, min_ntl_bin_count, year):
    '''
    Builds the CNN input arrays for one year and stores them locally and on S3.

    Steps: save the parameter dictionary, load the VIIRS polygons, bin NTL
    into classes, draw a per-class sample, match the DTL imagery to the
    sampled polygons and write the resulting arrays.

    Parameters
    ----------
    bands : list of band identifiers; passed through to fe.map_DTL_NTL.
    n_ntl_bins : number of NTL classes.
    min_ntl_bin_count : number of observations sampled from each NTL class.
    year : selects the 'median_rad_<year>' column and the Landsat directory.

    Returns
    -------
    None. The outputs are the files written under LOCAL_DIR and uploaded to
    'CNN/<params_str>/' in the bucket: CNN_parameters.json, ntl_<year>.npy,
    ntl_continuous_<year>.npy and dtl_<year>.npy.
    '''

    # PARAMETERS -------------------------------------------------------------

    ## Define Parameters
    # Daytime image parameters
    # NOTE(review): the previous comment said "VGG16 needs images to be rescale
    # to 224x224" while the value used is 48 — confirm the intended size.
    image_height = 48
    image_width = 48
    N_bands = len(bands)

    ## Save parameters for later use
    cnn_param_dict = {'image_height': image_height,
                      'image_width': image_width,
                      'bands': bands,
                      'N_bands': N_bands,
                      'n_ntl_bins': n_ntl_bins,
                      'min_ntl_bin_count': min_ntl_bin_count}

    # S3 "directory" that identifies this parameter combination
    params_str = 'Nbands' + str(N_bands) + "_nNtlBins" + str(n_ntl_bins) + "_minNTLbinCount" + str(min_ntl_bin_count)

    # One bucket handle is reused for every upload below
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

    # Save Locally
    params_path = os.path.join(LOCAL_DIR, 'CNN_parameters.json')
    with open(params_path, 'w') as fp:
        json.dump(cnn_param_dict, fp)

    # Send to s3
    s3_bucket.Object(os.path.join('CNN', params_str, 'CNN_parameters.json')).upload_file(params_path)

    # Run --------------------------------------------------------------------

    ## LOAD DATA
    # NOTE(security): unpickling can execute arbitrary code; only read this
    # object from a bucket whose contents are trusted.
    viirs_path = '{}/{}'.format(bucket, os.path.join('VIIRS', 'FinalData', 'viirs_annual_polygon.pkl'))
    with s3.open(viirs_path, 'rb') as viirs_file:
        viirs = pd.read_pickle(viirs_file)
    viirs_gdf = gpd.GeoDataFrame(viirs, geometry='geometry')
    # Drop rows without a tile_id. The .copy() gives transform_target an
    # independent frame to add columns to rather than a slice of viirs_gdf.
    viirs_gdf = viirs_gdf[~np.isnan(viirs_gdf['tile_id'])].copy()

    ## PREP NTL
    ntl_col = 'median_rad_' + str(year)
    transform_target(viirs_gdf, ntl_col, n_ntl_bins)

    ## Total pixels in each category
    print(viirs_gdf[FINAL_TARGET_NAME].value_counts())

    ## Create Sample
    # Subsets VIIRS dataframe to min_ntl_bin_count rows per class.
    # NOTE: sampling raises ValueError if any class has fewer rows than that.
    gdf = sample_by_target(viirs_gdf, FINAL_TARGET_NAME, min_ntl_bin_count)

    ## Path to DTL Files
    DTL_DIRECTORY_DATA = os.path.join('Landsat', 'l8', str(year))

    ## Match DTL TO NTL
    DTL, processed_gdf = fe.map_DTL_NTL(gdf, DTL_DIRECTORY_DATA, bands, image_height, image_width, year)
    NTL = processed_gdf[FINAL_TARGET_NAME].to_numpy()
    NTL_continuous = processed_gdf[ntl_col].to_numpy()

    # File name -> array, in the order they are saved and uploaded
    outputs = {
        f'ntl_{year}.npy': NTL,
        f'ntl_continuous_{year}.npy': NTL_continuous,
        f'dtl_{year}.npy': DTL,
    }

    ## Save Locally
    print("Saving")
    for filename, array in outputs.items():
        np.save(os.path.join(LOCAL_DIR, filename), array)

    ## Send to s3
    # Uses params_str; the earlier code referenced an undefined `param_str` here.
    print("Sending to s3")
    for filename in outputs:
        s3_bucket.Object(os.path.join('CNN', params_str, filename)).upload_file(os.path.join(LOCAL_DIR, filename))



## Implement Function

In [None]:
# Run the preparation for 2014 using bands 4/3/2, 3 NTL classes and
# 50 sampled observations per class.
prep_cnn_data(
    bands=['4', '3', '2'],
    n_ntl_bins=3,
    min_ntl_bin_count=50,
    year=2014,
)


In [8]:
# Rebuild the VIIRS GeoDataFrame at notebook level.
# NOTE(review): in the visible cells `viirs` is only assigned inside
# prep_cnn_data; presumably it was loaded in an earlier interactive
# session — confirm before re-running this cell from a fresh kernel.
viirs_gdf = gpd.GeoDataFrame(viirs, geometry='geometry')
# Keep only the rows whose tile_id is not NaN.
viirs_gdf = viirs_gdf[ ~ np.isnan(viirs_gdf['tile_id'])]

Unnamed: 0,median_rad_2012,median_rad_2013,median_rad_2014,median_rad_2015,median_rad_2016,median_rad_2017,median_rad_2018,id,tile_id,geometry
0,0.121902,0.217595,0.179258,0.213304,0.148595,0.352908,0.34,1,42.0,"POLYGON ((74.66347 37.06224, 74.67021 37.06224..."
1,0.20411,0.223363,0.207353,0.283295,0.15623,0.375,0.36,2,42.0,"POLYGON ((74.67021 37.06224, 74.67695 37.06224..."
2,0.180872,0.232932,0.176871,0.262644,0.129477,0.374304,0.33,3,42.0,"POLYGON ((74.67695 37.06224, 74.68369 37.06224..."
3,0.112223,0.195961,0.218169,0.306576,0.129625,0.340344,0.32,4,42.0,"POLYGON ((74.68369 37.06224, 74.69042 37.06224..."
4,0.129297,0.234316,0.215273,0.288344,0.148281,0.385,0.34,5,42.0,"POLYGON ((74.69042 37.06224, 74.69716 37.06224..."
