# Extract DTL Data to OPM

Extracts a numpy array of daytime imagery (DTL) around the BISP coordinates and saves a pickled pandas dataframe of the UIDs, in the same order as the entries of the numpy array. The data are processed so that features can be extracted from the numpy array using the trained CNN model.

__Use the conda_python3 kernel; works better for installing geopandas and rasterio__

## Setup

In [1]:
# This takes a while
#%conda install geopandas
#%conda install rasterio

Collecting package metadata (current_repodata.json): done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/noarch::imageio==2.9.0=py_0
  - conda-forge/linux-64::jupyter_server==1.4.1=py36h5fab9bb_0
  - conda-forge/noarch::black==20.8b1=py_1
  - conda-forge/linux-64::bokeh==2.2.3=py36h5fab9bb_0
  - defaults/linux-64::_anaconda_depends==5.1.0=py36_2
  - conda-forge/noarch::pyls-black==0.4.6=pyh9f0ad1d_0
  - conda-forge/noarch::aiobotocore==1.2.1=pyhd8ed1ab_0
  - conda-forge/noarch::pyls-spyder==0.3.2=pyhd8ed1ab_0
  - conda-forge/linux-64::anyio==2.1.0=py36h5fab9bb_0
  - conda-forge/noarch::jupyterlab_server==2.3.0=pyhd8ed1ab_0
  - conda-forge/linux-64::matplotlib-base==3.3.4=py36hd391965_0
  - conda-forge/linux-64::spyder==4.2.0=py36h5fab9bb_0
  - conda-forge/noarch::python-language-server==0.36.2=pyhd8ed1ab_0
  - conda-forge/noarch::seaborn-base==0.11.1=pyhd8ed1ab_1
  -

In [2]:
import os, math, pickle, datetime, json
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from geopandas import GeoDataFrame
from shapely.geometry import Point

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report)
#from keras.models import load_model

import warnings
import random
#import tensorflow as tf
warnings.filterwarnings('ignore')

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# s3fs filesystem handle; the functions below read S3 objects through s3.open()
s3 = S3FileSystem()
# Execution role returned by the SageMaker SDK (not referenced by the code visible in this notebook)
role = get_execution_role()

## User Defined
import config as cf
import feature_extraction as fe

# S3 bucket that the inputs are read from and the exported files are uploaded to
bucket = 'worldbank-pakistan-data'
# Local directory where the outputs are written before being uploaded to S3
LOCAL_DIR = '/home/ec2-user/SageMaker/'
  

## Functions

In [3]:
def pd_to_gdp(df, lat_name='latitude', lon_name='longitude'):
    '''
    Builds a geopandas point layer from a pandas dataframe of coordinates.

    Every row becomes a point at (longitude, latitude). The two coordinate
    columns are removed from the result and the points are stored as the
    geometry, tagged with CRS EPSG:4326.

    Input:  df - pandas dataframe holding the coordinate columns
            lat_name - name of the latitude column in df
            lon_name - name of the longitude column in df
    Output: geopandas dataframe of points
    '''

    # shapely points are (x, y), so longitude goes first
    lon_lat_pairs = zip(df[lon_name], df[lat_name])
    points = list(map(Point, lon_lat_pairs))

    attributes = df.drop(columns=[lon_name, lat_name])

    return GeoDataFrame(attributes, crs="EPSG:4326", geometry=points)

def extract_dtl_opm(bands_type, year, param_name):
    '''
    Function to extract daytime imagery (DTL) to OPM survey data locations.

    Nothing is returned; two files are written to LOCAL_DIR and then uploaded
    to 'OPM/FinalData/Individual Datasets' in the S3 bucket:
        (1) bisp_dtl_bands<bands_type>_<year>.npy - numpy array with daytime imagery values
        (2) bisp_dtl_uids_bands<bands_type>_<year>.pkl - pickled pandas dataframe with uids of
            OPM data, in the same order as the daytime imagery values

    Input: bands_type - Band combination to extract. Right now, just "RGB"
           year - Year of daytime imagery data
           param_name - Name of CNN parameters to use (folder under 'CNN' in the
                        S3 bucket that holds CNN_parameters.json)

    Raises: ValueError - if bands_type is not a supported band combination
    '''
    # S3 keys always use '/' separators, whatever the local OS path separator is
    import posixpath

    # 0. Load Bands
    # Validate before any S3 access so an unsupported value fails immediately
    # instead of leaving `bands` undefined after the expensive loading steps.
    if bands_type == "RGB":
        bands = ['4', '3', '2']
    else:
        raise ValueError("Unsupported bands_type {!r}; only 'RGB' is supported".format(bands_type))

    # 1. Load Grid for DTL Tiles
    grid_key = posixpath.join('Country Grid', 'FinalData', 'pak_grid_200km.geojson')
    dtl_tiles = gpd.read_file(s3.open('{}/{}'.format(bucket, grid_key)))
    dtl_tiles.rename(columns = {'id': 'tile_id'}, inplace=True)

    # 2. Prep BISP data
    # Load, convert to geopandas, extract dtl tile
    crosswalk_key = posixpath.join('OPM', 'FinalData - PII', 'GPS_uid_crosswalk.csv')
    bisp_df = pd.read_csv(s3.open('{}/{}'.format(bucket, crosswalk_key)), engine = 'python')
    bisp_gdf = pd_to_gdp(bisp_df)
    # NOTE(review): newer geopandas releases rename `op` to `predicate`; kept as-is
    # for the geopandas version installed in this kernel.
    bisp_gdf = gpd.sjoin(bisp_gdf, dtl_tiles, how="inner", op='intersects').reset_index(drop=True)
    # Replace each point by the bounding box of a buffer around it.
    # Presumably 0.75 km expressed in degrees (~111.12 km per degree) - confirm.
    bisp_gdf['geometry'] = bisp_gdf.buffer(distance = 0.75/111.12).envelope

    # 3. Extract DTL to BISP Coordinates

    # Load CNN parameters
    # Only used for the image dimensions (image_height and image_width)
    PARAM_PATH_JSON = posixpath.join('CNN', param_name, 'CNN_parameters.json')

    s3_resource = boto3.resource('s3')
    content_object = s3_resource.Object(bucket, PARAM_PATH_JSON)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    cnn_param_dict = json.loads(file_content)

    # Extract
    DTL, bisp_gdf_processed = fe.map_DTL_NTL(input_gdf = bisp_gdf,
                                        directory = os.path.join('Landsat', 'l8', str(year)),
                                        bands = bands,
                                        img_height = cnn_param_dict['image_height'],
                                        img_width = cnn_param_dict['image_width'],
                                        year = year)

    bisp_gdf_processed = bisp_gdf_processed[['uid']]

    # 4. Export
    # Build each file name once so the local copy and the S3 key cannot drift apart
    array_name = 'bisp_dtl_bands' + bands_type + "_" + str(year) + '.npy'
    uids_name = 'bisp_dtl_uids_bands' + bands_type + "_" + str(year) + '.pkl'

    np.save(os.path.join(LOCAL_DIR, array_name), DTL)
    bisp_gdf_processed.to_pickle(os.path.join(LOCAL_DIR, uids_name))

    s3_bucket = s3_resource.Bucket(bucket)
    for file_name in (array_name, uids_name):
        s3_key = posixpath.join('OPM', 'FinalData', 'Individual Datasets', file_name)
        s3_bucket.Object(s3_key).upload_file(os.path.join(LOCAL_DIR, file_name))

## Extract Daytime Imagery to OPM

In [None]:
# Run the RGB extraction for 2014; the image height/width come from this CNN parameter set
extract_dtl_opm("RGB", 2014, "Nbands3_nNtlBins3_minNTLbinCount16861")

0/5361
10/5361
20/5361
30/5361
40/5361
50/5361
60/5361
70/5361
80/5361
90/5361
100/5361
110/5361
120/5361
130/5361
140/5361
150/5361
160/5361
170/5361
180/5361
190/5361
200/5361
210/5361
220/5361
230/5361
240/5361
250/5361
260/5361
270/5361
280/5361
290/5361
300/5361
310/5361
320/5361
330/5361
340/5361
350/5361
360/5361
370/5361
380/5361
390/5361
400/5361
410/5361
420/5361
430/5361
440/5361
450/5361
460/5361
470/5361
480/5361
490/5361
500/5361
510/5361
520/5361
530/5361
540/5361
550/5361
560/5361
570/5361
580/5361
590/5361
600/5361
610/5361
620/5361
630/5361
640/5361
650/5361
660/5361
670/5361
680/5361
690/5361
700/5361
710/5361
720/5361
730/5361
740/5361
750/5361
760/5361
770/5361
780/5361
790/5361
800/5361
810/5361
820/5361
830/5361
840/5361
850/5361
860/5361
870/5361
880/5361
890/5361
900/5361
910/5361
920/5361
930/5361
940/5361
950/5361
960/5361
970/5361
980/5361
990/5361
1000/5361
1010/5361
1020/5361
1030/5361
1040/5361
1050/5361
1060/5361
1070/5361
1080/5361
1090/5361
1100/5361
1

In [None]:
# Variant of extract_dtl_opm that extracts just 1 band at a time
def extract_dtl_opm_1band(bands, year, param_name):
    '''
    Function to extract a single band of daytime imagery (DTL) to OPM survey data locations.

    Nothing is returned; two files are written to LOCAL_DIR and then uploaded
    to 'OPM/FinalData/Individual Datasets' in the S3 bucket:
        (1) bisp_dtl_band<band>_<year>.npy - numpy array with daytime imagery values
        (2) bisp_dtl_uids_band<band>_<year>.pkl - pickled pandas dataframe with uids of
            OPM data, in the same order as the daytime imagery values
    where <band> is the first element of bands.

    Input: bands - Sequence of band ids to extract. All of it is passed to the
                   extraction, but only the first element is used in the output
                   file names, so pass one band at a time
           year - Year of daytime imagery data
           param_name - Name of CNN parameters to use (folder under 'CNN' in the
                        S3 bucket that holds CNN_parameters.json)

    Raises: ValueError - if bands is empty
    '''
    # S3 keys always use '/' separators, whatever the local OS path separator is
    import posixpath

    # 0. Check Bands
    # Validate before any S3 access: the output names need bands[0], and failing
    # on it only after the extraction has run would waste the whole run.
    if len(bands) == 0:
        raise ValueError("bands must contain at least one band")
    band_label = str(bands[0])

    # 1. Load Grid for DTL Tiles
    grid_key = posixpath.join('Country Grid', 'FinalData', 'pak_grid_200km.geojson')
    dtl_tiles = gpd.read_file(s3.open('{}/{}'.format(bucket, grid_key)))
    dtl_tiles.rename(columns = {'id': 'tile_id'}, inplace=True)

    # 2. Prep BISP data
    # Load, convert to geopandas, extract dtl tile
    crosswalk_key = posixpath.join('OPM', 'FinalData - PII', 'GPS_uid_crosswalk.csv')
    bisp_df = pd.read_csv(s3.open('{}/{}'.format(bucket, crosswalk_key)), engine = 'python')
    bisp_gdf = pd_to_gdp(bisp_df)
    # NOTE(review): newer geopandas releases rename `op` to `predicate`; kept as-is
    # for the geopandas version installed in this kernel.
    bisp_gdf = gpd.sjoin(bisp_gdf, dtl_tiles, how="inner", op='intersects').reset_index(drop=True)
    # Replace each point by the bounding box of a buffer around it.
    # Presumably 0.75 km expressed in degrees (~111.12 km per degree) - confirm.
    bisp_gdf['geometry'] = bisp_gdf.buffer(distance = 0.75/111.12).envelope

    # 3. Extract DTL to BISP Coordinates

    # Load CNN parameters
    # Only used for the image dimensions (image_height and image_width)
    PARAM_PATH_JSON = posixpath.join('CNN', param_name, 'CNN_parameters.json')

    s3_resource = boto3.resource('s3')
    content_object = s3_resource.Object(bucket, PARAM_PATH_JSON)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    cnn_param_dict = json.loads(file_content)

    # Extract
    DTL, bisp_gdf_processed = fe.map_DTL_NTL(input_gdf = bisp_gdf,
                                        directory = os.path.join('Landsat', 'l8', str(year)),
                                        bands = bands,
                                        img_height = cnn_param_dict['image_height'],
                                        img_width = cnn_param_dict['image_width'],
                                        year = year)

    bisp_gdf_processed = bisp_gdf_processed[['uid']]

    # 4. Export
    # Build each file name once so the local copy and the S3 key cannot drift apart
    array_name = 'bisp_dtl_band' + band_label + "_" + str(year) + '.npy'
    uids_name = 'bisp_dtl_uids_band' + band_label + "_" + str(year) + '.pkl'

    np.save(os.path.join(LOCAL_DIR, array_name), DTL)
    bisp_gdf_processed.to_pickle(os.path.join(LOCAL_DIR, uids_name))

    s3_bucket = s3_resource.Bucket(bucket)
    for file_name in (array_name, uids_name):
        s3_key = posixpath.join('OPM', 'FinalData', 'Individual Datasets', file_name)
        s3_bucket.Object(s3_key).upload_file(os.path.join(LOCAL_DIR, file_name))


In [None]:
# Single-band run: band 1 for 2014.
# NOTE(review): the band id is an int here, while extract_dtl_opm passes string ids ('4', '3', '2') - confirm which form fe.map_DTL_NTL expects.
# NOTE(review): this parameter name ends in 'minNTLbinCount1861' whereas the RGB run uses '...minNTLbinCount16861' - confirm the folder name exists in S3.
extract_dtl_opm_1band([1], 2014, "Band1_nNtlBins3_minNTLbinCount1861") #not as long as 01_prep_data for_cnn.ipynb