# Extract DTL Data to OPM

Extracts a numpy array of daytime imagery (DTL) around each BISP coordinate and saves a pickled pandas dataframe of the UIDs corresponding to each entry of the numpy array. The data are processed so that features can be extracted from the numpy array using the trained CNN model.

TODO: Still need to get this working with SageMaker -- (1) load the raster tiles and (2) resolve the issues installing geopandas and rasterio

__Use the conda_python3 kernel; it works better for installing geopandas and rasterio__

## Setup

In [1]:
# This takes a while
#%conda install geopandas
#%conda install rasterio

In [1]:
import os, math, pickle, datetime, json
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from geopandas import GeoDataFrame
from shapely.geometry import Point

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report)
#from keras.models import load_model

import warnings
import random
#import tensorflow as tf
warnings.filterwarnings('ignore')

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# s3fs filesystem handle; the cells below call s3.open(...) on it to read
# S3 objects as file-like objects.
s3 = S3FileSystem()
# SageMaker execution role. NOTE(review): `role` is not referenced by any
# other cell shown here -- confirm whether it is still needed.
role = get_execution_role()

## User Defined
import config as cf
import feature_extraction as fe

# Name of the S3 bucket that inputs are read from and outputs are uploaded to.
bucket = 'worldbank-pakistan-data'
# Local directory where outputs are written before being uploaded to S3.
LOCAL_DIR = '/home/ec2-user/SageMaker/'
  

## Functions

In [2]:
def pd_to_gdp(df, lat_name = 'latitude', lon_name = 'longitude'):
    '''
    Builds a geopandas point layer from a table of coordinates.

    One shapely Point (x = longitude, y = latitude) is created per row. The
    two coordinate columns are then dropped and every remaining column is
    carried over unchanged.

    Input:  df - pandas dataframe containing the coordinate columns
            lat_name - name of the latitude column in df
            lon_name - name of the longitude column in df
    Output: geopandas dataframe with point geometry and crs "EPSG:4326"
    '''

    # Points take (x, y), so longitude comes first.
    lon_lat_pairs = zip(df[lon_name], df[lat_name])
    points = list(map(Point, lon_lat_pairs))

    # The coordinates now live in the geometry, so drop the raw columns.
    attributes = df.drop(columns=[lon_name, lat_name])

    return GeoDataFrame(attributes, crs="EPSG:4326", geometry=points)

def extract_dtl_opm(bands_type, year):
    '''
    Extracts daytime imagery (DTL) around each BISP coordinate, saves the
    result locally and uploads it to S3.

    Steps: (0) resolve the band list, (1) load the DTL tile grid, (2) load
    the BISP GPS/uid crosswalk, attach the tile each point falls in and turn
    each point into a small square, (3) extract imagery for each square with
    fe.map_DTL_NTL using the image size stored in the CNN parameter file,
    (4) export the imagery array (.npy) and the matching uids (.pkl).

    Input:  bands_type - name of the band combination; only "RGB"
                         (bands 4, 3, 2) is supported
            year - year of the imagery; selects the Landsat directory and
                   is used in the output file names
    Output: None. Writes 'bisp_dtl_bands<bands_type>_<year>.npy' and
            'bisp_dtl_uids_bands<bands_type>_<year>.pkl' to LOCAL_DIR and
            uploads both to 'OPM/FinalData/Individual Datasets' in the bucket.
    Raises: ValueError - if bands_type is not a supported band combination
    '''

    # 0. Load Bands
    # Fail early and clearly instead of hitting an unbound `bands` later on.
    if bands_type == "RGB":
        bands = ['4', '3', '2']
    else:
        raise ValueError('Unsupported bands_type {!r}; expected "RGB"'.format(bands_type))

    # 1. Load Grid for DTL Tiles
    grid_key = os.path.join('Country Grid', 'FinalData', 'pak_grid_200km.geojson')
    with s3.open('{}/{}'.format(bucket, grid_key)) as grid_file:
        dtl_tiles = gpd.read_file(grid_file)
    #dtl_tiles = gpd.read_file(os.path.join(cf.DROPBOX_DIRECTORY, 'Data', 'Country Grid', 'FinalData', 'pak_grid_200km.geojson'))
    dtl_tiles.rename(columns = {'id': 'tile_id'}, inplace=True)

    # 2. Prep BISP data
    # Load, convert to geopandas, extract dtl tile
    crosswalk_key = os.path.join('OPM', 'FinalData - PII', 'GPS_uid_crosswalk.csv')
    with s3.open('{}/{}'.format(bucket, crosswalk_key)) as crosswalk_file:
        bisp_df = pd.read_csv(crosswalk_file, engine = 'python')
    #bisp_df = pd.read_csv(os.path.join(cf.SECURE_DATA_DIRECTORY, 'Data', 'BISP', 'FinalData - PII', 'GPS_uid_crosswalk.csv'), engine = 'python')
    bisp_gdf = pd_to_gdp(bisp_df)
    # NOTE(review): newer geopandas releases name this argument `predicate`
    # instead of `op`; left as `op` to match the environment this was written for.
    bisp_gdf = gpd.sjoin(bisp_gdf, dtl_tiles, how="inner", op='intersects').reset_index(drop=True)
    # Replace each point by the bounding square of a buffer around it.
    # Presumably 0.75 km expressed in degrees (~111.12 km per degree) -- confirm.
    bisp_gdf['geometry'] = bisp_gdf.buffer(distance = 0.75/111.12).envelope

    # 3. Extract DTL to BISP Coordinates

    # Load CNN parameters
    # Use this for parameters; it just affects img_height and img_width
    param_name = "Nbands3_nNtlBins3_minNTLbinCount16861"
    PARAM_PATH_JSON = os.path.join('CNN', param_name, 'CNN_parameters.json')

    # One S3 resource is reused for the parameter download and both uploads.
    s3_resource = boto3.resource('s3')
    # The original referenced an undefined name here (CNN_PARAMS_FILENAME);
    # the key built just above is the one that is meant.
    content_object = s3_resource.Object(bucket, PARAM_PATH_JSON)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    cnn_param_dict = json.loads(file_content)

    #with open(PARAM_PATH_JSON, 'r') as fp:
    #    cnn_param_dict = json.load(fp)

    # Extract
    DTL, bisp_gdf_processed = fe.map_DTL_NTL(input_gdf = bisp_gdf,
                                        directory = os.path.join('Landsat', 'l8', str(year)),
                                        bands = bands,
                                        img_height = cnn_param_dict['image_height'],
                                        img_width = cnn_param_dict['image_width'],
                                        year = year)

    # Only the uids are kept alongside the imagery array.
    bisp_gdf_processed = bisp_gdf_processed[['uid']]

    # 4. Export
    # Build each file name once so the local copy and the S3 key cannot drift apart.
    file_suffix = bands_type + "_" + str(year)
    dtl_filename = 'bisp_dtl_bands' + file_suffix + '.npy'
    uid_filename = 'bisp_dtl_uids_bands' + file_suffix + '.pkl'

    np.save(os.path.join(LOCAL_DIR, dtl_filename), DTL)
    bisp_gdf_processed.to_pickle(os.path.join(LOCAL_DIR, uid_filename))

    output_bucket = s3_resource.Bucket(bucket)
    for filename in (dtl_filename, uid_filename):
        s3_key = os.path.join('OPM', 'FinalData', 'Individual Datasets', filename)
        output_bucket.Object(s3_key).upload_file(os.path.join(LOCAL_DIR, filename))

    #np.save(os.path.join(        cf.DROPBOX_DIRECTORY, 'Data', 'BISP' , 'FinalData', 'Individual Datasets', 'bisp_dtl_bands' + bands_type + "_" + str(year) + '.npy'), DTL)
    #bisp_gdf_processed.to_pickle(os.path.join(cf.DROPBOX_DIRECTORY, 'Data', 'BISP' , 'FinalData', 'Individual Datasets', 'bisp_dtl_uids_bands' + bands_type + "_" + str(year) + '.pkl'))

## Extract Daytime Imagery to OPM

In [3]:
# Run the extraction for the "RGB" band set and 2014 imagery.
# NOTE(review): the NameError recorded below this cell ('dtl_tiles' is not
# defined) looks stale -- extract_dtl_opm assigns dtl_tiles before using it.
extract_dtl_opm("RGB", 2014)

NameError: name 'dtl_tiles' is not defined

In [11]:
# Scratch cell: re-defines the bucket name with the same value as in Setup.
bucket = 'worldbank-pakistan-data'

In [7]:
# Scratch cell: tries opening one Landsat tile (2014, tile 1, band 1) from S3
# with rasterio, via an s3fs file object.
# NOTE(review): despite the name, `bisp_df` holds whatever rasterio.open
# returns here, not the BISP dataframe.
bisp_df = rasterio.open(s3.open('{}/{}'.format(bucket, os.path.join('Landsat', 'l8', '2014', 'l8_2014_tile1_b1.tif'))))

In [13]:
# Scratch cell: checks how the Landsat key prefix is assembled.
# NOTE(review): the output recorded below this cell
# ('Landsat/l8/2014/l8_2014_tile1_b1.tif') does not match this expression;
# it looks like it came from an earlier version of the cell.
os.path.join('Landsat', 'l8')

'Landsat/l8/2014/l8_2014_tile1_b1.tif'

In [11]:
# Scratch cell: sets a module-level `year`; no other top-level cell shown
# here reads it (extract_dtl_opm takes its own `year` argument).
year = 2014