# Prepare Data for CNN

Prepares data for the CNN:
1. Outputs numpy arrays of DTL values and NTL labels.
2. Creates a parameter dictionary (e.g., the number of NTL labels).


Some notes on AWS:
1. Use the conda_python3 environment to install geopandas and rasterio (this takes a while).
2. Large minimum bin sizes take a long time: a single band with a mostly full minimum bin size of 16814 takes about 10-12 hours.
3. A more powerful instance type does not seem to affect runtime.
4. It may be worth babysitting the job while it runs; timeouts or connection issues seem to interrupt the process.

## Setup

In [36]:
### Libraries ###
import os, datetime
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from geopandas import GeoDataFrame
from shapely.geometry import Point

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

import logging, os 

### User Defined Libraries ###
import config as cf
import feature_extraction as fe

### Set Seeds ###
import random

# One seed value shared by every random number source seeded below.
seed_value = 42

# Seed Python's built-in pseudo-random generator and numpy's global one.
random.seed(seed_value)
np.random.seed(seed_value)

# Export the same value as the `PYTHONHASHSEED` environment variable.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this
# assignment presumably only affects child processes - confirm this is enough.
os.environ['PYTHONHASHSEED'] = f'{seed_value}'

### Parameters / Paths ###
# Data locations are taken from the project config module.
DTL_DIRECTORY = cf.DTL_DIRECTORY
VIIRS_GDF_FILEPATH = cf.VIIRS_GDF_FILEPATH
# Name of the final target variable.
FINAL_TARGET_NAME = 'ntl_bins'


## Functions

In [37]:
def pd_to_gdp(df, lat_name = 'latitude', lon_name = 'longitude'):
    '''
    Builds a geopandas point dataframe from a pandas dataframe that
    holds latitude and longitude columns.

    Every row becomes a Point(longitude, latitude). The two coordinate
    columns are dropped from the result and the CRS is set to EPSG:4326.

    Input:  df - pandas dataframe
            lat_name - name of latitude variable in df
            lon_name - name of longitude variable in df
    Output: geopandas dataframe with a point geometry per row
    '''

    # Shapely points take x (longitude) first, then y (latitude).
    points = [Point((lon, lat)) for lon, lat in zip(df[lon_name], df[lat_name])]

    # Keep every column except the raw coordinates.
    attributes = df.drop(columns=[lon_name, lat_name])

    return GeoDataFrame(attributes, crs="EPSG:4326", geometry=points)

def normalize(X):
    '''
    Scales features by 1/255.

    Input:  X - array-like object exposing .astype (eg, numpy array)
    Output: X cast to float32 and divided by 255.0
    '''
    as_float32 = X.astype('float32')
    return as_float32 / 255.0

In [38]:
def pre_cnn_data(gdf,
                 dtl_directory,
                 sat_suffix,
                 year,
                 folder_name,
                 image_height = 48,
                 image_width = 48):

    '''
    Creates numpy arrays for CNN and saves them to disk.

    Calls fe.map_DTL_NTL once per band group (bands 4/3/2 together, then
    bands 5, 6 and 7 individually), prints the resulting shapes, and writes
    the outputs into cf.GD_CNN_DIRECTORY/<folder_name>:
        dep_var.pkl          - dataframe returned by the last fe.map_DTL_NTL call
        dtl_rgb_<year>.npy   - array for bands 4, 3, 2
        dtl_b5_<year>.npy    - array for band 5
        dtl_b6_<year>.npy    - array for band 6
        dtl_b7_<year>.npy    - array for band 7

    Input:  gdf - geopandas dataframe passed to fe.map_DTL_NTL
            dtl_directory - directory passed to fe.map_DTL_NTL
            sat_suffix - satellite suffix passed to fe.map_DTL_NTL (eg, 'l7')
            year - year passed to fe.map_DTL_NTL and used in the file names
            folder_name - subfolder of cf.GD_CNN_DIRECTORY to write into
            image_height - image height passed to fe.map_DTL_NTL
            image_width - image width passed to fe.map_DTL_NTL
    Output: the string "Done!"
    '''
    # (file label, bands) pairs, processed and saved in this order.
    band_groups = (('rgb', ['4', '3', '2']),
                   ('b5', ['5']),
                   ('b6', ['6']),
                   ('b7', ['7']))

    # Create the output folder up front so a long extraction run cannot
    # fail at the very end because the folder is missing.
    output_directory = os.path.join(cf.GD_CNN_DIRECTORY, folder_name)
    os.makedirs(output_directory, exist_ok=True)

    dtl_arrays = []
    processed_gdf = None
    for label, bands in band_groups:
        # Forward the caller's sat_suffix (previously hard-coded to 'l7').
        # processed_gdf is overwritten each time; the last one is saved.
        dtl_array, processed_gdf = fe.map_DTL_NTL(gdf,
                                                  dtl_directory,
                                                  bands = bands,
                                                  img_height = image_height,
                                                  img_width = image_width,
                                                  year = year,
                                                  sat_suffix = sat_suffix)
        dtl_arrays.append((label, dtl_array))

    print(processed_gdf.shape)
    for _, dtl_array in dtl_arrays:
        print(dtl_array.shape)

    processed_gdf.to_pickle(os.path.join(output_directory, 'dep_var.pkl'))

    for label, dtl_array in dtl_arrays:
        np.save(os.path.join(output_directory, f'dtl_{label}_{year}.npy'), dtl_array)

    return "Done!"

## Params

In [39]:
# Folder holding the 2014 Landsat 7 imagery.
DTL_DIRECTORY_l7_2014 = os.path.join(cf.DROPBOX_DIRECTORY, 'Data', 'Landsat', 'l7', '2014')

# Square 48x48 images.
# NOTE(review): the earlier comment said VGG16 needs images rescaled to
# 224x224, yet 48 is used here - confirm any rescaling happens downstream.
image_height = image_width = 48

## Process - VIIRS

In [40]:
# Load the pickled VIIRS random sample.
viirs = pd.read_pickle(
    os.path.join(
        cf.DROPBOX_DIRECTORY,
        'Data', 'VIIRS', 'FinalData', 'random_samples',
        'viirs_random_sample.pkl',
    )
)

In [41]:
# Extract and save the 2014 'l7' arrays for the VIIRS sample into the
# 'VIIRS' output folder.
pre_cnn_data(viirs, DTL_DIRECTORY_l7_2014, 'l7', 2014, 'VIIRS',
             image_height = 48, image_width = 48)

0/5
0/5
0/5
0/5
(5, 12)
(5, 48, 48, 3)
(5, 48, 48, 1)
(5, 48, 48, 1)
(5, 48, 48, 1)


'Done!'

## Process OPM

In [94]:
# Candidate input files: GPS_uid_crosswalk.csv, opm_socioeconomic_geo.csv
opm_df = pd_to_gdp(
    pd.read_csv(os.path.join(cf.SECURE_DATA_DIRECTORY, 'Data', 'OPM', 'FinalData - PII', 'GPS_uid_crosswalk.csv'))
)
# Optional filter, currently disabled (it would have to run before pd_to_gdp):
#opm_df = opm_df[opm_df['latitude'].notnull()]

# Replace each point by the bounding box of its buffer.
# NOTE(review): 0.75/111.12 presumably converts 0.75 km to degrees using
# ~111.12 km per degree - confirm the intended tile size.
opm_df['geometry'] = opm_df.buffer(distance = 0.75/111.12).envelope

In [95]:
# Extract and save the 2014 'l7' arrays for the OPM locations into the
# 'OPM' output folder.
pre_cnn_data(opm_df, DTL_DIRECTORY_l7_2014, 'l7', 2014, 'OPM',
             image_height = 48, image_width = 48)

0/5361
10/5361
20/5361
30/5361
40/5361
50/5361
60/5361
70/5361
80/5361
90/5361
100/5361
110/5361
120/5361
130/5361
140/5361
150/5361
160/5361
170/5361
180/5361
190/5361
200/5361
210/5361
220/5361
230/5361
240/5361
250/5361


KeyboardInterrupt: 

In [93]:
# Preview the first rows of opm_df.
opm_df.head()

Unnamed: 0,uid,geometry
0,104989,"POLYGON ((72.69547276217901 33.8211672066235, ..."
1,100389,"POLYGON ((72.69597276217901 33.82111165106791,..."
2,101236,"POLYGON ((72.69686165106791 33.8208060955124, ..."
3,105557,"POLYGON ((72.69008387329011 33.82055609551241,..."
4,101915,"POLYGON ((72.69433387329011 33.8201949844012, ..."


In [84]:
# Inspect the latitude column of opm_dfa.
# NOTE(review): opm_dfa is not defined in any visible cell - presumably left
# over from an earlier session; confirm before re-running this notebook.
opm_dfa['latitude']

5        33.827861
6        33.827861
7        33.827861
8        33.826611
9        33.826611
           ...    
33789    30.531444
33790    30.531444
33791    30.192111
33792    30.192111
33793    30.192111
Name: latitude, Length: 14042, dtype: float64