### Imports and Setup

In [1]:
%cd /home/ubuntu/eda/data

from osgeo import gdal, gdal_array, osr, ogr
import numpy as np
import rasterio
from rasterio import mask
import pyproj
from affine import Affine
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import geopandas as gpd
from rasterio.features import shapes
import xarray
import matplotlib.pyplot as plt
import rtree
import shapely
import warnings
import time

warnings.filterwarnings('ignore')

/home/ubuntu/eda/data


### Load census block and census tract data

In [2]:
# Census block polygons; column names are lower-cased so later cells can use
# one consistent spelling.
cb_df = gpd.read_file('./shapefiles/nycb2020_22b/nycb2020.shp').rename(columns = str.lower)

# Census tract polygons, normalised the same way.
ct_df = gpd.read_file('./shapefiles/nyct2020_22b/nyct2020.shp').rename(columns = str.lower)

### Create feature generation functions

In [3]:
def gen_lc_features(df, filepath, num_classes = 9, prefix = 'land_cvr_', default_val = 0, rebase = True):
    """Add land-cover class proportions for every geometry in ``df``.

    For each row, the raster at ``filepath`` is masked and cropped to the row's
    geometry, the pixel values are histogrammed into ``num_classes`` unit-wide
    bins starting at 0, and each count is divided by the number of pixels in
    the cropped window (rounded to 6 decimals).

    Parameters
    ----------
    df : GeoDataFrame
        Must have a ``geometry`` column. Modified in place and also returned.
    filepath : str
        Path of the land-cover raster, opened with ``rasterio.open``.
    num_classes : int
        Number of classes; columns ``<prefix>0`` .. ``<prefix>{num_classes-1}``
        are written.
    prefix : str
        Prefix for the generated column names.
    default_val : scalar
        Value the new columns are initialised with; it only survives when
        ``df`` has no rows, because every row is overwritten otherwise.
    rebase : bool
        When true, also write ``<prefix>{j}_rb`` for j >= 1: the class share
        divided by ``1 - <prefix>0`` (the share left after removing class 0).
        When false, no ``_rb`` columns are created.

    Returns
    -------
    GeoDataFrame
        The same ``df`` object with the new columns.
    """
    feat_names = [prefix + str(i) for i in range(num_classes)]
    # Class 0 is the rebasing denominator, so it gets no rebased column.
    # Binding an empty list when rebase is off avoids the NameError the
    # previous implementation hit for rebase=False.
    rb_feat_names = [prefix + str(i) + '_rb' for i in range(1, num_classes)] if rebase else []
    for feat in feat_names + rb_feat_names:
        df[feat] = default_val

    # Integer bin edges 0..num_classes; numpy closes the last bin on the right,
    # so a pixel equal to num_classes is counted in the final class.
    bin_edges = np.arange(num_classes + 1)
    proportions = np.zeros((len(df), num_classes))

    # The context manager guarantees the raster handle is released.
    with rasterio.open(filepath) as lc_data:
        # Iterate positionally so the function works for any index, not only 0..n-1.
        for pos, geom in enumerate(df['geometry']):
            filt_raster, _ = mask.mask(lc_data, [geom], crop = True)
            counts = np.histogram(filt_raster, bins = bin_edges)[0]
            window_pixels = filt_raster.shape[1] * filt_raster.shape[2]
            proportions[pos] = np.round(counts / window_pixels, 6)

    if len(df) == 0:
        return df

    for j, feat in enumerate(feat_names):
        df[feat] = proportions[:, j]

    if rebase:
        remainder = 1 - proportions[:, [0]]
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            rebased = np.round(proportions[:, 1:] / remainder, 6)
        # A window that is entirely class 0 has no defined rebased share.
        rebased[~np.isfinite(rebased)] = np.nan
        for j, feat in enumerate(rb_feat_names):
            df[feat] = rebased[:, j]

    return df

In [4]:
def gen_elev_features(df, filepath, prefix = 'elev_', ct_level = False, default_val = 0):
    """Add elevation summary statistics for every geometry in ``df``.

    For each row, the raster at ``filepath`` is masked and cropped to the row's
    geometry and five statistics of the resulting array are stored: mean
    (rounded to 1 decimal), min, max, first quartile and third quartile.

    Parameters
    ----------
    df : GeoDataFrame
        Must have a ``geometry`` column. Modified in place and also returned.
    filepath : str
        Path of the elevation raster, opened with ``rasterio.open``.
    prefix : str
        Prefix for the generated column names.
    ct_level : bool
        When true, column names get a ``_ct`` suffix.
    default_val : scalar
        Value the new columns are initialised with; it only survives when
        ``df`` has no rows.

    Returns
    -------
    GeoDataFrame
        The same ``df`` object with columns
        ``<prefix>{mean,min,max,q1,q3}<suffix>``.
    """
    suffix = '_ct' if ct_level else ''

    feat_names = [prefix + stat + suffix for stat in ['mean', 'min', 'max', 'q1', 'q3']]
    for feat in feat_names:
        df[feat] = default_val

    stats = np.zeros((len(df), len(feat_names)))

    # The context manager guarantees the raster handle is released.
    with rasterio.open(filepath) as elev_data:
        # Iterate positionally so the function works for any index, not only 0..n-1.
        for pos, geom in enumerate(df['geometry']):
            filt_raster, _ = mask.mask(elev_data, [geom], crop = True)
            # NOTE(review): the statistics are taken over the whole cropped
            # array returned by mask.mask, which presumably includes fill
            # values for pixels outside the geometry - confirm this is intended.
            lowest, q1, q3, highest = np.percentile(filt_raster, [0, 25, 75, 100])
            # Order matches feat_names: mean, min, max, q1, q3.
            stats[pos] = (round(filt_raster.mean(), 1), lowest, highest, q1, q3)

    if len(df):
        # Whole-column assignment avoids writing floats cell by cell into the
        # int-initialised columns.
        for col, feat in enumerate(feat_names):
            df[feat] = stats[:, col]

    return df

In [5]:
def gen_drain_features(df, filepath):
    """Attach catch-basin count and density per census block.

    Reads the layer at ``filepath``, keeps the features that fall within a
    geometry of ``df``, and counts distinct ``unitid`` values per ``bctcb2020``.
    The count is left-merged onto ``df`` as ``catch_basin_count`` (missing for
    blocks with no matching feature) and divided by ``shape_area`` to give
    ``catch_basin_density``. Returns a new frame; ``df`` itself is not modified.
    """
    basins = gpd.read_file(filepath)
    basins.columns = basins.columns.str.lower()

    # Spatial join tags each basin with the block that contains it.
    basins_in_blocks = basins.sjoin(df, how = 'inner', predicate = 'within')[['unitid','bctcb2020']]
    basins_per_block = basins_in_blocks.groupby('bctcb2020')['unitid'].nunique()

    df = df.merge(basins_per_block, how = 'left', on = 'bctcb2020')
    df = df.rename(columns = {'unitid':'catch_basin_count'})
    df['catch_basin_density'] = df['catch_basin_count']/df['shape_area']

    return df

In [6]:
def gen_subway_features(df, filepath):
    """Attach the number of subway entrances per census block.

    Reads the layer at ``filepath``, keeps the features that fall within a
    geometry of ``df``, and counts distinct ``objectid`` values per
    ``bctcb2020``. The count is left-merged onto ``df`` as ``sub_entr_count``
    (missing for blocks with no matching feature). Returns a new frame.
    """
    entrances = gpd.read_file(filepath)
    entrances.columns = entrances.columns.str.lower()

    # Spatial join tags each entrance with the block that contains it.
    entrances_in_blocks = entrances.sjoin(df, how = 'inner', predicate = 'within')[['objectid','bctcb2020']]
    entrances_per_block = entrances_in_blocks.groupby('bctcb2020')['objectid'].nunique()

    df = df.merge(entrances_per_block, how = 'left', on = 'bctcb2020')
    df = df.rename(columns = {'objectid':'sub_entr_count'})

    return df

In [7]:
def gen_ret_wall_features(df, filepath):
    """Attach retaining-wall length and mean vertex elevation per census block.

    The layer at ``filepath`` is intersected with ``df``; for every block id
    (``bctcb2020``) present in the intersection, ``rw_length`` is the sum of
    ``shape_leng_2`` and ``rw_avg_elev`` is the mean of the third coordinate of
    every vertex. Both are left-merged onto ``df``, so blocks without walls get
    missing values. Returns a new frame.
    """
    walls = gpd.read_file(filepath)
    walls.columns = walls.columns.str.lower()

    # NOTE(review): 'shape_leng_2' is presumably the wall layer's own length
    # attribute, suffixed by overlay because df has a same-named column - if
    # so it is the full wall length, not the length clipped to the block. TODO confirm.
    clipped = df.overlay(walls, how = 'intersection', keep_geom_type = False)
    clipped = clipped[['bctcb2020','shape_leng_2','geometry']]

    # Split multipart geometries so each row exposes a single coordinate sequence.
    clipped = clipped.explode(ignore_index = True)

    records = []
    for block_id in clipped.bctcb2020.unique():
        block_walls = clipped[clipped.bctcb2020 == block_id]
        # vertex[2] is the z value; this assumes every geometry carries 3-D
        # coordinates and exposes .coords.
        z_values = [vertex[2]
                    for geom in block_walls.geometry
                    for vertex in list(geom.coords)]
        records.append((block_id, block_walls['shape_leng_2'].sum(), np.mean(z_values)))

    rw_agg = pd.DataFrame(records, columns = ['bctcb2020','rw_length','rw_avg_elev'])

    return df.merge(rw_agg, how = 'left', on = 'bctcb2020')

In [8]:
def gen_hydro_features(df, filepath, hydro_dict):
    """Add one 0/1 indicator column per hydrography feature code.

    The layer at ``filepath`` is intersected with ``df``. For each
    ``feat_code -> column name`` pair in ``hydro_dict``, the named column is set
    to 1 for blocks whose ``bctcb2020`` appears in an intersection with that
    feature code, and 0 otherwise. ``df`` is modified in place and returned.
    """
    hydro = gpd.read_file(filepath)
    hydro.columns = hydro.columns.str.lower()

    intersections = df.overlay(hydro, how = 'intersection', keep_geom_type = False)

    for feat_code, indicator_col in hydro_dict.items():
        has_code = intersections.feat_code == feat_code
        blocks_with_feature = intersections.loc[has_code, 'bctcb2020']
        df[indicator_col] = np.where(df.bctcb2020.isin(blocks_with_feature), 1, 0)

    return df

In [9]:
# Input dataset locations, given relative to the current working directory.
lc_path = './raster_data/NYC_2017_LiDAR_LandCover.img'  # land-cover raster for gen_lc_features
elev_path = './elevation/DEM_LiDAR_1ft_2010_Improved_NYC_int.tif'  # elevation raster for gen_elev_features
dr_path = './DEPCatchbasins/DEPCATCHBASINS.shp'  # catch-basin layer for gen_drain_features
se_path = './doitt_subway_entrances/DOITT_SUBWAY_ENTRANCE_04JAN2017.shp'  # subway-entrance layer for gen_subway_features
rw_path = './retaining_wall/RETAININGWALL.shp'  # retaining-wall layer for gen_ret_wall_features
h_path = './hydro/HYDROGRAPHY.shp'  # hydrography layer for gen_hydro_features

# Maps a value of the hydrography layer's `feat_code` column to the name of the
# 0/1 indicator column that gen_hydro_features creates for it.
hydro_dict = {2600:'lake_res_ind',
              2610:'pond_ind',
              2620:'river_ind',
              2630:'stream_ind',
              2640:'wl_marsh_ind',
              2650:'beach_shore_ind',
              2660:'bay_ocean_ind'}

### Borough-Level Dataset Pipeline (WIP)

In [10]:
def gen_boro_dataset(cb_df, ct_df, name, name_abv):
    """Build the full feature table for one borough and save it as CSV.

    Filters the census blocks and tracts to ``boroname == name``, runs every
    feature generator in turn (land cover, block elevation, tract elevation,
    catch basins, subway entrances, retaining walls, hydrography), printing the
    cumulative elapsed minutes after each stage, then writes
    ``<name_abv>_data.csv`` and returns the resulting frame. Input paths and
    ``hydro_dict`` are read from the notebook's module-level variables.
    """
    start_ = time.time()

    def _report(stage):
        # Elapsed time is cumulative since the start of the call, in minutes.
        print(f'{stage} features complete: {round((time.time()-start_)/60,1)} total time elapsed')

    boro_blocks = cb_df[cb_df.boroname == name].reset_index(drop = True)
    df = gen_lc_features(boro_blocks, lc_path)
    _report('Land cover')

    df = gen_elev_features(df, elev_path)
    _report('CB elevation')

    boro_tracts = ct_df[ct_df.boroname == name].reset_index(drop = True)
    df_c = gen_elev_features(boro_tracts, elev_path, ct_level = True)
    _report('CT elevation')

    # Bring the tract-level elevation stats (and the tract's ntaname) onto each block.
    tract_cols = ['ct2020','ntaname','elev_mean_ct','elev_min_ct','elev_max_ct','elev_q1_ct','elev_q3_ct']
    df = df.merge(df_c[tract_cols], on = 'ct2020')

    df = gen_drain_features(df, dr_path)
    _report('Catch basin')

    df = gen_subway_features(df, se_path)
    _report('Subway')

    df = gen_ret_wall_features(df, rw_path)
    _report('Retaining wall')

    df = gen_hydro_features(df, h_path, hydro_dict)
    _report('Hydrography')

    df.to_csv(f'{name_abv}_data.csv')
    return df

### Generate full dataset

In [24]:
# Build the Manhattan feature table; gen_boro_dataset also writes it to mht_data.csv.
mht = gen_boro_dataset(cb_df, ct_df, 'Manhattan', 'mht')

Land cover features complete: 3.1 total time elapsed
CB elevation features complete: 5.9 total time elapsed
CT elevation features complete: 8.4 total time elapsed
Catch basin features complete: 8.9 total time elapsed
Subway features complete: 8.9 total time elapsed
Retaining wall features complete: 9.0 total time elapsed
Hydrography features complete: 9.0 total time elapsed


In [26]:
# Upload the Manhattan CSV to S3. `--acl public-read` makes the object publicly
# readable - NOTE(review): confirm that public access is intended.
!aws s3 cp mht_data.csv s3://w210-flood-risk/modeling_data/mht_data.csv --acl public-read

upload: ./mht_data.csv to s3://w210-flood-risk/modeling_data/mht_data.csv


In [11]:
# Build the Brooklyn feature table; gen_boro_dataset also writes it to bk_data.csv.
bk = gen_boro_dataset(cb_df, ct_df, 'Brooklyn', 'bk')

Land cover features complete: 7.8 total time elapsed
CB elevation features complete: 13.7 total time elapsed
CT elevation features complete: 19.2 total time elapsed
Catch basin features complete: 19.7 total time elapsed
Subway features complete: 19.8 total time elapsed
Retaining wall features complete: 19.9 total time elapsed
Hydrography features complete: 19.9 total time elapsed


In [16]:
# Upload the Brooklyn CSV to S3. `--acl public-read` makes the object publicly
# readable - NOTE(review): confirm that public access is intended.
!aws s3 cp bk_data.csv s3://w210-flood-risk/modeling_data/bk_data.csv --acl public-read

upload: ./bk_data.csv to s3://w210-flood-risk/modeling_data/bk_data.csv


In [11]:
# Build the Queens feature table; gen_boro_dataset also writes it to qns_data.csv.
qns = gen_boro_dataset(cb_df, ct_df, 'Queens', 'qns')

Land cover features complete: 12.9 total time elapsed
CT elevation features complete: 67.4 total time elapsed
Catch basin features complete: 67.9 total time elapsed
Subway features complete: 68.0 total time elapsed
Retaining wall features complete: 68.1 total time elapsed
Hydrography features complete: 68.2 total time elapsed


In [None]:
# Upload the Queens CSV to S3. `--acl public-read` makes the object publicly
# readable - NOTE(review): confirm that public access is intended.
!aws s3 cp qns_data.csv s3://w210-flood-risk/modeling_data/qns_data.csv --acl public-read

upload: ./qns_data.csv to s3://w210-flood-risk/modeling_data/qns_data.csv


In [None]:
# Build the Bronx feature table; gen_boro_dataset also writes it to bx_data.csv.
bx = gen_boro_dataset(cb_df, ct_df, 'Bronx', 'bx')

Land cover features complete: 5.4 total time elapsed
CB elevation features complete: 8.5 total time elapsed
CT elevation features complete: 11.1 total time elapsed
Catch basin features complete: 11.6 total time elapsed
Subway features complete: 11.6 total time elapsed
Retaining wall features complete: 11.7 total time elapsed
Hydrography features complete: 11.8 total time elapsed


In [None]:
# Upload the Bronx CSV to S3. `--acl public-read` makes the object publicly
# readable - NOTE(review): confirm that public access is intended.
!aws s3 cp bx_data.csv s3://w210-flood-risk/modeling_data/bx_data.csv --acl public-read

upload: ./bx_data.csv to s3://w210-flood-risk/modeling_data/bx_data.csv


In [None]:
# Build the Staten Island feature table; gen_boro_dataset also writes it to si_data.csv.
si = gen_boro_dataset(cb_df, ct_df, 'Staten Island', 'si')

Land cover features complete: 7.4 total time elapsed
CB elevation features complete: 11.8 total time elapsed
CT elevation features complete: 15.4 total time elapsed
Catch basin features complete: 15.9 total time elapsed
Subway features complete: 15.9 total time elapsed
Retaining wall features complete: 15.9 total time elapsed
Hydrography features complete: 16.0 total time elapsed


In [None]:
# Upload the Staten Island CSV to S3. `--acl public-read` makes the object publicly
# readable - NOTE(review): confirm that public access is intended.
!aws s3 cp si_data.csv s3://w210-flood-risk/modeling_data/si_data.csv --acl public-read

upload: ./si_data.csv to s3://w210-flood-risk/modeling_data/si_data.csv


In [None]:
# NOTE(review): disabled all-borough run, kept for reference only. It is out of
# date relative to the functions defined in this notebook: the merge below selects
# 'avg_elev_ct', 'min_elev_ct', ... whereas gen_elev_features now produces
# 'elev_mean_ct', 'elev_min_ct', ..., and gen_hydro_features is called without
# its required hydro_dict argument. Update both before re-enabling.
# %%time
# start_ = time.time()
# df = gen_lc_features(cb_df, lc_path)
# print(f'Land cover features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_elev_features(df, elev_path)
# print(f'CB elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df_c = gen_elev_features(ct_df, elev_path, ct_level = True)
# print(f'CT elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = df.merge(df_c[['ct2020','ntaname','avg_elev_ct','min_elev_ct','max_elev_ct','q1_elev_ct','q3_elev_ct']], on = 'ct2020')
# df = gen_drain_features(df, dr_path)
# print(f'Catch basin features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_subway_features(df, se_path)
# print(f'Subway features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_ret_wall_features(df, rw_path)
# print(f'Retaining wall features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_hydro_features(df, h_path)
# print(f'Hydrography features complete: {round((time.time()-start_)/60,1)} total time elapsed')

# df.to_csv('full_dataset.csv')