### Imports and Setup

In [1]:
%cd /home/ubuntu/eda/data

from osgeo import gdal, gdal_array, osr, ogr
import numpy as np
import rasterio
from rasterio import mask
import pyproj
from affine import Affine
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import geopandas as gpd
from rasterio.features import shapes
import xarray
import matplotlib.pyplot as plt
import rtree
import shapely
import warnings
import time

warnings.filterwarnings('ignore')

/home/ubuntu/eda/data


### Load census block and census tract data

In [2]:
#census blocks
cb_df = gpd.read_file('./shapefiles/nycb2020_22b/nycb2020.shp')
cb_df.columns = cb_df.columns.str.lower()

#census tracts
ct_df = gpd.read_file('./shapefiles/nyct2020_22b/nyct2020.shp')
ct_df.columns = ct_df.columns.str.lower()

### Create feature generation functions

In [11]:
def gen_lc_features(df, filepath, num_classes = 9, prefix = 'land_cvr_', default_val = 0, ct_level = False):
    
    if ct_level:
        suffix = '_ct'
    else:
        suffix = ''
        
    lc_data = rasterio.open(filepath)
    feat_names = [prefix+str(i)+suffix for i in range(1,num_classes)] 
    # if rebase:
    #     rb_feat_names = [prefix + str(i) + '_rb' for i in range(1,num_classes)]
    for feat in feat_names:
        df[feat] = default_val
    
    for i in range(len(df)):
        filt_raster, _ = mask.mask(lc_data, [df['geometry'][i]], crop = True)
        lc_proportion = np.histogram(filt_raster, bins = [i for i in range(num_classes + 1)])[0]/(filt_raster.shape[1]*filt_raster.shape[2])
        for j in range(1,len(lc_proportion)):
            var_name = prefix + str(j) + suffix
            df.loc[i, var_name] = round(lc_proportion[j]/(1-lc_proportion[0]),6)
    if ct_level:
        return df[['ct2020','ntaname']+feat_names]
    else:
        return df

In [12]:
def gen_elev_features(df, filepath, prefix = 'elev_', ct_level = False, default_val = 0):
    
    if ct_level:
        suffix = '_ct'
    else:
        suffix = ''
    
    elev_data = rasterio.open(filepath)
    feat_names = [prefix + i + suffix for i in ['mean','min','max','q1','q3']] 
    for feat in feat_names:
        df[feat] = default_val
    
    for i in range(len(df)):
        filt_raster, _ = mask.mask(elev_data, [df['geometry'][i]], crop = True)
        df.loc[i, prefix + 'mean' + suffix] = round(filt_raster.mean(),1)
        df.loc[i, prefix + 'min' + suffix], df.loc[i, prefix + 'q1' + suffix], df.loc[i, prefix + 'q3' + suffix], df.loc[i, prefix + 'max' + suffix] = np.percentile(filt_raster, [0,25,75,100])
    if ct_level:
        return df[['ct2020']+feat_names]
    else:
        return df

In [13]:
def gen_drain_features(df, filepath):
    
    drain_data = gpd.read_file(filepath)
    drain_data.columns = drain_data.columns.str.lower()

    drain_data = drain_data.sjoin(df, how = 'inner', predicate = 'within')[['unitid','bctcb2020']]                                  
    df = df.merge(drain_data.groupby('bctcb2020')['unitid'].nunique(), how = 'left', on = 'bctcb2020').rename(columns = {'unitid':'catch_basin_count'})
    df['catch_basin_density'] = df['catch_basin_count']/df['shape_area']
    
    return df

In [14]:
def gen_subway_features(df, filepath):
    
    sub_data = gpd.read_file(filepath)
    sub_data.columns = sub_data.columns.str.lower()

    sub_data = sub_data.sjoin(df, how = 'inner', predicate = 'within')[['objectid','bctcb2020']]                                  
    df = df.merge(sub_data.groupby('bctcb2020')['objectid'].nunique(), how = 'left', on = 'bctcb2020').rename(columns = {'objectid':'sub_entr_count'})
    
    return df

In [15]:
def gen_ret_wall_features(df, filepath):
    
    rw_data = gpd.read_file(filepath)
    rw_data.columns = rw_data.columns.str.lower()
    
    rw_data = df.overlay(rw_data, how = 'intersection', keep_geom_type = False)[['bctcb2020','shape_leng_2','geometry']]
    
    #explode to split any multipart geometries
    rw_data = rw_data.explode(ignore_index = True)
    
    length = []
    avg_rw_elev = []
    cb_list = []
    for cb in rw_data.bctcb2020.unique():
        cb_list.append(cb)
        elev_list = []
        subset = rw_data[rw_data.bctcb2020 == cb]
        length.append(subset['shape_leng_2'].sum())
        for i in range(0,len(subset)):
            for j in list(subset.iloc[i].geometry.coords):
                 elev_list.append(j[2])
        avg_rw_elev.append(np.mean(elev_list))
    
    rw_agg = pd.DataFrame(zip(cb_list, length, avg_rw_elev),columns  = ['bctcb2020','rw_length','rw_avg_elev'])
    df = df.merge(rw_agg, how = 'left', on = 'bctcb2020')
    
    return df

In [16]:
def gen_hydro_features(df, filepath, hydro_dict):
    
    h_data = gpd.read_file(filepath)
    h_data.columns = h_data.columns.str.lower()
    
    h_data = df.overlay(h_data, how = 'intersection', keep_geom_type = False)
    
    for key in hydro_dict.keys():
        df[hydro_dict[key]] = np.where(df.bctcb2020.isin(h_data[h_data.feat_code == key].bctcb2020),1,0)
    
    return df

In [17]:
lc_path = './raster_data/NYC_2017_LiDAR_LandCover.img'
elev_path = './elevation/DEM_LiDAR_1ft_2010_Improved_NYC_int.tif'
dr_path = './DEPCatchbasins/DEPCATCHBASINS.shp'
se_path = './doitt_subway_entrances/DOITT_SUBWAY_ENTRANCE_04JAN2017.shp'
rw_path = './retaining_wall/RETAININGWALL.shp'
h_path = './hydro/HYDROGRAPHY.shp'

hydro_dict = {2600:'lake_res_ind',
              2610:'pond_ind',
              2620:'river_ind',
              2630:'stream_ind',
              2640:'wl_marsh_ind',
              2650:'beach_shore_ind',
              2660:'bay_ocean_ind'}

### Borough-Level Dataset Pipeline (WIP)

In [27]:
def gen_boro_dataset(cb_df, ct_df, name, name_abv):
    start_ = time.time()
    #land cover
    df = gen_lc_features(cb_df[cb_df.boroname == name].reset_index(drop = True), lc_path)
    print(f'CB Land cover features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df_c = gen_lc_features(ct_df[ct_df.boroname == name].reset_index(drop = True), lc_path, ct_level = True)
    print(f'CT Land cover features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df = df.merge(df_c, on = 'ct2020')

    #elevation
    df = gen_elev_features(df, elev_path)
    print(f'CB elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df_c = gen_elev_features(ct_df[ct_df.boroname == name].reset_index(drop = True), elev_path, ct_level = True)
    print(f'CT elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df = df.merge(df_c, on = 'ct2020')
    
    #drainage
    df = gen_drain_features(df, dr_path)
    print(f'Catch basin features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df = gen_subway_features(df, se_path)
    print(f'Subway features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df = gen_ret_wall_features(df, rw_path)
    print(f'Retaining wall features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df = gen_hydro_features(df, h_path, hydro_dict)
    print(f'Hydrography features complete: {round((time.time()-start_)/60,1)} total time elapsed')
    df.to_csv(f'{name_abv}_data.csv')
    
    return df

### Generate full dataset

##### Manhattan

In [28]:
mht = gen_boro_dataset(cb_df, ct_df, 'Manhattan', 'mht')

CB Land cover features complete: 2.9 total time elapsed
CT Land cover features complete: 5.6 total time elapsed
CB elevation features complete: 7.6 total time elapsed
CT elevation features complete: 10.1 total time elapsed
Catch basin features complete: 10.6 total time elapsed
Subway features complete: 10.6 total time elapsed
Retaining wall features complete: 10.7 total time elapsed
Hydrography features complete: 10.7 total time elapsed


In [48]:
mht.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,1000,1,Manhattan,100,10001001000,360610001001000,6627.858318,1204255.0,"POLYGON ((973172.666 194632.348, 973310.630 19...",0.256364,...,,,,0,0,1,0,0,0,1
1,1001,1,Manhattan,100,10001001001,360610001001001,4395.190183,640166.4,"POLYGON ((972081.788 190733.467, 972184.766 19...",0.210609,...,,,,0,0,0,0,0,1,1
2,1000,1,Manhattan,201,10002011000,360610002011000,1569.384823,129276.3,"POLYGON ((988376.731 199328.618, 987837.811 19...",0.105964,...,,,,0,0,0,0,0,0,0
3,1001,1,Manhattan,201,10002011001,360610002011001,1594.262855,139360.4,"POLYGON ((988392.400 199070.298, 988285.301 19...",0.183447,...,,,,0,0,0,0,0,0,0
4,2000,1,Manhattan,201,10002012000,360610002012000,2055.295961,263308.4,"POLYGON ((988422.186 198807.188, 988449.289 19...",0.59333,...,,,,0,0,0,0,0,0,0


In [26]:
!aws s3 cp mht_data.csv s3://w210-flood-risk/modeling_data/mht_data.csv --acl public-read

upload: ./mht_data.csv to s3://w210-flood-risk/modeling_data/mht_data.csv


##### Brooklyn

In [30]:
bk = gen_boro_dataset(cb_df, ct_df, 'Brooklyn', 'bk')

CB Land cover features complete: 12.0 total time elapsed
CT Land cover features complete: 19.0 total time elapsed
CB elevation features complete: 23.7 total time elapsed
CT elevation features complete: 29.2 total time elapsed
Catch basin features complete: 29.7 total time elapsed
Subway features complete: 29.8 total time elapsed
Retaining wall features complete: 29.8 total time elapsed
Hydrography features complete: 29.9 total time elapsed


In [31]:
bk.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,1000,3,Brooklyn,100,30001001000,360470001001000,622.117458,22431.891296,"POLYGON ((985964.206 195274.629, 985891.647 19...",0.287187,...,,,,0,0,0,0,0,0,0
1,1001,3,Brooklyn,100,30001001001,360470001001001,740.8385,34130.450616,"POLYGON ((985807.425 195356.845, 985823.463 19...",0.112001,...,,,,0,0,0,0,0,0,0
2,1002,3,Brooklyn,100,30001001002,360470001001002,1976.512178,164829.78476,"POLYGON ((985891.647 195097.310, 986138.070 19...",0.310331,...,,92.089954,63.186595,0,0,0,0,0,0,0
3,1003,3,Brooklyn,100,30001001003,360470001001003,670.836712,18821.289611,"POLYGON ((985893.518 194679.512, 985851.375 19...",0.477843,...,,106.066077,61.246722,0,0,0,0,0,0,0
4,1004,3,Brooklyn,100,30001001004,360470001001004,888.299161,25306.626004,"POLYGON ((986176.023 194719.434, 986064.004 19...",0.236137,...,,,,0,0,0,0,0,0,0


In [16]:
!aws s3 cp bk_data.csv s3://w210-flood-risk/modeling_data/bk_data.csv --acl public-read

upload: ./bk_data.csv to s3://w210-flood-risk/modeling_data/bk_data.csv


##### Queens

In [40]:
qns = gen_boro_dataset(cb_df, ct_df, 'Queens', 'qns')

CB Land cover features complete: 12.2 total time elapsed
CT Land cover features complete: 23.8 total time elapsed
CB elevation features complete: 32.9 total time elapsed
CT elevation features complete: 40.3 total time elapsed
Catch basin features complete: 40.8 total time elapsed
Subway features complete: 40.8 total time elapsed
Retaining wall features complete: 41.0 total time elapsed
Hydrography features complete: 41.1 total time elapsed


In [41]:
qns.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,1,4,Queens,10701,40107010001,360810107010001,640.161347,2683.992,"POLYGON ((1013232.307 224070.073, 1013229.811 ...",0.0,...,,,,0,0,1,0,0,0,0
1,1000,4,Queens,10701,40107011000,360810107011000,17653.264361,13878740.0,"POLYGON ((1011320.227 223566.869, 1011348.605 ...",0.064292,...,,,,0,1,1,1,0,0,0
2,1001,4,Queens,10701,40107011001,360810107011001,13242.96242,3290979.0,"POLYGON ((1013229.811 224073.127, 1013232.307 ...",0.026659,...,,,,0,1,1,1,0,0,1
3,1,4,Queens,33100,40331000001,360810331000001,1849.372252,11652.46,"MULTIPOLYGON (((1020191.884 223932.946, 101969...",0.0,...,,,,0,0,1,0,0,0,0
4,2,4,Queens,33100,40331000002,360810331000002,1308.194665,4603.115,"MULTIPOLYGON (((1016054.598 223306.644, 101606...",0.0,...,,,,0,0,1,0,0,0,1


In [None]:
!aws s3 cp qns_data.csv s3://w210-flood-risk/modeling_data/qns_data.csv --acl public-read

upload: ./qns_data.csv to s3://w210-flood-risk/modeling_data/qns_data.csv


##### Bronx

In [42]:
bx = gen_boro_dataset(cb_df, ct_df, 'Bronx', 'bx')

CB Land cover features complete: 5.2 total time elapsed
CT Land cover features complete: 9.9 total time elapsed
CB elevation features complete: 12.1 total time elapsed
CT elevation features complete: 14.7 total time elapsed
Catch basin features complete: 15.2 total time elapsed
Subway features complete: 15.2 total time elapsed
Retaining wall features complete: 15.3 total time elapsed
Hydrography features complete: 15.4 total time elapsed


In [49]:
bx.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,2,2,Bronx,100,20001000002,360050001000002,1388.181303,14657.5,"MULTIPOLYGON (((1014488.168 229955.394, 101450...",0.0,...,,,,0,0,1,0,0,0,0
1,1000,2,Bronx,100,20001001000,360050001001000,21618.607957,5331208.0,"POLYGON ((1019454.697 225654.329, 1019451.101 ...",0.026208,...,,,,0,0,1,0,0,0,0
2,1001,2,Bronx,100,20001001001,360050001001001,9735.923094,5393515.0,"POLYGON ((1016289.515 226637.791, 1016293.724 ...",0.022199,...,,,,0,0,0,0,0,0,0
3,1002,2,Bronx,100,20001001002,360050001001002,4940.94766,144810.3,"POLYGON ((1016289.515 226637.791, 1016256.287 ...",0.002486,...,,,,0,0,0,0,0,0,0
4,1003,2,Bronx,100,20001001003,360050001001003,11361.191373,7279544.0,"POLYGON ((1016319.617 228852.196, 1016292.048 ...",0.040751,...,,,,0,0,1,0,0,0,0


In [None]:
!aws s3 cp bx_data.csv s3://w210-flood-risk/modeling_data/bx_data.csv --acl public-read

upload: ./bx_data.csv to s3://w210-flood-risk/modeling_data/bx_data.csv


##### Staten Island

In [50]:
si = gen_boro_dataset(cb_df, ct_df, 'Staten Island', 'si')

CT Land cover features complete: 13.9 total time elapsed
CB elevation features complete: 17.5 total time elapsed
CT elevation features complete: 21.1 total time elapsed
Catch basin features complete: 21.6 total time elapsed
Subway features complete: 21.6 total time elapsed
Retaining wall features complete: 21.6 total time elapsed
Hydrography features complete: 21.7 total time elapsed


In [51]:
si.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,1001,5,Staten Island,300,50003001001,360850003001001,2311.47365,140718.1,"POLYGON ((963024.764 174078.019, 963060.063 17...",0.027094,...,,824.193702,42.790569,0,0,0,0,0,0,0
1,1002,5,Staten Island,300,50003001002,360850003001002,1374.530898,115178.9,"POLYGON ((963117.038 173737.977, 963105.076 17...",0.079675,...,,,,0,0,0,0,0,0,0
2,1003,5,Staten Island,300,50003001003,360850003001003,1655.322812,155368.0,"POLYGON ((963369.724 173227.191, 963340.815 17...",0.25973,...,,,,0,0,0,0,0,0,0
3,1004,5,Staten Island,300,50003001004,360850003001004,2474.116014,355812.7,"POLYGON ((962869.984 173593.040, 963049.210 17...",0.096465,...,,183.156086,107.8789,0,0,0,0,0,0,0
4,2000,5,Staten Island,300,50003002000,360850003002000,10825.871059,1148589.0,"POLYGON ((964407.005 173676.002, 964412.548 17...",0.09024,...,,348.617478,26.64336,0,0,0,0,0,0,1


In [None]:
!aws s3 cp si_data.csv s3://w210-flood-risk/modeling_data/si_data.csv --acl public-read

upload: ./si_data.csv to s3://w210-flood-risk/modeling_data/si_data.csv


In [None]:
# %%time
# start_ = time.time()
# df = gen_lc_features(cb_df, lc_path)
# print(f'Land cover features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_elev_features(df, elev_path)
# print(f'CB elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df_c = gen_elev_features(ct_df, elev_path, ct_level = True)
# print(f'CT elevation features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = df.merge(df_c[['ct2020','ntaname','avg_elev_ct','min_elev_ct','max_elev_ct','q1_elev_ct','q3_elev_ct']], on = 'ct2020')
# df = gen_drain_features(df, dr_path)
# print(f'Catch basin features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_subway_features(df, se_path)
# print(f'Subway features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_ret_wall_features(df, rw_path)
# print(f'Retaining wall features complete: {round((time.time()-start_)/60,1)} total time elapsed')
# df = gen_hydro_features(df, h_path)
# print(f'Hydrography features complete: {round((time.time()-start_)/60,1)} total time elapsed')

# df.to_csv('full_dataset.csv')

In [52]:
ny = pd.concat([mht,bk,qns,bx,si],ignore_index = True)

In [55]:
ny.to_csv('full_ny.csv')

In [54]:
ny.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry,land_cvr_1,...,sub_entr_count,rw_length,rw_avg_elev,lake_res_ind,pond_ind,river_ind,stream_ind,wl_marsh_ind,beach_shore_ind,bay_ocean_ind
0,1000,1,Manhattan,100,10001001000,360610001001000,6627.858318,1204255.0,"POLYGON ((973172.666 194632.348, 973310.630 19...",0.256364,...,,,,0,0,1,0,0,0,1
1,1001,1,Manhattan,100,10001001001,360610001001001,4395.190183,640166.4,"POLYGON ((972081.788 190733.467, 972184.766 19...",0.210609,...,,,,0,0,0,0,0,1,1
2,1000,1,Manhattan,201,10002011000,360610002011000,1569.384823,129276.3,"POLYGON ((988376.731 199328.618, 987837.811 19...",0.105964,...,,,,0,0,0,0,0,0,0
3,1001,1,Manhattan,201,10002011001,360610002011001,1594.262855,139360.4,"POLYGON ((988392.400 199070.298, 988285.301 19...",0.183447,...,,,,0,0,0,0,0,0,0
4,2000,1,Manhattan,201,10002012000,360610002012000,2055.295961,263308.4,"POLYGON ((988422.186 198807.188, 988449.289 19...",0.59333,...,,,,0,0,0,0,0,0,0


In [56]:
!aws s3 cp full_ny.csv s3://w210-flood-risk/modeling_data/full_ny.csv --acl public-read

upload: ./full_ny.csv to s3://w210-flood-risk/modeling_data/full_ny.csv
