# Get the greenspaces from San Francisco and set the flags

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
from glob2 import glob
import osmnx as ox

In [2]:
# Geocode California as a one-row GeoDataFrame. Its bbox_* columns are used further down
# to drop observations whose coordinates fall outside the state's bounding box.
ca=ox.geocode_to_gdf('California')

In [3]:
# Display the geocoded row, including the bbox_north/south/east/west columns used for filtering.
ca

Unnamed: 0,geometry,bbox_north,bbox_south,bbox_east,bbox_west,place_id,osm_type,osm_id,lat,lon,display_name,class,type,importance
0,"MULTIPOLYGON (((-124.48200 40.44032, -124.4813...",42.009499,32.529524,-114.130782,-124.482003,259044285,relation,165475,36.701463,-118.755997,"California, United States",boundary,administrative,0.922136


## Load observations (API)

In [4]:
data_folder = '../data/observations_final'

# Sort the matched paths: glob returns files in filesystem order, which would make the
# row order of the combined frame (and of the exported CSV) vary between machines.
observation_files = sorted(glob(data_folder + '/CNC_San_Francisco_*.csv'))

# One frame per challenge file, each row tagged with the file it came from.
# NOTE(review): `challenge` keeps the whole path minus '.csv' (directory and OS-specific
# separator included), exactly as before. Switch to the bare file name if no downstream
# code depends on the current value.
sf_df = pd.concat(
    [pd.read_csv(path).assign(challenge=path.replace('.csv', '')) for path in observation_files]
)

In [5]:
# Rows x columns of the combined observations, before any filtering.
sf_df.shape

(139040, 39)

In [6]:
# Sanity check: latitude range (min, max) of the raw observations.
latitudes = sf_df['latitude']
latitudes.min(), latitudes.max()

(36.998305700500005, 48.458352000000005)

In [7]:
# Sanity check: longitude range (min, max) of the raw observations.
# A maximum far to the east of the US west coast means some points are clearly wrong.
longitudes = sf_df['longitude']
longitudes.min(), longitudes.max()

(-123.59913058040001, -35.15625)

## Keep only the points inside California's bounding box

In [8]:
# Keep only observations whose coordinates fall inside California's bounding box
# (bounds inclusive). This is a rectangular filter on the geocoded bbox_* columns,
# not a test against the state outline itself.
ca_bounds = ca.iloc[0]
lat_inside = sf_df['latitude'].between(ca_bounds['bbox_south'], ca_bounds['bbox_north'])
lon_inside = sf_df['longitude'].between(ca_bounds['bbox_west'], ca_bounds['bbox_east'])
sf_df = sf_df[lat_inside & lon_inside]

In [9]:
# Rows x columns after the bounding-box filter; compare with the earlier shape to see how many rows were dropped.
sf_df.shape

(139038, 39)

In [10]:
# Preview the filtered observations.
sf_df.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,iconic_taxon_name,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge
0,5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,...,Insecta,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017
1,5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,...,Plantae,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017
2,5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,...,Plantae,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017
3,5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,...,Insecta,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017
4,5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,...,Insecta,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017


## Get the greenspaces using OSMnx and the points we defined before

In [11]:
# Query OSM for greenspaces within the extent spanned by the remaining observations.
north, south = sf_df['latitude'].max(), sf_df['latitude'].min()
east, west = sf_df['longitude'].max(), sf_df['longitude'].min()

# NOTE(review): per the OSMnx docs a multi-key tags dict returns the union of matches
# (parks OR forests) — confirm against the installed version.
greenspace_tags = {'leisure':'park', 'landuse':'forest'}

green = ox.geometries.geometries_from_bbox(north, south, east, west, greenspace_tags)

In [12]:
# Rows x columns of the features returned by the OSM query.
green.shape

(5513, 251)

In [13]:
# Preview the first five features.
green.head(5)

Unnamed: 0,unique_id,osmid,element_type,natural,geometry,nodes,attribution,csp:globalid,csp:unitcode,leisure,...,name:vi,addr:full,source:address,source:name,park,abandoned:landuse,addr:housename,source:geometry,path,key
0,way/40180966,40180966,way,,"POLYGON ((-123.27734 38.53263, -123.27735 38.5...","[484049095, 484049109, 484049122, 484049138, 4...",CASIL CSP_Opbdys072008,{C334A2B5-47D3-4FCC-A296-BEDD15BFEC9B},207.0,park,...,,,,,,,,,,
1,way/40181141,40181141,way,,"POLYGON ((-123.26371 38.52568, -123.26437 38.5...","[484078428, 484078426, 484078423, 484078461, 4...",CASIL CSP_Opbdys072008,{C334A2B5-47D3-4FCC-A296-BEDD15BFEC9B},207.0,park,...,,,,,,,,,,
2,way/64295487,64295487,way,,"POLYGON ((-123.32040 38.56862, -123.32039 38.5...","[791632646, 791644817, 791637347, 791649272, 7...","USDA-Forest Service, Pacific Southwest Region",,,,...,,,,,,,,,,
3,way/64295496,64295496,way,,"POLYGON ((-123.32971 38.57325, -123.32853 38.5...","[791652840, 791636321, 791657111, 791640229, 7...","USDA-Forest Service, Pacific Southwest Region",,,,...,,,,,,,,,,
4,way/64295506,64295506,way,,"POLYGON ((-123.33315 38.57426, -123.33296 38.5...","[791660111, 791652683, 791636224, 791653565, 7...","USDA-Forest Service, Pacific Southwest Region",,,,...,,,,,,,,,,


In [14]:
# Keep only areal geometries.
# An explicit allow-list is used instead of dropping Point and LineString one by one,
# so other non-areal types (e.g. MultiLineString, MultiPoint) cannot slip through
# to the union and spatial join that follow.
polygon_types = ['Polygon', 'MultiPolygon']
green = green[green['geometry'].geom_type.isin(polygon_types)]

In [15]:
# Rows x columns remaining after the geometry-type filter.
green.shape

(4879, 251)

## Exclude the overlapping geometries 

In [16]:
# Merge all greenspace geometries into a single geometry so that overlapping
# features are not represented more than once.
geom = green.geometry.unary_union

In [17]:
# Wrap the merged geometry in a one-row GeoDataFrame and declare its CRS as EPSG:4326.
green_unique = gpd.GeoDataFrame(geometry=[geom],crs='epsg:4326')

In [18]:
# Preview: a single row holding the merged geometry.
green_unique.head()

Unnamed: 0,geometry
0,"MULTIPOLYGON (((-121.85857 37.01905, -121.8586..."


In [19]:
# Confirm the CRS assigned to the merged layer.
green_unique.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [20]:
# Split the multi-part geometry into one row per part and renumber the rows from 0.
green_unique = green_unique.explode().reset_index(drop=True)

In [21]:
# Preview the individual geometries after exploding.
green_unique.head()

Unnamed: 0,geometry
0,"POLYGON ((-121.85857 37.01905, -121.85863 37.0..."
1,"POLYGON ((-121.77653 37.02668, -121.77658 37.0..."
2,"POLYGON ((-122.24431 37.09292, -122.24920 37.0..."
3,"POLYGON ((-122.19112 37.12759, -122.19956 37.1..."
4,"POLYGON ((-121.55997 37.00585, -121.55924 37.0..."


In [22]:
# Check that the CRS is still set after explode/reset_index.
green_unique.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [23]:
# Number of rows (one geometry part per row) after exploding.
green_unique.shape

(4631, 1)

## Create gdf using the observations

In [24]:
# Check for missing coordinates before building point geometries (longitude count, latitude count).
sf_df['longitude'].isna().sum(), sf_df['latitude'].isna().sum()

(0, 0)

In [25]:
# Build one shapely Point per observation.
# Points take (x, y), so longitude comes first and latitude second.
geometry_invert = [
    Point(lon, lat)
    for lon, lat in zip(sf_df['longitude'], sf_df['latitude'])
]

In [26]:
# Create a GeoDataFrame with the observations, using the points built above as geometry
# and declaring the CRS as EPSG:4326.
# NOTE(review): the preview of new_sf further down shows a `geometry` column even though
# new_sf is derived from sf_df, which suggests this constructor call also added the column
# to sf_df itself. That side effect may depend on the pandas/geopandas version — confirm
# whether the exported CSV is meant to contain the geometry column.
observations_gdf = gpd.GeoDataFrame(sf_df, crs='epsg:4326', geometry=geometry_invert)

In [27]:
# Rows x columns of the observations GeoDataFrame.
observations_gdf.shape

(139038, 40)

In [28]:
# Preview the observations together with their point geometry.
observations_gdf.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry
0,5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,...,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.07613 37.42187)
1,5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18080 37.35370)
2,5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18041 37.35305)
3,5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,...,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.37434 37.89112)
4,5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,...,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.26543 37.80403)


## Spatial join

In [29]:
# Spatial join of observations with greenspace polygons (default join predicate).
# how='inner' keeps only the observations that match a polygon.
points_polys = gpd.sjoin(observations_gdf, green_unique, how='inner')

In [30]:
# Preview the joined rows; index_right refers to the matching row of green_unique.
points_polys.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry,index_right
14,5686895,Fri Apr 14 2017 06:58:12 GMT-0700 (PDT),2017-04-14T06:58:12-07:00,America/Los_Angeles,2017-04-14T07:04:33-07:00,2017-04-14T19:07:05-07:00,,819,rebeccafay,needs_id,...,311249,False,False,False,229816,229816,"[{'user_id': 819, 'category': 'improving', 'di...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.43269 37.80152),1996
2993,43217084,Fri Apr 24 2020 12:07:24 GMT-0700 (PDT),2020-04-24T12:07:24-07:00,America/Los_Angeles,2020-04-24T17:18:05-07:00,2020-04-24T19:46:41-07:00,"Lots of Douglas iris, but the first Fernald's ...",819,rebeccafay,research,...,9091,True,False,False,62760,62760,"[{'user_id': 819, 'category': 'improving', 'di...",../data/observations_final\CNC_San_Francisco_2020,POINT (-122.43458 37.80227),1996
2995,43217209,Fri Apr 24 2020 12:05:31 GMT-0700 (PDT),2020-04-24T12:05:31-07:00,America/Los_Angeles,2020-04-24T17:18:43-07:00,2020-04-24T17:26:40-07:00,"Lots of Douglas iris, but the first Fernald's ...",819,rebeccafay,research,...,13847,False,False,False,107159,107159,"[{'user_id': 819, 'category': 'improving', 'di...",../data/observations_final\CNC_San_Francisco_2020,POINT (-122.43429 37.80120),1996
9890,43484693,Sat Apr 25 2020 12:21:51 GMT-0700 (PDT),2020-04-25T12:21:51-07:00,America/Los_Angeles,2020-04-25T16:24:08-07:00,2020-04-25T16:24:28-07:00,Kite Hill Ecological Restoration,819,rebeccafay,needs_id,...,47689,False,False,False,128961,128961,"[{'user_id': 819, 'category': 'leading', 'disa...",../data/observations_final\CNC_San_Francisco_2020,POINT (-122.43182 37.80275),1996
9894,43484777,Sat Apr 25 2020 12:20:56 GMT-0700 (PDT),2020-04-25T12:20:56-07:00,America/Los_Angeles,2020-04-25T16:24:32-07:00,2020-05-03T00:45:14-07:00,Kite Hill Ecological Restoration,819,rebeccafay,needs_id,...,738398,False,False,False,19325,19325,"[{'user_id': 819, 'category': 'improving', 'di...",../data/observations_final\CNC_San_Francisco_2020,POINT (-122.43183 37.80264),1996


In [31]:
# Rows x columns of the join result (one row per observation/greenspace match).
points_polys.shape

(41060, 41)

In [32]:
# Confirm the CRS of the join result.
points_polys.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

## Set flag for greenspaces

In [33]:
# Observation ids that matched a greenspace polygon in the spatial join.
ids_with_greenspace = points_polys['id'].tolist()

In [34]:
# Index the observations by their id so rows can be selected by observation id.
new_sf = sf_df.set_index('id')
new_sf.head()

Unnamed: 0_level_0,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,reviewed_by,...,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,"[8778, 275891, 425620]",...,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.07613 37.42187)
5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,"[4797, 179103, 425620]",...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18080 37.35370)
5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,"[4797, 179103, 425620]",...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18041 37.35305)
5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,"[1, 425620]",...,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.37434 37.89112)
5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,"[3494, 179103, 212655, 425620, 477431]",...,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.26543 37.80403)


In [35]:
# Flag each observation: '1' if its id matched a greenspace polygon, '0' otherwise.
# The flag is stored as a string.
in_greenspace = new_sf.index.isin(ids_with_greenspace)
new_sf['greenspace_flag'] = '0'
new_sf.loc[in_greenspace, 'greenspace_flag'] = '1'

In [36]:
# Number of observations per flag value ('1' = matched a greenspace polygon, '0' = no match).
new_sf.greenspace_flag.value_counts()

0    97978
1    41060
Name: greenspace_flag, dtype: int64

In [37]:
# Export the flagged observations as CSV; the id index is written as the first column.
new_sf.to_csv('../data/outputs/sf_data_with_greenspace_flag.csv')