# Get the greenspaces from San Francisco and set the flags

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
from glob2 import glob
import osmnx as ox

In [2]:
green=gpd.read_file('../data/outputs/greenspaces_sf_final.shp')

In [3]:
green.shape

(23264, 2)

In [4]:
green.head()

Unnamed: 0,FID,geometry
0,0,"MULTIPOLYGON (((-122.10455 36.96578, -122.1044..."
1,1,"POLYGON ((-121.96565 37.03789, -121.96560 37.0..."
2,2,"POLYGON ((-122.23961 37.05038, -122.23919 37.0..."
3,3,"POLYGON ((-121.65561 37.04986, -121.65529 37.0..."
4,4,"POLYGON ((-122.26762 37.07843, -122.26760 37.0..."


In [5]:
#get California as gdf to check anomalies later. Points outside California will be excluded
ca=ox.geocode_to_gdf('California')

In [6]:
ca

Unnamed: 0,geometry,bbox_north,bbox_south,bbox_east,bbox_west,place_id,osm_type,osm_id,lat,lon,display_name,class,type,importance
0,"MULTIPOLYGON (((-124.48200 40.44032, -124.4813...",42.009499,32.529524,-114.130782,-124.482003,259044285,relation,165475,36.701463,-118.755997,"California, United States",boundary,administrative,0.922136


## Load observations (API)

In [7]:
data_folder = '../data/observations_final'
df = pd.concat([pd.read_csv(f).assign(challenge=f.replace('.csv','')) for f in glob(data_folder+'/CNC_San_Francisco_*.csv')])

In [8]:
df.shape

(139040, 39)

In [9]:
#check for incorrect points (lat)
df['latitude'].min(), df['latitude'].max()

(36.998305700500005, 48.458352000000005)

In [10]:
#check for incorrect points (lon). Clearly something wrong!
df['longitude'].min(), df['longitude'].max()

(-123.59913058040001, -35.15625)

## Get only the points inside CA

In [11]:
df=df[df['latitude'].le(ca['bbox_north'].iloc[0]) \
                  & df['latitude'].ge(ca['bbox_south'].iloc[0]) \
                  & df['longitude'].le(ca['bbox_east'].iloc[0]) \
                  & df['longitude'].ge(ca['bbox_west'].iloc[0])]

In [12]:
df.shape

(139038, 39)

In [13]:
df.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,iconic_taxon_name,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge
0,5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,...,Insecta,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017
1,5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,...,Plantae,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017
2,5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,...,Plantae,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017
3,5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,...,Insecta,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017
4,5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,...,Insecta,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017


## Create gdf using the observations

In [14]:
df['longitude'].isna().sum(), df['latitude'].isna().sum()

(0, 0)

In [15]:
#create the points inverting the coordinates
geometry_invert = [Point(xy) for xy in zip(df.longitude, df.latitude)]

In [16]:
#crete a geodataframe with the observations
observations_gdf = gpd.GeoDataFrame(df, crs='epsg:4326', geometry=geometry_invert)

In [17]:
observations_gdf.shape

(139038, 40)

In [18]:
observations_gdf.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry
0,5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,...,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.07613 37.42187)
1,5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18080 37.35370)
2,5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18041 37.35305)
3,5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,...,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.37434 37.89112)
4,5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,...,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.26543 37.80403)


## Spatial join

In [19]:
# spatial join observations and observations.
points_polys = gpd.sjoin(observations_gdf, green, how='inner')

In [20]:
points_polys.head()

Unnamed: 0,id,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,...,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry,index_right,FID
1,5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,...,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18080 37.35370),0,0
2,5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,...,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18041 37.35305),0,0
336,5695369,2017/04/14 10:53 AM PDT,2017-04-14T10:53:00-07:00,America/Los_Angeles,2017-04-14T12:02:48-07:00,2019-05-09T17:55:37-07:00,The previous observation of this plant is at h...,399831,brucewbailey,research,...,True,False,False,899,899,"[{'user_id': 13377, 'category': 'improving', '...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.05863 37.29381),0,0
370,5695852,Fri Apr 14 2017 08:27:54 GMT-0700 (PDT),2017-04-14T08:27:54-07:00,America/Los_Angeles,2017-04-14T12:18:38-07:00,2018-09-16T09:33:49-07:00,,8778,gyrrlfalcon,needs_id,...,False,False,False,26621,26621,"[{'user_id': 3494, 'category': 'supporting', '...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.24431 37.34123),0,0
372,5695858,Fri Apr 14 2017 08:28:53 GMT-0700 (PDT),2017-04-14T08:28:53-07:00,America/Los_Angeles,2017-04-14T12:18:59-07:00,2020-10-05T11:40:34-07:00,"Some kind of anise or fennel, flowers reach 2 ...",8778,gyrrlfalcon,needs_id,...,False,False,False,13268,13268,"[{'user_id': 8778, 'category': 'improving', 'd...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.24438 37.34126),0,0


In [21]:
points_polys.shape

(73885, 42)

In [22]:
points_polys.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

## Set flag for greenspaces

In [23]:
ids_with_greenspace = points_polys.id.to_list()

In [24]:
new_df = df.set_index('id')
new_df.head()

Unnamed: 0_level_0,observed_on_string,time_observed_at,created_time_zone,created_at,updated_at,description,user_id,user_login,quality_grade,reviewed_by,...,taxon_rank,taxon_parent_id,taxon_native,taxon_endemic,taxon_threatened,taxon_search_rank,taxon_observations,identifications,challenge,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5677489,Thu Apr 13 2017 12:40:50 GMT-0700 (PDT),2017-04-13T12:40:50-07:00,America/Los_Angeles,2017-04-13T13:29:41-07:00,2018-11-26T12:33:26-08:00,,8778,gyrrlfalcon,needs_id,"[8778, 275891, 425620]",...,family,49025,False,False,False,127627,127627,"[{'user_id': 8778, 'category': 'maverick', 'di...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.07613 37.42187)
5682279,Thu Apr 13 2017 15:57:53 GMT-0700 (PDT),2017-04-13T15:57:53-07:00,America/Los_Angeles,2017-04-13T18:37:08-07:00,2018-11-26T06:03:56-08:00,,4797,paranger,research,"[4797, 179103, 425620]",...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18080 37.35370)
5682371,Thu Apr 13 2017 16:08:07 GMT-0700 (PDT),2017-04-13T16:08:07-07:00,America/Los_Angeles,2017-04-13T18:44:41-07:00,2018-11-26T06:03:57-08:00,,4797,paranger,research,"[4797, 179103, 425620]",...,species,48245,True,False,False,8756,8756,"[{'user_id': 179103, 'category': 'supporting',...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.18041 37.35305)
5684642,2017/04/14 12:13 AM PDT,2017-04-14T00:13:00-07:00,America/Los_Angeles,2017-04-14T00:20:21-07:00,2017-04-21T14:58:31-07:00,,1,kueda,needs_id,"[1, 425620]",...,family,55518,False,False,False,6696,6696,"[{'user_id': 1, 'category': None, 'disagreemen...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.37434 37.89112)
5684660,Fri Apr 14 2017 00:28:12 GMT-0700 (PDT),2017-04-14T00:28:12-07:00,America/Los_Angeles,2017-04-14T00:29:36-07:00,2020-02-15T14:31:49-08:00,,3494,damontighe,needs_id,"[3494, 179103, 212655, 425620, 477431]",...,subgenus,67740,False,False,False,1246,1246,"[{'user_id': 3494, 'category': 'leading', 'dis...",../data/observations_final\CNC_San_Francisco_2017,POINT (-122.26543 37.80403)


In [25]:
new_df['greenspace_flag'] = '0'
new_df.loc[ids_with_greenspace,'greenspace_flag'] = '1'

In [26]:
new_df.greenspace_flag.value_counts()

1    73885
0    65153
Name: greenspace_flag, dtype: int64

In [27]:
#export as csv
new_df.to_csv('../data/outputs/sf_data_with_greenspace_flag_v2.csv')