# Data Collection and Preparation

In [1]:
# Collect SF MUNI Stop Location Data
import pandas as pd

# Read the SF MUNI stop locations from the local CSV export and preview the first rows.
allMUNIstops = pd.read_csv(filepath_or_buffer='Data/Muni_Stops.csv')
allMUNIstops.head()

Unnamed: 0,OBJECTID,STOPNAME,TRAPEZESTOPABBR,RUCUSSTOPABBR,STOPID,LATITUDE,LONGITUDE,ACCESSIBILITYMASK,ATSTREET,ONSTREET,...,INSERT_TIMESTAMP,SDE_ID,SIGNUPID,SUPERVISOR_DISTRICT,shape,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
0,42619,Polk St&Lombard St NW-NS/BZ,POLKLOM0,POLKLOMB,5990,37.80167,-122.42303,0.0,LOMBARD ST,POLK ST,...,20230512124615,14816781,141,,POINT (-122.42303 37.80167),107.0,107.0,4.0,6.0,32.0
1,40917,Chestnut St&Fillmore St NE-NS/BZ,CHESFIL0,CHESFILL,3941,37.800845,-122.436245,0.0,WEBSTER ST,CHESTNUT ST,...,20230512124615,14809056,141,,POINT (-122.43625 37.800846),17.0,17.0,4.0,6.0,13.0
2,41525,Geary Blvd&Arguello Blvd NE-NS/BZ,GEARARG0,GEARARGL,4287,37.781376,-122.458737,0.0,ARGUELLO BLVD,GEARY BLVD,...,20230512124615,14810393,141,,POINT (-122.45874 37.781376),11.0,11.0,8.0,6.0,31.0
3,40679,3rd St&Folsom St N-FS/BZ,.3STFOL0,3STFOLS,3124,37.784204,-122.399326,0.0,CLEMENTINA ST,03RD ST,...,20230512124615,14808200,141,,POINT (-122.39932 37.784203),32.0,32.0,1.0,10.0,8.0
4,43044,Potrero Ave&24th St SW-FS/BZ,POTR24S0,POTR24ST,6039,37.75267,-122.40649,0.0,24TH ST,POTRERO AVE,...,20230512124615,14815720,141,,POINT (-122.40649 37.75267),53.0,53.0,3.0,2.0,20.0


In [2]:
# Load the SF Find neighborhood table (columns: LINK, the_geom, name) so that
# numeric SF neighborhood codes can be translated into neighborhood names.
SFneighborhoods = pd.read_csv(filepath_or_buffer='Data/SFFind_Neighborhoods.csv')
SFneighborhoods.head()

Unnamed: 0,LINK,the_geom,name
0,"http://en.wikipedia.org/wiki/Sea_Cliff,_San_Fr...",MULTIPOLYGON (((-122.49345526799993 37.7835181...,Seacliff
1,,MULTIPOLYGON (((-122.48715071499993 37.7837854...,Lake Street
2,http://www.nps.gov/prsf/index.htm,MULTIPOLYGON (((-122.47758017099994 37.8109931...,Presidio National Park
3,,MULTIPOLYGON (((-122.47241052999993 37.7873465...,Presidio Terrace
4,http://www.sfgate.com/neighborhoods/sf/innerri...,MULTIPOLYGON (((-122.47262578999994 37.7863148...,Inner Richmond


In [3]:
# Attach a neighborhood row to every stop by looking up the stop's
# 'SF Find Neighborhoods' code in the SF Find neighborhood table.
# NOTE(review): with `on=`, DataFrame.join matches the stop's code against the
# row *index* of SFneighborhoods, not against an ID column. Confirm that these
# codes really are row positions in SFFind_Neighborhoods.csv -- e.g. a
# "Sutter St&Steiner St" stop shows up under 'Chinatown' in the sample output
# of the next cell, which looks suspicious.
joinedDF = allMUNIstops.join(
    other=SFneighborhoods,
    on='SF Find Neighborhoods',
    how='inner',
)

In [4]:
# Isolate stops in equity strategy neighborhoods (ESNs): one frame per neighborhood,
# selected by an exact match on the neighborhood name.
TenderloinStops = joinedDF[joinedDF['name'] == 'Tenderloin']
ChinatownStops = joinedDF[joinedDF['name'] == 'Chinatown']
WesternAdditionStops = joinedDF[joinedDF['name'] == 'Western Addition']
MissionStops = joinedDF[joinedDF['name'] == 'Mission']
BayviewStops = joinedDF[joinedDF['name'] == 'Bayview']
VisitacionValleyStops = joinedDF[joinedDF['name'] == 'Visitacion Valley']
OuterMissionStops = joinedDF[joinedDF['name'] == 'Outer Mission']
OceanViewStops = joinedDF[joinedDF['name'] == 'Oceanview']

# Stack the per-neighborhood frames in the order listed and renumber the rows from 0.
ESNstops = pd.concat(
    [
        TenderloinStops,
        ChinatownStops,
        WesternAdditionStops,
        MissionStops,
        BayviewStops,
        VisitacionValleyStops,
        OuterMissionStops,
        OceanViewStops,
    ],
    ignore_index=True,
)

# Flag every one of these stops as belonging to an ESN.
ESNstops = ESNstops.assign(ESN=1)

ESNstops.sample(5)

Unnamed: 0,OBJECTID,STOPNAME,TRAPEZESTOPABBR,RUCUSSTOPABBR,STOPID,LATITUDE,LONGITUDE,ACCESSIBILITYMASK,ATSTREET,ONSTREET,...,shape,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods,LINK,the_geom,name,ESN
163,42371,Balboa Park BART Station SW-MB/BZ,GNVABAR,GNVABART,4805,37.720952,-122.447379,0.0,I-280 N OFF RAMP,GENEVA AVE,...,POINT (-122.44738 37.72095),80.0,80.0,9.0,1.0,28.0,http://en.wikipedia.org/wiki/Neighborhoods_in_...,MULTIPOLYGON (((-122.4626396249999 37.71793603...,Oceanview,1
76,41389,Diamond Heights Blvd&Berkeley Way SE-NS,DHTSBKL1,DHTSBKLY,4361,37.73867,-122.43668,0.0,BERKELEY WAY,DIAMOND HEIGHTS BLVD,...,POINT (-122.43668 37.73867),96.0,96.0,9.0,5.0,10.0,http://www.sfgate.com/neighborhoods/sf/western...,MULTIPOLYGON (((-122.4394803809999 37.78330848...,Western Addition,1
43,43020,Sutter St&Steiner St NE-NS/BZ,SUTTSTE0,SUTTSTEI,6609,37.785942,-122.434793,0.0,STEINER ST,SUTTER ST,...,POINT (-122.43479 37.785942),103.0,103.0,4.0,11.0,30.0,http://www.sfgate.com/neighborhoods/sf/chinatown/,MULTIPOLYGON (((-122.40954104799994 37.7938519...,Chinatown,1
164,43189,Ocean Ave&Otsego Ave SW-NS/BZ,OCENOTS1,OCENOTSE,5800,37.72326,-122.44124,0.0,OTSEGO AVE,OCEAN AVE,...,POINT (-122.44124 37.72326),80.0,80.0,9.0,1.0,28.0,http://en.wikipedia.org/wiki/Neighborhoods_in_...,MULTIPOLYGON (((-122.4626396249999 37.71793603...,Oceanview,1
46,41310,Divisadero St&California St SW-FS,DIVICAL0,DIVICALI,4410,37.787988,-122.4405,0.0,PINE ST,DIVISADERO ST,...,POINT (-122.4405 37.787987),103.0,103.0,4.0,6.0,30.0,http://www.sfgate.com/neighborhoods/sf/chinatown/,MULTIPOLYGON (((-122.40954104799994 37.7938519...,Chinatown,1


In [5]:
# Now create a dataset with all stops from non-equity strategy neighborhoods:
# keep every joined stop whose neighborhood name is not one of the eight ESN names
# (rows with a missing name are kept as well), and flag them with ESN = 0.
nonESNstops = joinedDF.loc[
    ~joinedDF['name'].isin([
        'Tenderloin',
        'Chinatown',
        'Western Addition',
        'Mission',
        'Bayview',
        'Visitacion Valley',
        'Outer Mission',
        'Oceanview',
    ])
].assign(ESN=0)

nonESNstops.sample(5)

Unnamed: 0,OBJECTID,STOPNAME,TRAPEZESTOPABBR,RUCUSSTOPABBR,STOPID,LATITUDE,LONGITUDE,ACCESSIBILITYMASK,ATSTREET,ONSTREET,...,shape,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods,LINK,the_geom,name,ESN
17,41900,Monterey Blvd&Foerster St NW-FS/BZ,MTRYFOE0,MTRYFOER,5432,37.731615,-122.449166,0.0,FOERSTER ST,MONTEREY BLVD,...,POINT (-122.449165 37.731613),95.0,95.0,9.0,8.0,41.0,"http://en.wikipedia.org/wiki/Glen_Park,_San_Fr...",MULTIPOLYGON (((-122.44005825799991 37.7388057...,Glen Park,0
2553,41780,Junipero Serra Blvd & Font Blvd NW,J S FON0,J S FONT,5136,37.71463,-122.47191,0.0,FONT BLVD,JUNIPERO SERRA BLVD,...,POINT (-122.47191 37.71463),42.0,42.0,10.0,8.0,16.0,http://en.wikipedia.org/wiki/Neighborhoods_in_...,MULTIPOLYGON (((-122.50853817799992 37.7354017...,Lakeshore,0
1356,42835,South Hill Blvd&Rolph St W-NS/SB,SOHLRLP1,SOHLRLPH,6472,37.711698,-122.432103,0.0,ROLPH ST,SOUTH HILL BLVD,...,POINT (-122.432106 37.711697),58.0,58.0,9.0,1.0,7.0,,MULTIPOLYGON (((-122.43555900699994 37.7431614...,Fairmount,0
3040,43674,Palou Ave&Lane St W-NS/PS,PALULAN0,PALULANE,5876,37.732912,-122.389176,0.0,LANE ST,PALOU AVE,...,POINT (-122.389175 37.73291),86.0,86.0,2.0,9.0,1.0,http://en.wikipedia.org/wiki/Neighborhoods_in_...,MULTIPOLYGON (((-122.3998669739999 37.73029192...,Silver Terrace,0
3166,40695,Balboa St&10TH Ave SW-NS/PS,BBOA10A1,BBOA10AV,3039,37.776967,-122.468388,0.0,10TH AVE,BALBOA ST,...,POINT (-122.46839 37.776966),5.0,5.0,8.0,4.0,11.0,,MULTIPOLYGON (((-122.50310471599994 37.7812963...,Sutro Heights,0


In [6]:
# Now we need tract-level census (ACS) data for 2017 and for 2019.
import cenpy
from cenpy import products

# Fetch tract-level ACS 2017 values for San Francisco and rename the two requested
# variables to median_hh_income and vehicles_avail.
acs = cenpy.products.ACS()
census2017 = (
    products.ACS(2017)
    .from_place(
        'San Francisco, CA',
        level='tract',
        variables=['B19019_001E', 'B25046_001E'],
    )
    .rename(columns={
        'B19019_001E': 'median_hh_income',
        'B25046_001E': 'vehicles_avail',
    })
)

census2017.head()

Matched: San Francisco, CA to San Francisco city within layer Incorporated Places


  census2017 = products.ACS(2017).from_place('San Francisco, CA', level='tract',


Unnamed: 0,GEOID,geometry,median_hh_income,vehicles_avail,state,county,tract
0,6075032801,"POLYGON ((-13635048.760 4543918.550, -13634929...",110255.0,2167.0,6,75,32801
1,6075033100,"POLYGON ((-13636532.870 4541575.590, -13636426...",111333.0,2676.0,6,75,33100
2,6075033201,"POLYGON ((-13635142.160 4541306.060, -13635136...",28750.0,442.0,6,75,33201
3,6075030301,"POLYGON ((-13634050.780 4545554.170, -13633943...",140179.0,3607.0,6,75,30301
4,6075031000,"POLYGON ((-13632506.330 4541080.160, -13632485...",131544.0,2244.0,6,75,31000


In [7]:
# Fetch tract-level ACS 2019 values for San Francisco and rename the two requested
# variables to median_hh_income and vehicles_avail.
acs = cenpy.products.ACS()
census2019 = (
    products.ACS(2019)
    .from_place(
        'San Francisco, CA',
        level='tract',
        variables=['B19019_001E', 'B25046_001E'],
    )
    .rename(columns={
        'B19019_001E': 'median_hh_income',
        'B25046_001E': 'vehicles_avail',
    })
)

census2019.head()

Matched: San Francisco, CA to San Francisco city within layer Incorporated Places


  census2019 = products.ACS(2019).from_place('San Francisco, CA', level='tract',


Unnamed: 0,GEOID,geometry,median_hh_income,vehicles_avail,state,county,tract
0,6075035202,"POLYGON ((-13637736.350 4546153.040, -13637685...",89732.0,2898.0,6,75,35202
1,6075042700,"POLYGON ((-13635913.040 4548886.330, -13635803...",93250.0,2522.0,6,75,42700
2,6075030202,"POLYGON ((-13633379.300 4546390.880, -13633366...",128417.0,2053.0,6,75,30202
3,6075030900,"POLYGON ((-13633895.820 4539985.070, -13633869...",177694.0,4716.0,6,75,30900
4,6075045100,"POLYGON ((-13632661.740 4548547.020, -13632647...",141912.0,2623.0,6,75,45100


In [8]:
# Fetch ACS 2017 table B02001 per San Francisco tract, give columns _001E-_007E
# readable names, and discard columns _008E-_010E.
race2017 = (
    products.ACS(2017)
    .from_place('San Francisco, CA', level='tract', variables='B02001')
    .rename(columns={
        'B02001_001E': 'total pop',
        'B02001_002E': 'white',
        'B02001_003E': 'black',
        'B02001_004E': 'native',
        'B02001_005E': 'asian',
        'B02001_006E': 'hawaiian/pac islander',
        'B02001_007E': 'other',
    })
    .drop(columns=['B02001_008E', 'B02001_009E', 'B02001_010E'])
)

race2017.head()

Matched: San Francisco, CA to San Francisco city within layer Incorporated Places


  race2017 = products.ACS(2017).from_place('San Francisco, CA', level='tract',


Unnamed: 0,GEOID,geometry,total pop,white,black,native,asian,hawaiian/pac islander,other,state,county,tract
0,6075032801,"POLYGON ((-13635048.760 4543918.550, -13634929...",4505.0,1522.0,102.0,5.0,2681.0,0.0,46.0,6,75,32801
1,6075033100,"POLYGON ((-13636532.870 4541575.590, -13636426...",3978.0,1439.0,30.0,0.0,2339.0,0.0,48.0,6,75,33100
2,6075033201,"POLYGON ((-13635142.160 4541306.060, -13635136...",4281.0,1759.0,307.0,15.0,944.0,22.0,775.0,6,75,33201
3,6075030301,"POLYGON ((-13634050.780 4545554.170, -13633943...",5907.0,2694.0,120.0,0.0,2543.0,0.0,228.0,6,75,30301
4,6075031000,"POLYGON ((-13632506.330 4541080.160, -13632485...",3799.0,2015.0,71.0,5.0,1255.0,0.0,146.0,6,75,31000


In [9]:
# Fetch ACS 2019 table B02001 per San Francisco tract, give columns _001E-_007E
# readable names, and discard columns _008E-_010E.
race2019 = (
    products.ACS(2019)
    .from_place('San Francisco, CA', level='tract', variables='B02001')
    .rename(columns={
        'B02001_001E': 'total pop',
        'B02001_002E': 'white',
        'B02001_003E': 'black',
        'B02001_004E': 'native',
        'B02001_005E': 'asian',
        'B02001_006E': 'hawaiian/pac islander',
        'B02001_007E': 'other',
    })
    .drop(columns=['B02001_008E', 'B02001_009E', 'B02001_010E'])
)

race2019.head()

Matched: San Francisco, CA to San Francisco city within layer Incorporated Places


  race2019 = products.ACS(2019).from_place('San Francisco, CA', level='tract',


Unnamed: 0,GEOID,geometry,total pop,white,black,native,asian,hawaiian/pac islander,other,state,county,tract
0,6075035202,"POLYGON ((-13637736.350 4546153.040, -13637685...",5244.0,2394.0,395.0,39.0,1541.0,274.0,280.0,6,75,35202
1,6075042700,"POLYGON ((-13635913.040 4548886.330, -13635803...",5379.0,2380.0,351.0,0.0,2337.0,0.0,108.0,6,75,42700
2,6075030202,"POLYGON ((-13633379.300 4546390.880, -13633366...",4438.0,2625.0,89.0,0.0,1483.0,0.0,113.0,6,75,30202
3,6075030900,"POLYGON ((-13633895.820 4539985.070, -13633869...",7103.0,3162.0,128.0,18.0,3180.0,2.0,155.0,6,75,30900
4,6075045100,"POLYGON ((-13632661.740 4548547.020, -13632647...",5126.0,2566.0,142.0,29.0,1954.0,0.0,338.0,6,75,45100


In [10]:
# Join race and census data.
# Cast the 2019 tract codes to integers on both 2019 frames (in place).
for frame_2019 in (race2019, census2019):
    frame_2019['tract'] = frame_2019['tract'].astype(int)

# Within a year, the race and income/vehicle frames carry the same identifier
# columns, so each pair is inner-joined on all of them.
demographics2017 = pd.merge(
    race2017,
    census2017,
    how='inner',
    on=['tract', 'county', 'state', 'geometry', 'GEOID'],
)

demographics2019 = pd.merge(
    race2019,
    census2019,
    how='inner',
    on=['tract', 'county', 'state', 'geometry', 'GEOID'],
)

demographics2019.head()

Unnamed: 0,GEOID,geometry,total pop,white,black,native,asian,hawaiian/pac islander,other,state,county,tract,median_hh_income,vehicles_avail
0,6075035202,"POLYGON ((-13637736.350 4546153.040, -13637685...",5244.0,2394.0,395.0,39.0,1541.0,274.0,280.0,6,75,35202,89732.0,2898.0
1,6075042700,"POLYGON ((-13635913.040 4548886.330, -13635803...",5379.0,2380.0,351.0,0.0,2337.0,0.0,108.0,6,75,42700,93250.0,2522.0
2,6075030202,"POLYGON ((-13633379.300 4546390.880, -13633366...",4438.0,2625.0,89.0,0.0,1483.0,0.0,113.0,6,75,30202,128417.0,2053.0
3,6075030900,"POLYGON ((-13633895.820 4539985.070, -13633869...",7103.0,3162.0,128.0,18.0,3180.0,2.0,155.0,6,75,30900,177694.0,4716.0
4,6075045100,"POLYGON ((-13632661.740 4548547.020, -13632647...",5126.0,2566.0,142.0,29.0,1954.0,0.0,338.0,6,75,45100,141912.0,2623.0


In [11]:
# Lookup table giving the neighborhood name assigned to each SF census tract.
# NOTE(review): the file name refers to 2020 census tracts, while the demographic
# data above comes from ACS 2017/2019 -- confirm the tract codes are comparable.
neighborhood_census_tracts = pd.read_csv(
    'Data/Analysis_Neighborhoods_-_2020_census_tracts_assigned_to_neighborhoods.csv'
)

# Treat tract codes as text and left-pad them with zeros to at least 6 characters.
neighborhood_census_tracts = neighborhood_census_tracts.astype({'tractce': 'string'})
neighborhood_census_tracts['tractce'] = neighborhood_census_tracts['tractce'].apply(
    lambda tract_code: tract_code.zfill(6)
)

# Use the same identifier column names as the demographic frames and discard
# the columns that are not needed further on.
neighborhood_census_tracts = (
    neighborhood_census_tracts
    .rename(columns={
        'tractce': 'tract',
        'state_fp': 'state',
        'county_fp': 'county',
        'geoid': 'GEOID',
    })
    .drop(columns=['the_geom', 'name', 'data_loaded_at', 'data_as_of'])
)

neighborhood_census_tracts.head()

Unnamed: 0,object_id,state,county,tract,neighborhoods_analysis_boundaries,sup_dist_2012,sup_dist_2022,GEOID
0,242,6,75,980900,Bayview Hunters Point,10,10,6075980900
1,241,6,75,980600,Bayview Hunters Point,10,10,6075980600
2,240,6,75,980501,McLaren Park,10,10,6075980501
3,239,6,75,980401,The Farallones,1,4,6075980401
4,226,6,75,61200,Bayview Hunters Point,10,10,6075061200


In [12]:
# Now join neighborhood names and demographic data on census tract codes.

# Cast the identifier columns to ints so the merge key has the same type on both sides.
for demographics in (demographics2017, demographics2019):
    for id_column in ('GEOID', 'tract', 'county', 'state'):
        demographics[id_column] = demographics[id_column].astype(int)

for id_column in ('GEOID', 'tract'):
    neighborhood_census_tracts[id_column] = neighborhood_census_tracts[id_column].astype(int)

# Merge 2017 data on tract, keep the lookup table's copy of the duplicated
# identifier columns (the *_x ones), and tidy the column names.
data2017 = (
    neighborhood_census_tracts
    .merge(demographics2017, on='tract')
    .drop(columns=['GEOID_y', 'state_y', 'county_y', 'sup_dist_2012', 'sup_dist_2022', 'object_id'])
    .rename(columns={
        'state_x': 'state',
        'county_x': 'county',
        'neighborhoods_analysis_boundaries': 'neighborhood',
        'GEOID_x': 'GEOID',
    })
)

# Same treatment for the 2019 data.
data2019 = (
    neighborhood_census_tracts
    .merge(demographics2019, on='tract')
    .drop(columns=['GEOID_y', 'state_y', 'county_y', 'sup_dist_2012', 'sup_dist_2022', 'object_id'])
    .rename(columns={
        'state_x': 'state',
        'county_x': 'county',
        'neighborhoods_analysis_boundaries': 'neighborhood',
        'GEOID_x': 'GEOID',
    })
)

data2019.head()

Unnamed: 0,state,county,tract,neighborhood,GEOID,geometry,total pop,white,black,native,asian,hawaiian/pac islander,other,median_hh_income,vehicles_avail
0,6,75,980900,Bayview Hunters Point,6075980900,"POLYGON ((-13626279.570 4542831.040, -13626266...",253.0,171.0,18.0,0.0,56.0,0.0,8.0,,
1,6,75,980600,Bayview Hunters Point,6075980600,"POLYGON ((-13624051.400 4540543.790, -13624050...",690.0,148.0,233.0,0.0,170.0,0.0,66.0,66042.0,375.0
2,6,75,980501,McLaren Park,6075980501,"POLYGON ((-13628536.120 4539319.990, -13628532...",507.0,28.0,114.0,0.0,258.0,34.0,67.0,12340.0,125.0
3,6,75,61200,Bayview Hunters Point,6075061200,"POLYGON ((-13625086.220 4542404.940, -13625054...",3842.0,540.0,1115.0,24.0,1129.0,22.0,961.0,67625.0,1705.0
4,6,75,980300,Golden Gate Park,6075980300,"POLYGON ((-13638558.550 4547081.610, -13638506...",63.0,58.0,0.0,0.0,5.0,0.0,0.0,139375.0,55.0


In [13]:
# Next step is to join MUNI Stop data with the above dataset on neighborhood names.

# Let's concatenate our ESN and non-ESN data.
stops = [ESNstops, nonESNstops]
allMUNIstops = pd.concat(stops)

# The MUNI stop data separates Bayview and Hunter's Point, while the census data
# combines the two as 'Bayview Hunters Point'. Normalise every label to 'Bayview'.
# The full label has to be replaced first: replacing 'Hunters Point' first would
# turn 'Bayview Hunters Point' into 'Bayview Bayview', which never matches the
# stop data and silently drops those tracts from the inner merge that follows.
# Each year's frame is rewritten from its own column, not from the other year's.
for census_frame in (data2017, data2019):
    census_frame['neighborhood'] = (
        census_frame['neighborhood']
        .str.replace('Bayview Hunters Point', 'Bayview', regex=False)
        .str.replace('Hunters Point', 'Bayview', regex=False)
    )

# NOTE(review): stops originally named 'Hunters Point' were not in the ESN name
# list, so they carry ESN = 0 but are labelled 'Bayview' from here on -- confirm
# that this is intended.
allMUNIstops['name'] = allMUNIstops['name'].str.replace('Hunters Point', 'Bayview', regex=False)

# Drop the columns that are not needed downstream and switch to descriptive names.
allMUNIstops = (
    allMUNIstops
    .drop(columns=[
        'SUPERVISOR_DISTRICT',
        'LINK',
        'Current Police Districts',
        'Current Supervisor Districts',
        'SERVICEPLANNINGSTOPTYPE',
        'SHELTER',
        'INSERT_TIMESTAMP',
        'Neighborhoods',
    ])
    .rename(columns={
        'name': 'neighborhood',
        'shape': 'stop_shape',
        'the_geom': 'neighborhood_shape',
        'SF Find Neighborhoods': 'sf_find_code',
        'Analysis Neighborhoods': 'analysis_neigh_code',
    })
)

# Show every column when displaying frames.
pd.set_option('display.max_columns', None)
allMUNIstops.head()

Unnamed: 0,OBJECTID,STOPNAME,TRAPEZESTOPABBR,RUCUSSTOPABBR,STOPID,LATITUDE,LONGITUDE,ACCESSIBILITYMASK,ATSTREET,ONSTREET,POSITION,ORIENTATION,SDE_ID,SIGNUPID,stop_shape,sf_find_code,analysis_neigh_code,neighborhood_shape,neighborhood,ESN
0,42263,Powell St&Sutter St SW-FS,POWLSUT1,POWLSUTT,6076,37.789061,-122.408642,0.0,SUTTER ST,POWELL ST,FS,SW,14816502,141,POINT (-122.408646 37.789062),19.0,21.0,MULTIPOLYGON (((-122.40987401699994 37.7871491...,Tenderloin,1
1,43428,O'Farrell St&Grant Ave S-MB/BB,OFARGRN1,OFARGRNT,5810,37.786642,-122.405629,0.0,GRANT AVE,OFARRELL ST,MB,SO,14819051,141,POINT (-122.40563 37.78664),19.0,8.0,MULTIPOLYGON (((-122.40987401699994 37.7871491...,Tenderloin,1
2,43484,Stockton St&Sutter St NE-FS/BB,STOKSUT0,STOKSUTT,6523,37.79013,-122.40705,0.0,STOCKTON ST,SUTTER ST,FS,NE,14816123,141,POINT (-122.40705 37.79013),19.0,8.0,MULTIPOLYGON (((-122.40987401699994 37.7871491...,Tenderloin,1
3,43683,Market St&Powell St N-NS/BZ,MRKTPOW0,MRKTPOWL,5688,37.784474,-122.407544,0.0,ELLIS ST,MARKET ST,NS,NO,14814764,141,POINT (-122.40755 37.784473),19.0,36.0,MULTIPOLYGON (((-122.40987401699994 37.7871491...,Tenderloin,1
4,42659,Geary St&Powell St NW-FS/BZ,GEARPOW0,GEARPOWL,4757,37.787401,-122.408391,0.0,POWELL ST,GEARY ST,FS,NW,14813783,141,POINT (-122.408394 37.7874),19.0,36.0,MULTIPOLYGON (((-122.40987401699994 37.7871491...,Tenderloin,1


In [14]:
# Now let's join the MUNI stop data with the census data from each year on neighborhood name,
# and add a column to each dataset specifying the year its demographic data was collected.
final_df_2017 = (
    data2017
    .merge(allMUNIstops, how='inner', on='neighborhood')
    .assign(year_collected='2017')
)

final_df_2019 = (
    data2019
    .merge(allMUNIstops, how='inner', on='neighborhood')
    .assign(year_collected='2019')
)

# Show every column when displaying frames.
pd.set_option('display.max_columns', None)
final_df_2019.sample(5)

Unnamed: 0,state,county,tract,neighborhood,GEOID,geometry,total pop,white,black,native,asian,hawaiian/pac islander,other,median_hh_income,vehicles_avail,OBJECTID,STOPNAME,TRAPEZESTOPABBR,RUCUSSTOPABBR,STOPID,LATITUDE,LONGITUDE,ACCESSIBILITYMASK,ATSTREET,ONSTREET,POSITION,ORIENTATION,SDE_ID,SIGNUPID,stop_shape,sf_find_code,analysis_neigh_code,neighborhood_shape,ESN,year_collected
1817,6,75,22801,Mission,6075022801,"POLYGON ((-13627435.060 4546080.090, -13627312...",5769.0,2871.0,136.0,0.0,979.0,0.0,1526.0,144318.0,2072.0,43078,Church St&22ND St SW-FS/PS,CHUR22SO,ROW 22ST,6218,37.754607,-122.42775,0.0,22ND ST,CHURCH ST,NS,NW,14817246,141,POINT (-122.42775 37.75461),52.0,22.0,MULTIPOLYGON (((-122.42236481799989 37.7698676...,1,2019
3030,6,75,12700,Marina,6075012700,"POLYGON ((-13631025.340 4551655.130, -13631015...",4092.0,3331.0,36.0,0.0,384.0,0.0,131.0,153615.0,2375.0,41197,Clay St&Leavenworth St SW-NS/BZ,CLAYLEA1,CLAYLEAV,4023,37.792959,-122.416304,0.0,LEAVENWORTH ST,CLAY ST,NS,SW,14809883,141,POINT (-122.416306 37.792957),16.0,21.0,MULTIPOLYGON (((-122.44431861899994 37.7987902...,0,2019
3049,6,75,12700,Marina,6075012700,"POLYGON ((-13631025.340 4551655.130, -13631015...",4092.0,3331.0,36.0,0.0,384.0,0.0,131.0,153615.0,2375.0,42057,Pacific Ave&Jones St SW-NS/PS,PACFJON0,PACFJONE,5841,37.795791,-122.415105,0.0,JONES ST,PACIFIC AVE,NS,SW,14812744,141,POINT (-122.41511 37.79579),16.0,21.0,MULTIPOLYGON (((-122.44431861899994 37.7987902...,0,2019
2594,6,75,15500,Japantown,6075015500,"POLYGON ((-13630312.780 4549073.520, -13630129...",3551.0,1878.0,58.0,8.0,1163.0,27.0,2.0,79783.0,1280.0,43906,Van Ness Ave&Eddy St NS-N/SI,V N EDD2,,8103,37.782917,-122.420865,,EDDY ST,VAN NESS AVE,,,14819915,141,POINT (-122.42087 37.782917),100.0,39.0,MULTIPOLYGON (((-122.42798892699994 37.7849538...,0,2019
2244,6,75,17102,Haight Ashbury,6075017102,"POLYGON ((-13631428.540 4546738.700, -13631258...",4095.0,3130.0,81.0,1.0,414.0,0.0,181.0,177396.0,1771.0,43010,Hayes St&Central Ave NE-NS/BZ,HAYSCEN0,HAYSCENT,4985,37.774195,-122.444298,0.0,CENTRAL AVE,HAYES ST,NS,NE,14814125,141,POINT (-122.4443 37.774197),24.0,18.0,MULTIPOLYGON (((-122.44838968599993 37.7688646...,0,2019


In [15]:
# Persist both final datasets as CSV files (the row index is written as well,
# since to_csv is called with its default settings).
final_df_2019.to_csv(path_or_buf="Data/final_df_2019.csv")
final_df_2017.to_csv(path_or_buf="Data/final_df_2017.csv")

### We finally have our final datasets for training/testing the Classification model!

The final_df_2017 and final_df_2019 datasets contain MUNI stops and their corresponding neighborhood's demographic information.
- The column 'ESN' tells us whether that stop is located in a city-designated Equity Strategy Neighborhood.
- If the ESN value is 0, the stop is not located in an ESN neighborhood. 
- If the ESN value is 1, it is located in an ESN neighborhood.

Let's use the final_df_2017 dataset to train a cluster model. Then, let's test the model using our final_df_2019 dataset.

The cluster model will be trained to identify which stops are located in ESN neighborhoods based on the 2017 demographic data for each SF MUNI stop. The model will then attempt to cluster the 2019 stops as either in/not in an ESN neighborhood based on their demographic data.

### Why does this matter?

If the cluster model successfully clusters the same MUNI stops in 2019 (compared to 2017) as belonging to ESN neighborhoods, then we know that the conditions of ESN neighborhoods in SF have not improved enough for the stops in these neighborhoods to graduate from their ESN designation.

If the model does not successfully cluster the same MUNI stops in 2019 beloning to ESN neighborhoods based on 2019 data, then it's possible that the conditions of ESN neighborhoods (first measured in 2017) may have improved, and that the MUNI stops who were not classified as ESN