In [1]:
import pandas as pd

In [2]:
#Read in HCI file for parks
parksurl = 'https://github.com/crsalviati/CA_Housing/raw/master/Raw_Data/ParkBeachOpen10_output4-12-13.xlsx'
parks = pd.read_excel(parksurl, sheetname=0, converters={'geotypevalue':str,'county_fips':str})

In [9]:
#Overview of file
print parks.shape[0]
print parks.geotype.unique()
print parks.reportyear.unique()
parks.head()

86886
[u'CO' u'CT' u'PL' u'RE' u'ST']
[2010]


Unnamed: 0,ind_id,ind_definition,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_name,county_fips,...,pop_park_acc,pop2010,p_parkacc,LL_95CI,UL_95CI,se,rse,CA_decile,CA_RR,version
0,469,"Percent of Population within ½ Mile of Park, B...",2010,1,AIAN,CO,6001,Alameda,Alameda,6001,...,3818.0,4189,0.911435,0.902831,0.920039,0.00439,4.956508,,1.235649,2013-04-12 05:45:17.545
1,469,"Percent of Population within ½ Mile of Park, B...",2010,1,AIAN,CO,6003,Alpine,Alpine,6003,...,201.0,210,0.957143,0.929749,0.984536,0.013976,32.611226,,1.297617,2013-04-12 05:45:17.545
2,469,"Percent of Population within ½ Mile of Park, B...",2010,1,AIAN,CO,6005,Amador,Amador,6005,...,176.0,547,0.321755,0.282606,0.360904,0.019974,6.207789,,0.436209,2013-04-12 05:45:17.545
3,469,"Percent of Population within ½ Mile of Park, B...",2010,1,AIAN,CO,6007,Butte,Butte,6007,...,1843.0,3395,0.542857,0.5261,0.559615,0.00855,1.870238,,0.735962,2013-04-12 05:45:17.545
4,469,"Percent of Population within ½ Mile of Park, B...",2010,1,AIAN,CO,6009,Calaveras,Calaveras,6009,...,191.0,526,0.363118,0.32202,0.404216,0.020968,5.77448,,0.492286,2013-04-12 05:45:17.545


In [62]:
#Keep data at Place level, for Total race only (Most granular level for which data is available for most indicators)
parks2 = parks[(parks.geotype=="PL") & (parks.race_eth_code == 9)]
#Limit to fields of interest
parks2 = parks2[['geotype', 'geotypevalue', 'geoname', 'county_name', 'county_fips', 'p_parkacc']]

#Note: geotypevalue (FIPS Place code) is a unique key, but geoname is not - 
#There are instances of the same place name in multiple counties
print parks2.shape[0]
print parks2.geotypevalue.nunique()
print parks2.geoname.nunique()

print parks2.loc[parks2.duplicated('geoname', keep=False)]
parks2.head()

1524
1524
1513
      geotype geotypevalue            geoname      county_name county_fips  \
85309      PL        04470        Bayview CDP     Contra Costa       06013   
85310      PL        04478        Bayview CDP         Humboldt       06023   
85313      PL        04716    Bear Valley CDP           Alpine       06003   
85314      PL        04730    Bear Valley CDP         Mariposa       06043   
85507      PL        14450   Cold Springs CDP        El Dorado       06017   
85508      PL        14454   Cold Springs CDP         Tuolumne       06109   
85661      PL        22454    El Sobrante CDP     Contra Costa       06013   
85662      PL        22457    El Sobrante CDP        Riverside       06065   
85718      PL        25488       Franklin CDP           Merced       06047   
85719      PL        25506       Franklin CDP       Sacramento       06067   
85776      PL        31092   Green Valley CDP      Los Angeles       06037   
85777      PL        31099   Green Valley CDP    

In [98]:
# Read in FIPS codes  and limit to codes for CA (NOT SURE IF THIS ACTUALLY PROVIDES ANY NEW INFO OF VALUE - REMOVE?)
fips = pd.read_csv('https://github.com/crsalviati/CA_Housing/raw/master/Raw_Data/fips_codes.txt', sep='|',converters={1:str,2:str})
fips = fips[(fips.STATE=="CA")]
fips = fips[['PLACEFP', 'PLACENAME']]
print fips.shape[0]
fips.head()

1523


Unnamed: 0,PLACEFP,PLACENAME
0,135,Acalanes Ridge CDP
1,156,Acampo CDP
2,212,Acton CDP
3,296,Adelanto city
4,310,Adin CDP


In [99]:
#Merge FIPS codes onto HCI indicators
fips.rename(columns={'PLACEFP':'geotypevalue'},inplace =True)
fips.rename(columns={'PLACENAME':'fips_name'},inplace =True)
combined = pd.merge(parks2, fips, on='geotypevalue', how='left')

#HCI data maps cleanly to FIPS codes, except for code 99999 (state average?) - drop this observation
print combined.shape[0]
print combined[combined.fips_name.isnull()]
combined = combined[combined.fips_name.notnull()]
print combined.shape[0]

#Clean up place names to merge with Zillow data
combined['clean_name'] = combined['fips_name'].str.replace(' city', '')
combined['clean_name'] = combined['clean_name'].str.replace(' CDP', '')
combined['clean_name'] = combined['clean_name'].str.replace(' town', '')
combined.head()
print combined.loc[combined.geoname != combined.fips_name]
combined.clean_name.replace('San Buenaventura (Ventura)', 'Ventura', inplace=True)
combined.clean_name.replace('El Paso de Robles (Paso Robles)', 'Paso Robles', inplace=True)

1524
     geotype geotypevalue geoname county_name county_fips  p_parkacc fips_name
1523      PL        99999     NaN         NaN         NaN   0.335702       NaN
1523
     geotype geotypevalue                    geoname     county_name  \
674       PL        39003  La Ca±ada Flintridge city     Los Angeles   
1057      PL        57302            Pi±on Hills CDP  San Bernardino   

     county_fips  p_parkacc                  fips_name            clean_name  
674        06037   0.824953  La Ca�ada Flintridge city  La Ca�ada Flintridge  
1057       06071   0.033416            Pi�on Hills CDP           Pi�on Hills  


In [22]:
#Read in Zillow rental index
zill_rent_ind = pd.read_csv('https://github.com/crsalviati/CA_Housing/raw/master/Raw_Data/City_ZriPerSqft_AllHomes.csv', converters={1:str})
zill_rent_ind.head()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,2010-11,2010-12,2011-01,2011-02,...,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05
0,6181,New York,NY,New York,Queens,1,,,,,...,1.752,1.764,1.782,1.79,1.794,1.796,1.798,1.802,1.804,1.81
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,1.578,1.578,1.578,1.58,...,1.904,1.91,1.914,1.92,1.93,1.946,1.96,1.974,1.988,2.002
2,17426,Chicago,IL,Chicago,Cook,3,1.244,1.248,1.254,1.254,...,1.326,1.328,1.33,1.33,1.33,1.336,1.338,1.344,1.352,1.36
3,39051,Houston,TX,Houston,Harris,4,0.788,0.784,0.784,0.786,...,0.974,0.974,0.974,0.972,0.974,0.976,0.98,0.982,0.984,0.984
4,13271,Philadelphia,PA,Philadelphia,Philadelphia,5,0.854,0.858,0.858,0.858,...,0.934,0.932,0.932,0.93,0.93,0.934,0.936,0.942,0.948,0.954


In [43]:
#Limit to observations for CA, keep only vars of interest, and rename vars
zill_rent_ind2 = zill_rent_ind[zill_rent_ind.State == 'CA']
zill_rent_ind2 = zill_rent_ind2[['RegionID', 'RegionName', 'CountyName', '2010-12']]
zill_rent_ind2.rename(columns={'RegionName':'clean_name'},inplace =True)
zill_rent_ind2.rename(columns={'2010-12':'rent_sqft'},inplace =True)
zill_rent_ind2.rename(columns={'CountyName':'county_name'},inplace =True)
zill_rent_ind2.rename(columns={'RegionID':'region_id'},inplace =True)
#Note Zillow only has data on 737 cities, as opposed to 1523 FIPS places
print zill_rent_ind2.shape[0]
zill_rent_ind2.head()

737


Unnamed: 0,region_id,clean_name,county_name,rent_sqft
1,12447,Los Angeles,Los Angeles,1.578
8,54296,San Diego,San Diego,1.494
10,33839,San Jose,Santa Clara,1.544
12,20330,San Francisco,San Francisco,2.524
33,18203,Fresno,Fresno,0.838


In [93]:
combined2 = pd.merge(combined, zill_rent_ind2, on=['clean_name', 'county_name'], how='outer')
combined2.head()

Unnamed: 0,geotype,geotypevalue,geoname,county_name,county_fips,p_parkacc,fips_name,clean_name,region_id,rent_sqft
0,PL,135,Acalanes Ridge CDP,Contra Costa,6013,1.0,Acalanes Ridge CDP,Acalanes Ridge,,
1,PL,156,Acampo CDP,San Joaquin,6077,0.0,Acampo CDP,Acampo,,
2,PL,212,Acton CDP,Los Angeles,6037,0.403765,Acton CDP,Acton,16677.0,1.088
3,PL,296,Adelanto city,San Bernardino,6071,0.147017,Adelanto city,Adelanto,16684.0,0.744
4,PL,310,Adin CDP,Modoc,6049,0.610294,Adin CDP,Adin,,


In [89]:
print combined2.shape[0]
print combined2.clean_name.nunique()
print combined2.fips_name.nunique()
print combined2.region_id.count()

1578
1554
1513
737


In [77]:
#combined2.sortvalues()
print combined2.loc[combined2.duplicated('clean_name', keep=False)].count()
print combined2.loc[combined2.duplicated('clean_name', keep=False)]

geotype         41
geotypevalue    41
geoname         41
county_name     48
county_fips     41
p_parkacc       41
fips_name       41
clean_name      48
region_id       18
rent_sqft       18
dtype: int64
     geotype geotypevalue                geoname      county_name county_fips  \
82        PL        04470            Bayview CDP     Contra Costa       06013   
83        PL        04478            Bayview CDP         Humboldt       06023   
86        PL        04716        Bear Valley CDP           Alpine       06003   
87        PL        04730        Bear Valley CDP         Mariposa       06043   
150       PL        07974            Bradley CDP         Monterey       06053   
166       PL        08954           Burbank city      Los Angeles       06037   
167       PL        08968            Burbank CDP      Santa Clara       06085   
280       PL        14450       Cold Springs CDP        El Dorado       06017   
281       PL        14454       Cold Springs CDP         Tuolumne   

In [94]:
print combined2[['county_name', 'clean_name']].loc[combined2.geotypevalue.isnull()]

          county_name              clean_name
1523      Los Angeles    La Canada Flintridge
1524          Ventura        Westlake Village
1525             Kern              Mc Farland
1526         Monterey                  Carmel
1527           Tehama              Cottonwood
1528       Sacramento             Sloughhouse
1529           Merced                  Hilmar
1530           Sutter         South Yuba City
1531        El Dorado                  Rescue
1532       Sacramento   Mather Air Force Base
1533           Orange               Silverado
1534   San Bernardino               Helendale
1535        San Mateo       Broadmoor Village
1536         Monterey           Carmel Valley
1537        El Dorado                    Cool
1538         Tuolumne                Tuolumne
1539          Ventura                   Somis
1540   San Bernardino             Pinon Hills
1541           Plumas                Westwood
1542        El Dorado               El Dorado
1543   San Bernardino             