Pull in the data for CalEnviroScreen 4.0

# Data Wrangling: Getting and Cleaning Data, Creating Variables, and Preparing for Joins

## CalEnviroScreen4.0

In [61]:
import os
os.chdir('/Users/clairebekker/Documents/GitHub/urbandatascience/EJ_InlandEmpire')

In [62]:
# Change directory to location of CalEnviroScreen data
os.chdir('Data/CalEnviroScreen')

In [107]:
# Import pandas 
import pandas as pd
# Import geopandas
import geopandas as gpd
# Import numpy
import numpy as np

In [64]:
# Load in CalEnviroScreen Data
esGdf = gpd.read_file('CES4 Final Shapefile.shp')

In [65]:
# Restrict census tracts to Riverside and San Bernardino Counties
esGdf_InlandEmpire = esGdf[esGdf['County'].isin(['Riverside', 'San Bernardino'])]

In [66]:
#esGdf_InlandEmpire.columns

Index(['Tract', 'ZIP', 'County', 'ApproxLoc', 'TotPop19', 'CIscore',
       'CIscoreP', 'Ozone', 'OzoneP', 'PM2_5', 'PM2_5_P', 'DieselPM',
       'DieselPM_P', 'Pesticide', 'PesticideP', 'Tox_Rel', 'Tox_Rel_P',
       'Traffic', 'TrafficP', 'DrinkWat', 'DrinkWatP', 'Lead', 'Lead_P',
       'Cleanup', 'CleanupP', 'GWThreat', 'GWThreatP', 'HazWaste', 'HazWasteP',
       'ImpWatBod', 'ImpWatBodP', 'SolWaste', 'SolWasteP', 'PollBurd',
       'PolBurdSc', 'PolBurdP', 'Asthma', 'AsthmaP', 'LowBirtWt', 'LowBirWP',
       'Cardiovas', 'CardiovasP', 'Educatn', 'EducatP', 'Ling_Isol',
       'Ling_IsolP', 'Poverty', 'PovertyP', 'Unempl', 'UnemplP', 'HousBurd',
       'HousBurdP', 'PopChar', 'PopCharSc', 'PopCharP', 'Child_10',
       'Pop_10_64', 'Elderly65', 'Hispanic', 'White', 'AfricanAm', 'NativeAm',
       'OtherMult', 'Shape_Leng', 'Shape_Area', 'AAPI', 'geometry'],
      dtype='object')

In [110]:
# Use only raw scores as x-variables
# List comprehension to exclude percentile columns and scores
cols_to_exclude = [col for col in esGdf_InlandEmpire.columns if col.endswith('P') or col.endswith('score') or col.endswith('Sc')
                   or col=='PollBurd' or col=='PopChar']
esGdf_InlandEmpire_rawscores = [col for col in esGdf_InlandEmpire.columns if col not in cols_to_exclude]
# Subset dataframe to only include raw scores
esGdf_InlandEmpire= esGdf_InlandEmpire[esGdf_InlandEmpire_rawscores]

# Edit tract number to match the GEOID in other datasets
esGdf_InlandEmpire['GEOID'] = esGdf_InlandEmpire.Tract.astype(np.int64).astype(str).str.zfill(11)

# Set GEOID as index to join with other datasets
esGdf_InlandEmpire.set_index('GEOID', inplace=True)

Data from 2019 American Community Survey (ACS) 

## American Community Survey (2019)

In [68]:
import cenpy
from cenpy import products
# create a connection to the American Community Survey
acs = cenpy.products.ACS()

In [69]:
#pd.set_option('display.max_colwidth', None)
#acs.filter_variables('B25014')

In [70]:
census_var = ['B01001_001E', 'B01001_003E', 'B01001_027E', 'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E', 'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E',
              'B19013_001E', 'B14005_001E', 'B14005_007E', 'B14005_021E', 'B28002_002E', 'B25040_010E', 'B25040_001E', 'B25014_001E', 'B25014_005E', 'B25014_006E', 'B25014_007E', 'B25014_011E', 'B25014_012E', 'B25014_013E', 'B02001_002E', 
              'C18108_001E', 'C18108_003E', 'C18108_004E', 'C18108_007E','C18108_008E', 'C18108_011E', 'C18108_012E', 'B28002_001E', 'B28002_012E', 'B28002_013E', 'B25014H_003E', 'B25044_001E', 'B25044_003E', 'B25044_010E', 'B25044_004E', 'B25044_011E', 
              'B08301_001E', 'B08301_019E', 'B08301_018E', 'B08303_013E', 'B08303_001E'] 
# Total Population (used to scale for percentages): B01001_001E
# Estimate!!Total:!!White alone = B02001_002E
# Male under 5= B01001_003E, Female under 5= B01001_027E, 
# Older populations: Estimate!!Total:!!Female:!!65 and older = B01001_044E, B01001_045E, B01001_046E, B01001_047E, B01001_048E, B01001_049E
# Estimate!!Total:!!Male:!!65 and older = B01001_020E, B01001_021E, B01001_022E, B01001_023E, B01001_024E, B01001_025E,
# Median Household Income = B19013_001E,  
# Estimate!!Total:!!Male:!!Not Enrolled in school: SEX BY SCHOOL ENROLLMENT BY EDUCATIONAL ATTAINMENT BY EMPLOYMENT STATUS FOR THE POPULATION 16 TO 19 YEARS = B14005_007E,
# Estimate!!Total:!!Female:!!Not enrolled in school: SEX BY SCHOOL ENROLLMENT BY EDUCATIONAL ATTAINMENT BY EMPLOYMENT STATUS FOR THE POPULATION 16 TO 19 YEARS = B14005_021E, 
# Estimate!!Total: SEX BY SCHOOL ENROLLMENT BY EDUCATIONAL ATTAINMENT BY EMPLOYMENT STATUS FOR THE POPULATION 16 TO 19 YEARS = B14005_001E, 
# Estimate!!Total:!!No Internet access = B28002_013E,
# Estimate!!Total:!!Internet access without a subscription = B28002_012E,
# Estimate!!Total: PRESENCE AND TYPES OF INTERNET SUBSCRIPTIONS IN HOUSEHOLD = B28002_001E,
# Estimate!!Total:!!Under 18 years:!!With one type of disability = C18108_003E
# Estimate!!Total:!!Under 18 years:!!With one type of disability = C18108_004E
# Estimate!!Total:!!18 to 64 years!!With one type of disability = C18108_007E
# Estimate!!Total:!!18 to 64 years:!!With two or more disabilities = C18108_008E 
# Estimate!!Total:!!65 years and over:!!With one type of disability = C18108_011E
# Estimate!!Total:!!65 years and over:!!With two or more disabilities = C18108_012E
# Estimate!!Total!! AGE BY NUMBER OF DISABILITIES= C18108_001E
# Estimate!!Total:!!No Internet access = B28002_013E
# Estimate!!Total:!!1.01 or more occupants per room = B25014H_003E
# Estimate!!Total!!Owner occupied!!No vehicle available = B25044_003E 
# Estimate!!Total!!Renter occupied!!No vehicle available = B25044_010E
# Estimate!!Total!!Owner occupied!!1 vehicle available = B25044_004E
# Estimate!!Total!!Renter occupied!!1 vehicle available = B25044_011E
# Estimate!!Total: TENURE BY VEHICLES AVAILABLE = B25044_001E
# Estimate!!Total!! means of transportation to work = B08301_001E
# Estimate!!Total!!Walked = B08301_019E
# Estimate!!Total!!Bicycle = B08301_018E
# Estimate!!Total:!!90 or more minutes time to travel to work = B08303_013E
# Estimate!!Total: TRAVEL TIME TO WORK = B08303_001E 
# Estimate!!Total:!!No fuel used = B25040_010E 
# Estimate!!Total: HOUSE HEATING FUEL = B25040_001E 
# Estimate!!Total:!!Owner occupied:!!1.01 to 1.50 occupants per room = B25014_005E, 
# Estimate!!Total:!!Owner occupied:!!1.51 to 2.00 occupants per room = B25014_006E,
# Estimate!!Total:!!Owner occupied:!!2.01 or more occupants per room = B25014_007E, 
# Estimate!!Total:!!Renter occupied:!!1.01 to 1.50 occupants per room = B25014_011E,
# Estimate!!Total:!!Renter occupied:!!1.51 to 2.00 occupants per room = B25014_012E,
# Estimate!!Total:!!Renter occupied:!!2.01 or more occupants per room = B25014_013E, 
# Estimate!!Total: TENURE BY OCCUPANTS PER ROOM = B25014_001E

In [71]:
# Get 2019 ACS for Riverside and San Bernardino counties
censusDf_Riverside = products.ACS(2019).from_county('Riverside, CA', level='tract',
                                        variables=census_var)
censusDf_San_Bernardino = products.ACS(2019).from_county('San Bernardino, CA', level='tract',
                                        variables=census_var)

  return self._from_name(county, variables, level, "Counties", **kwargs)
  return self._from_name(county, variables, level, "Counties", **kwargs)


In [72]:
censusDf_IE = pd.concat((censusDf_Riverside, censusDf_San_Bernardino), axis=0)

In [78]:
# Race
censusDf_IE['pct_POC'] = ((censusDf_IE['B01001_001E'] - censusDf_IE['B02001_002E'])/censusDf_IE['B01001_001E'])*100 # (total population - white alone)/total population * 100

## SOCIOECONOMIC VULNERABILITY FACTORS ##
## Age
censusDf_IE['pct_youngchild'] = ((censusDf_IE['B01001_003E'] + censusDf_IE['B01001_027E'])/censusDf_IE['B01001_001E'])*100 # male under 5 + female under 5, divided by total population
older_pop = ['B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E', 'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E'] # All variables for male and female populations greater than 65
censusDf_IE['pct_older'] = (censusDf_IE[older_pop].sum(axis=1)/censusDf_IE['B01001_001E'])*100 # Sum all counts of people 65+/total population
## Disability
disability= ['C18108_003E', 'C18108_004E', 'C18108_007E','C18108_008E', 'C18108_011E', 'C18108_012E']
censusDf_IE['pct_disability'] = (censusDf_IE[disability].sum(axis=1)/censusDf_IE['C18108_001E'])*100 # sum number of estimated total of people in each range with one or more disabilities, divide by total polled population
# Median Household Income
censusDf_IE['med_income']= censusDf_IE['B19013_001E']
# Disconnected Youth (16-19 year olds not in school) 
censusDf_IE['pct_disconnectedyouth']= ((censusDf_IE['B14005_007E'] + censusDf_IE['B14005_021E'])/censusDf_IE['B14005_001E'])*100 # sum number of 16-19 males and females not enrolled in school/population of 16-19 year olds
# Internet Access (percentage of households without internet subscription)
censusDf_IE['pct_woutinternet'] = ((censusDf_IE['B28002_013E'] + censusDf_IE['B28002_012E'])/censusDf_IE['B28002_001E'])*100 # number of households without any internet or internet subscription/total number of households polled

In [79]:
## TRANSPORTION ##   
# Vehicle ownership 
vehicles = ['B25044_003E', 'B25044_010E', 'B25044_004E', 'B25044_011E'] # all owner-occupied (0 or 1 cars) and rental (0 or 1 cars)                                     
censusDf_IE['pct_vehicleowner'] = ((censusDf_IE[vehicles].sum(axis=1))/censusDf_IE['B25044_001E'])*100 # sum number of owner-occupied or rental households with 0 or 1 cars/total households polled
# Active Commute
censusDf_IE['pct_activecommute'] = ((censusDf_IE['B08301_019E'] + censusDf_IE['B08301_018E'])/censusDf_IE['B08301_001E'])*100 # sum of walking and biking commuters/total number of commuters 
# Extreme Commutes 
censusDf_IE['pct_extremecommute'] = (censusDf_IE['B08303_013E']/censusDf_IE['B08303_001E'])*100 # proportion of commuters traveling 90+ minutes

In [80]:
## HOUSING ##
# House Heating
censusDf_IE['pct_nofuel']= (censusDf_IE['B25040_010E']/censusDf_IE['B25040_001E'])*100 # number of households with no heating fuel/total households
# Overcrowding
occupants = ['B25014_005E', 'B25014_006E', 'B25014_007E', 'B25014_011E', 'B25014_012E', 'B25014_013E'] # owner and rent-occupied 1+ people per room 
censusDf_IE['pct_1+occupants']= (censusDf_IE[occupants].sum(axis=1)/censusDf_IE['B25014_001E'])*100 # sum of households with 1+ people per room/total households

In [81]:
pd.set_option('display.max_colwidth', 50)
censusDf_IE.head()

Unnamed: 0,GEOID,geometry,B01001_001E,B01001_003E,B01001_020E,B01001_021E,B01001_022E,B01001_023E,B01001_024E,B01001_025E,...,pct_older,pct_disability,pct_disconnectedyouth,pct_woutinternet,pct_vehicleowner,pct_activecommute,pct_extremecommute,pct_nofuel,pct_1+occupants,med_income
0,6065041904,"POLYGON ((-13099233.990 4011396.270, -13099207...",5391.0,168.0,31.0,62.0,13.0,63.0,69.0,15.0,...,9.738453,7.546115,6.52819,2.588757,18.491124,1.54321,4.566536,0.517751,11.316568,107880.0
1,6065041702,"POLYGON ((-13089529.040 4010560.230, -13089521...",5185.0,246.0,60.0,57.0,114.0,63.0,9.0,9.0,...,11.95757,8.50234,0.0,12.636961,18.991965,1.045556,4.661532,1.314828,7.742878,79283.0
2,6065041500,"POLYGON ((-13096164.800 4014145.990, -13096149...",3263.0,128.0,8.0,9.0,14.0,32.0,8.0,16.0,...,6.282562,9.255287,29.230769,10.23766,47.714808,3.950777,4.367968,0.731261,7.952468,60735.0
3,6065040809,"POLYGON ((-13087670.230 4014380.140, -13087646...",4262.0,183.0,36.0,60.0,50.0,28.0,20.0,7.0,...,8.798686,6.475833,0.0,7.274321,27.51972,0.0,8.891109,1.314636,14.110429,74704.0
4,6065040615,"POLYGON ((-13087499.580 4023599.500, -13087495...",10810.0,858.0,63.0,198.0,65.0,38.0,46.0,37.0,...,8.612396,8.602948,6.048387,1.447424,12.260536,0.0,6.35275,0.0,7.364836,138651.0


In [82]:
# Drop all of the ACS variable names and keep variables we created 
censusDf_IE =censusDf_IE.drop(labels=['B01001_001E', 'B01001_003E', 'B01001_027E', 'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E', 'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E',
              'B19013_001E', 'B14005_001E', 'B14005_007E', 'B14005_021E', 'B28002_002E', 'B25040_010E', 'B25040_001E', 'B25014_001E', 'B25014_005E', 'B25014_006E', 'B25014_007E', 'B25014_011E', 'B25014_012E', 'B25014_013E', 'B02001_002E', 
              'C18108_001E', 'C18108_003E', 'C18108_004E', 'C18108_007E','C18108_008E', 'C18108_011E', 'C18108_012E', 'B28002_001E', 'B28002_012E', 'B28002_013E', 'B25014H_003E', 'B25044_001E', 'B25044_003E', 'B25044_010E', 'B25044_004E', 'B25044_011E', 
              'B08301_001E', 'B08301_019E', 'B08301_018E', 'B08303_013E', 'B08303_001E'], axis=1)

In [109]:
# Set index as GEOID to use for joining datasets 
censusDf_IE.set_index('GEOID', inplace=True)

Pull in the data from HUD CHAS

## US Dept. of Housing and Urban Development (HUD)


In [119]:
os.chdir('../HUD CHAS')
hamfiDf = pd.read_csv('Table16.csv')
len(hamfiDf)

74001

In [120]:
# Select HUD CHAS data for census tracts in the IE
hamfiDfIE = hamfiDf[hamfiDf['name'].str.contains("Riverside")|hamfiDf['name'].str.contains("San Bernardino")].reset_index()

Calculate percentage of households per tract that make 30% or less than the HUD Area Median Family Income (HAMFI).

In [125]:
# Identify percentage of households per tract that make 30% or less than the HUD Area Median Family Income (HAMFI).
hamfiDfIE['HAMFI30'] = ((hamfiDf['T16_est3'] + hamfiDf['T16_est88'])/hamfiDf['T16_est1'])*100
#hamfiDfIE.head()

# Get GEOID for HUD Area Median Family Income data 
hamfiDfIE['GEOID'] = hamfiDfIE['geoid'].str[7:]

# Subset dataframe to only include columns of interest
hamfiDfIE= hamfiDfIE[['GEOID', 'HAMFI30']]

# Set GEOID as index 
hamfiDfIE.set_index('GEOID', inplace=True)

## California Natural Resources Agency (CNRA) 

Pull in the California Natural Resources Agency (CNRA) California
Heat Assessment Tool (CHAT) 2015 data

In [127]:
os.chdir('../CHAT')
CHATRiversideDf = pd.read_csv('CHAT-Riverside County-projected.csv')
CHATSBDf = pd.read_csv('CHAT-San Bernardino County-projected.csv')

In [128]:
# Concatenate Riverside and San Bernardino Data
CHATIE = pd.concat((CHATRiversideDf, CHATSBDf), axis=0)
CHATIE.head()

Unnamed: 0,geoid_long,rcp,projections_ct,census_county,census_city,projections_time_frame,socioeconomic_group,time_of_year,model_percentiles,proj_ann_num_events,proj_avg_tmax,proj_avg_tmin,proj_avg_duration,proj_avg_rhmax,proj_avg_rhmin,ann_freq_rel_chg_perc,dur_rel_chg_perc
0,1400000US06065030101,rcp45,2147483647,Riverside County,Riverside,2011-2030,2006 HW,Total,5,0.0,,,,,,,
1,1400000US06065030101,rcp45,2147483647,Riverside County,Riverside,2011-2030,Vulnerable,Total,5,1.55,104.81,67.97,4.94,59.94,16.27,-21.19,2.9
2,1400000US06065030101,rcp45,2147483647,Riverside County,Riverside,2011-2030,General,AM,5,0.15,95.31,61.25,4.33,57.35,17.52,-35.71,1.11
3,1400000US06065030101,rcp45,2147483647,Riverside County,Riverside,2011-2030,General,JJA,5,0.1,104.03,68.97,4.0,68.38,17.69,-25.0,-30.43
4,1400000US06065030101,rcp45,2147483647,Riverside County,Riverside,2011-2030,General,SO,5,0.0,,,,,,-100.0,


Select time frame as 2021-2040 and time of year as June, July, and August, or September and October for general population and also total for Vulnerable SE group

In [130]:
CHATIE.projections_time_frame

0         2011-2030
1         2011-2030
2         2011-2030
3         2011-2030
4         2011-2030
            ...    
147595    2081-2099
147596    2081-2099
147597    2081-2099
147598    2081-2099
147599    2081-2099
Name: projections_time_frame, Length: 328800, dtype: object

In [134]:
CHATIE = CHATIE[CHATIE['projections_time_frame']== '2021-2040' & (CHATIE['time_of_year'].isin(['JJA', 'SO'])==True)]

SyntaxError: invalid syntax (1982789631.py, line 1)

In [14]:
# Select Heat Health Events Max Temperature
CHATIE_maxtemp = CHATIE['proj_avg_tmax']
CHATIE_maxtemp.head()

geoid_long
1400000US06065030101       NaN
1400000US06065030101    104.81
1400000US06065030101     95.31
1400000US06065030101    104.03
1400000US06065030101       NaN
Name: proj_avg_tmax, dtype: float64