# Exploratory Analysis

In [1]:
#| echo: false
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import json
np.random.seed(42)
import ipywidgets as widgets
from ipywidgets import interact
import seaborn as sns
import holoviews as hv
import pandas as pd
import panel as pn
import numpy as np
import hvplot.pandas 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import requests
import cenpy

  def nb_dist(x, y):
  def get_faces(triangle):
  def build_faces(faces, triangles_is, num_triangles, num_faces_single):
  def nb_mask_faces(mask, faces):


## Philadelphia Social Progress Index Data
Because social determinants of health influence the overall health of individuals, including asthma rates, we use Philadelphia’s Social Progress Index, which measures social progress using a detailed framework of indicators applied in 372 census tracts, to account for the conditions of the environments that Philadelphians grew up in [5]. This analysis uses the Health and Wellness data, represented as hw_value, to understand the overall health of communities, as more Black and Hispanic residents reported poor or fair health than any other group, with severe declines in life expectancy [6]. Philadelphia ranked among the worst 25 in the nation for ozone and particle pollution, both of which are related to serious health effects such as asthma attacks. Further, particle pollution has been linked to the development of asthma [7]. To increase the accuracy of our prediction model, we used Ozone and Particulate Matter 2.5 data, represented as oc_value and pmc_value, respectively.

In [5]:
#| echo: true
#| code-fold: true
# "Health and Wellness" indicator: load the CSV, discard the descriptive
# columns, and expose the score under the dataset-specific name `hw_value`.
health_and_wellness = (
    pd.read_csv('health_and_wellness.csv')
    .drop(
        columns=['rank', 'region_name', 'tract_name', 'variable',
                 'neighborhood_name', 'average_label']
    )
    .rename(columns={'value': 'hw_value'})
)

FileNotFoundError: [Errno 2] No such file or directory: 'health_and_wellness.csv'

In [6]:
#| echo: true
#| code-fold: true
# Ozone concentration indicator: load the CSV, discard the descriptive
# columns, and expose the score under the dataset-specific name `oc_value`.
ozone_concentration = (
    pd.read_csv("ozone_concentration.csv")
    .drop(
        columns=['rank', 'region_name', 'tract_name', 'variable',
                 'neighborhood_name', 'average_label']
    )
    .rename(columns={'value': 'oc_value'})
)

FileNotFoundError: [Errno 2] No such file or directory: 'ozone_concentration.csv'

In [7]:
#| echo: true
#| code-fold: true
# Particulate matter concentration indicator: load the CSV, discard the
# descriptive columns, and expose the score under the name `pmc_value`.
pmc = (
    pd.read_csv("particular_matter_concentration.csv")
    .drop(
        columns=['rank', 'region_name', 'tract_name', 'variable',
                 'neighborhood_name', 'average_label']
    )
    .rename(columns={'value': 'pmc_value'})
)

FileNotFoundError: [Errno 2] No such file or directory: 'particular_matter_concentration.csv'

## CDC PLACES Data
This analysis builds on the CDC’s PLACES dataset, a model-based, population-level analysis that provides community estimates of health measures across the U.S. [4]. Our model uses asthma prevalence data from this dataset, represented as asthma_prevalance, as our dependent variable. Tobacco smoke, represented by the smoking prevalence data (smoking_prevalance), is a known indoor and outdoor pollution source that triggers asthma. In the City of Philadelphia, while the rates are declining, Philadelphians continue to have the highest smoking rate among large U.S. cities, with non-Hispanic Black and Hispanic residents more likely to report smoking [6].

In [8]:
#| echo: true
#| code-fold: true
# CDC PLACES extract: keep only the Philadelphia, Pennsylvania rows, retain
# the tract identifier plus the asthma and smoking crude-prevalence columns,
# and rename all three to the names used by the rest of the analysis.
CDC_data = (
    pd.read_csv("CDC_data.csv")
    .loc[
        lambda places: (places['StateAbbr'] == 'PA')
        & (places['StateDesc'] == 'Pennsylvania')
        & (places['CountyName'] == 'Philadelphia')
    ]
    [['CASTHMA_CrudePrev', 'TractFIPS', 'CSMOKING_CrudePrev']]
    .rename(
        columns={
            'TractFIPS': 'geoid',
            'CASTHMA_CrudePrev': 'asthma_prevalance',
            'CSMOKING_CrudePrev': 'smoking_prevalance',
        }
    )
)

FileNotFoundError: [Errno 2] No such file or directory: 'CDC_data.csv'

## Environmental Justice Index Data
This analysis uses data from the Environmental Justice Index, a place-based tool designed to measure the cumulative impacts of environmental burden that affect health and health equity [3]. In Philadelphia, children are largely exposed to lead, mold, and tobacco smoke in their homes, which are common asthma triggers [8]. The predictive model reflects these indoor exposures by including the percentage of houses built before 1980, represented as e_houage, as nearly 90 percent of homes were built before 1978 [8]. The Center of Excellence in Environmental Toxicology notes that children's proximity to busy roadways is also a major concern for asthma, which justifies including the proportion of a tract’s area within a 1-mile buffer of a high-volume road or highway, e_road [8]. Green spaces remain a protective factor that improves air quality, including ozone and particle pollution, and increases the health of individuals; this is captured by the predictor variable e_park, the proportion of the tract’s area within a 1-mile buffer of green space [9].

In [9]:
#| echo: true
#| code-fold: true
# Environmental Justice Index extract: restrict to Philadelphia county rows,
# keep the tract identifier with the park, housing-age and road columns, and
# rename the identifier to the shared join key `geoid`.
eji_pa = (
    pd.read_csv("eji_pa.csv")
    .loc[lambda eji: eji['COUNTY'] == 'Philadelphia']
    [['GEOID', 'E_PARK', 'E_HOUAGE', 'E_ROAD']]
    .rename(columns={'GEOID': 'geoid'})
)

FileNotFoundError: [Errno 2] No such file or directory: 'eji_pa.csv'

In [None]:
#| echo: true
#| code-fold: true
# Tract boundaries read from the 2010 census-tract GeoJSON file.
censustracts = gpd.read_file('Census_Tracts_2010.geojson')

# Lookup from tract code to full GEOID, used to attach a GEOID to rows that
# only carry a tract code.
tract_and_geoid = censustracts[['GEOID10', 'TRACTCE10']].rename(
    columns={'TRACTCE10': 'tract'}
)

# Geometry table keyed by an integer `geoid`, matching the join key name used
# by the tabular datasets.
censustracts = (
    censustracts[['GEOID10', 'geometry']]
    .rename(columns={'GEOID10': 'geoid'})
    .assign(geoid=lambda tracts: tracts['geoid'].astype(np.int64))
)

## Philly Demographics
As Black and Latino individuals encounter a higher asthma burden, our prediction model uses Census Bureau data to include demographic variables: the percentage of Black, White, Latino, Asian, and other races and ethnicities in Philadelphia. These are represented as blk_percent, white_percent, latino_percent, asian_percent, and other_percent.

In [10]:
#| echo: true
#| code-fold: true
# Columns requested from the ACS API: the geography NAME plus the estimate
# ("E") columns of table B03002 for the listed table line numbers. The query
# cell later renames these to Total (001), White (003), Black (004),
# AI/AN (005), Asian (006), NH/PI (007), Other_ (008), Two Plus (009) and
# Hispanic (012).
variables = ["NAME"] + [
    f"B03002_{line_number:03d}E"
    for line_number in (1, 3, 4, 5, 6, 7, 8, 9, 12)
]

In [None]:
#| echo: true
#| code-fold: true
# Catalogue of datasets reachable through cenpy (kept for inspection).
available = cenpy.explorer.available()

# Connection to the dataset identified as "ACSDT5Y2021".
acs = cenpy.remote.APIConnection("ACSDT5Y2021")

# State and county codes used to filter the ACS query.
pa_state_code, philly_county_code = "42", "101"

In [None]:
#| echo: true
#| code-fold: true
# Race/ethnicity counts for every census tract in Philadelphia County.
# The query is issued at tract level because the tract GEOID is the join key
# for the rest of the analysis: block-group rows would attach several
# demographic rows to each tract and duplicate that tract in every later
# merge on `geoid`.
philly_demographics = acs.query(
    cols=variables,
    geo_unit="tract:*",
    geo_filter={"state": pa_state_code, "county": philly_county_code},
)

# Replace the ACS estimate codes with readable column names.
philly_demographics = philly_demographics.rename(
    columns={
        "B03002_001E": "Total",
        "B03002_003E": "White",
        "B03002_004E": "Black",
        "B03002_005E": "AI/AN",
        "B03002_006E": "Asian",
        "B03002_007E": "NH/PI",
        "B03002_008E": "Other_",
        "B03002_009E": "Two Plus",
        "B03002_012E": "Hispanic",
    }
)

# Attach the full tract GEOID by matching on the tract code.
# NOTE(review): the tract codes come from 'Census_Tracts_2010.geojson' while
# the counts come from "ACSDT5Y2021"; confirm both use the same tract
# vintage, because the inner join silently drops tracts without a match.
philly_demographics = pd.merge(philly_demographics, tract_and_geoid, on='tract', how='inner')
philly_demographics = philly_demographics.rename(columns={'GEOID10': 'geoid'})

In [None]:
#| echo: true
#| code-fold: true
# Convert the Black and Total counts to numbers, then express Black residents
# as a percentage of the total population.
philly_demographics[['Black', 'Total']] = (
    philly_demographics[['Black', 'Total']].apply(pd.to_numeric)
)
philly_demographics['blk_percent'] = (
    philly_demographics['Black'].div(philly_demographics['Total']).mul(100)
)

In [None]:
#| echo: true
#| code-fold: true
# Convert the remaining count columns to numbers in one pass.
philly_demographics[['White', 'Hispanic', 'Asian', 'AI/AN', 'NH/PI', 'Other_', 'Two Plus']] = (
    philly_demographics[['White', 'Hispanic', 'Asian', 'AI/AN', 'NH/PI', 'Other_', 'Two Plus']]
    .apply(pd.to_numeric)
)

# Groups that are combined into the single "other" share.
columns_to_sum = ["AI/AN", "NH/PI", "Other_", "Two Plus"]

# Express each group as a percentage of the total population, then keep only
# the percentage columns and the join key.
philly_demographics = philly_demographics.assign(
    white_percent=lambda counts: counts['White'] / counts['Total'] * 100,
    latino_percent=lambda counts: counts['Hispanic'] / counts['Total'] * 100,
    asian_percent=lambda counts: counts['Asian'] / counts['Total'] * 100,
    other_percent=lambda counts: counts[columns_to_sum].sum(axis=1) / counts['Total'] * 100,
)[['blk_percent', 'white_percent', 'latino_percent', 'asian_percent', 'other_percent', 'geoid']]

In [None]:
#| echo: true
#| code-fold: true
# Cast the join key to int64, then discard rows with any missing value.
philly_demographics = philly_demographics.astype({'geoid': 'int64'}).dropna()

## Correlation Plots
Below, we plot each asthma risk factor against the CDC's estimate of asthma prevalence within a census tract. In our analysis, we observed the most notable correlations with predictors such as the percentage of Black population in a census tract, indicating potential disparities. Prevalence of smoking appeared to have a positive correlation with the dependent variable. Furthermore, the health and wellness value appears to have a negative correlation with asthma prevalence in a given census tract.

In [None]:
#| echo: true
#| code-fold: true
# Combine every dataset into one modelling table. Each step is an inner join
# on `geoid`, so only identifiers present in all six sources remain.
regression_df = (
    health_and_wellness
    .merge(ozone_concentration, on='geoid', how='inner')
    .merge(pmc, on='geoid', how='inner')
    .merge(CDC_data, on='geoid', how='inner')
    .merge(eji_pa, on='geoid', how='inner')
    .merge(philly_demographics, on='geoid', how='inner')
)

In [None]:
#| echo: true
#| code-fold: true
# Persist the merged table without the index column, then preview the first
# five rows.
regression_df.to_csv(path_or_buf='regression_df.csv', index=False)
regression_df.head(n=5)

In [None]:
#| echo: true
#| code-fold: true
def plot_regression(x_variable, y_variable):
    """Draw a scatter plot with a fitted regression line for two columns.

    Parameters
    ----------
    x_variable : str
        Column of ``regression_df`` plotted on the x-axis.
    y_variable : str
        Column of ``regression_df`` plotted on the y-axis.
    """
    plt.figure(figsize=(8, 6))

    sns.regplot(data=regression_df, x=x_variable, y=y_variable)

    # Label the figure from the arguments so it always names the columns
    # that are actually plotted.
    plt.title(f'Regression Plot - {y_variable} vs. {x_variable}')
    plt.xlabel(x_variable)
    plt.ylabel(y_variable)
    plt.show()


# Predictor choices: every column except the target and the tract identifier.
# A dedicated name is used so the ACS column list held in `variables` is not
# overwritten when this cell runs.
predictor_columns = [
    col for col in regression_df.columns if col not in ("asthma_prevalance", "geoid")
]
y_variabless = [col for col in regression_df.columns if col == "asthma_prevalance"]

# Dropdowns for the x-axis and y-axis columns.
x_variable_dropdown = widgets.Dropdown(options=predictor_columns, description='X-axis Variable')
y_variable_dropdown = widgets.Dropdown(options=y_variabless, description='Y-axis Variable')

# interact() renders the controls and the plot itself and returns the wrapped
# function, so its result is not passed to display().
correlationplot = interact(
    plot_regression,
    x_variable=x_variable_dropdown,
    y_variable=y_variable_dropdown,
)

## Risk Factor Choropleth Maps
Lastly, we investigate how these predictors look spatially within the city of Philadelphia. The dashboard below displays each predictor for asthma prevalence by census tract.

In [None]:
#| echo: true
#| code-fold: true
# Attach tract geometry to the modelling table (inner join on `geoid`) and
# promote the result to a GeoDataFrame using the `geometry` column.
regression_gdf = gpd.GeoDataFrame(
    regression_df.merge(censustracts, on='geoid', how='inner'),
    geometry='geometry',
)

In [None]:
#| echo: true
#| code-fold: true
def generate_choropleth(variable):
    """Return a map of ``regression_gdf`` coloured by the column *variable*."""
    return regression_gdf.hvplot(
        geo=True,
        c=variable,
        cmap='viridis',
        colorbar=True,
        width=800,
        height=600,
    )


# Every column except the tract identifier and the geometry can be mapped.
variables = [name for name in regression_gdf.columns if name not in {"geoid", "geometry"}]

# Selector for the column to map, starting on the first available column.
variable_dropdown = pn.widgets.Select(
    name='Select Variable',
    options=variables,
    value=variables[0],
)


@pn.depends(variable_dropdown.param.value)
def update_choropleth(variable):
    """Rebuild the map for the currently selected column."""
    return generate_choropleth(variable)


# Layout: selector on the first row, map on the second.
dashboard = pn.Column(pn.Row(variable_dropdown), pn.Row(update_choropleth))

dashboard