In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

### Read the dataset about health and filter it by USA

In [None]:
df = pd.read_csv("PLACES_Local_Data_for_Better_Health_County_Data_2022_release_20250320.csv")

df = df[df["Data_Value_Type"].str.contains("Crude", na=False)]


# Rename the city column
df = df.rename(columns={"LocationName": "city"})


# Drop unnecessary columns
df = df.drop(['StateAbbr', 'StateDesc', 'DataSource', 'Category', 'LocationID', 'CategoryID', 'DataValueTypeID', 'Geolocation', 'TotalPopulation', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Data_Value_Footnote_Symbol', 'MeasureId', 'Data_Value_Footnote' ], axis=1)
df

Unnamed: 0,Year,city,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote,Short_Question_Text
0,2020,,Current lack of health insurance among adults ...,%,Crude prevalence,15.3,,Health Insurance
1,2020,Talladega,Diagnosed diabetes among adults aged >=18 years,%,Crude prevalence,15.1,,Diabetes
2,2020,Tallapoosa,Diagnosed diabetes among adults aged >=18 years,%,Crude prevalence,16.0,,Diabetes
4,2020,Aleutians East,Cervical cancer screening among adult women ag...,%,Crude prevalence,68.3,,Cervical Cancer Screening
5,2020,Anchorage,Depression among adults aged >=18 years,%,Crude prevalence,16.5,,Depression
...,...,...,...,...,...,...,...,...
188446,2019,St. Croix,High cholesterol among adults aged >=18 years ...,%,Crude prevalence,32.3,,High Cholesterol
188448,2020,Mineral,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,11.0,,COPD
188449,2020,Wirt,"Fecal occult blood test, sigmoidoscopy, or col...",%,Crude prevalence,68.5,,Colorectal Cancer Screening
188451,2019,Skagit,High cholesterol among adults aged >=18 years ...,%,Crude prevalence,34.6,,High Cholesterol


### Read the air quality dataset and clean the data

In [None]:
airdf = pd.read_excel("who_ambient_air_quality_database_version_2024_(v6.1).xlsx", sheet_name="Update 2024 (V6.1)")


# Filter by US
airdf = airdf[airdf["country_name"] == "United States of America"]

# Drop unnnecessary columns
airdf = airdf.drop(['version','reference','web_link', 'population_source','who_ms', 'type_of_stations', 'population', 'latitude', 'longitude', 'iso3', 'who_region', 'no2_concentration', 'pm10_tempcov',	'pm25_tempcov',	'no2_tempcov'], axis=1)

# Remove state from city names
airdf['city'] = airdf['city'].str.split(' ').str[0]

# Leave only year 2020
airdf = airdf[airdf['year'] == 2020.0]
airdf.head()

Unnamed: 0,country_name,city,year,pm10_concentration,pm25_concentration
99,United States of America,Aberdeen,2020.0,,
243,United States of America,Adrian,2020.0,,8.2
502,United States of America,Akron,2020.0,,8.15
676,United States of America,Albany,2020.0,,9.1
688,United States of America,Albany,2020.0,,6.4


### Merge the datasets

In [50]:

# Get unique diagnosis measures
diagnosis_values = np.unique(df["Measure"].values)


# Sort the data
airdf = airdf.sort_values(by="city", ascending=True)

# Extract the city name from the format "Adrian MI/USA" -> "Adrian"
# This handles multi-word city names like "New York" correctly
airdf['city'] = airdf['city'].str.extract(r'^(.*?)(?:\s+[A-Z]{2}/USA)$')[0]

# Convert city names to lowercase for case-insensitive comparison
cities_set = set(airdf['city'].str.lower())
health_locations_set = set(df['city'].str.lower())

# Find the intersection (cities that appear in both datasets)
common_cities = cities_set.intersection(health_locations_set) 

# Create a DataFrame with only the common cities
common_cities_df = airdf[airdf['city'].str.lower().isin(common_cities)]
common_cities_df

Unnamed: 0,country_name,city,year,pm10_concentration,pm25_concentration,city_clean


In [None]:
merged_df = pd.merge(df, airdf, on='city', how='inner')
merged_df

Unnamed: 0,Year,city,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,TotalPop18plus,MeasureId,Short_Question_Text,country_name,year,pm10_concentration,pm25_concentration
0,2022,Macon,Arthritis among adults,%,Age-adjusted prevalence,30.0,,,26.6,33.7,11765,9604,ARTHRITIS,Arthritis,United States of America,2020.0,,7.75
1,2022,Macon,Frequent mental distress among adults,%,Age-adjusted prevalence,20.5,,,18.5,22.6,18516,15413,MHLTH,Frequent Mental Distress,United States of America,2020.0,,7.75
2,2022,Macon,Visited dentist or dental clinic in the past y...,%,Age-adjusted prevalence,47.6,,,43.6,51.6,18516,15413,DENTAL,Dental Visit,United States of America,2020.0,,7.75
3,2022,Macon,No leisure-time physical activity among adults,%,Age-adjusted prevalence,35.1,,,30.2,40.0,18516,15413,LPA,Physical Inactivity,United States of America,2020.0,,7.75
4,2022,Macon,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,8.8,,,7.9,9.8,18516,15413,COPD,COPD,United States of America,2020.0,,7.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15979,2022,Laramie,Short sleep duration among adults,%,Age-adjusted prevalence,34.4,,,30.4,38.4,100723,78289,SLEEP,Short Sleep Duration,United States of America,2020.0,25.067,6.20
15980,2022,Laramie,Coronary heart disease among adults,%,Age-adjusted prevalence,5.3,,,4.7,5.8,100723,78289,CHD,Coronary Heart Disease,United States of America,2020.0,25.067,6.20
15981,2022,Laramie,Self-care disability among adults,%,Age-adjusted prevalence,3.0,,,2.8,3.3,100723,78289,SELFCARE,Self-care Disability,United States of America,2020.0,25.067,6.20
15982,2022,Laramie,Received food stamps in the past 12 months amo...,%,Age-adjusted prevalence,5.9,,,5.0,6.8,100723,78289,FOODSTAMP,Food Stamps,United States of America,2020.0,25.067,6.20
