In [93]:
import pandas as pd
import matplotlib as plt
import numpy as np

### Read the dataset about health and filter it by USA

In [94]:
df = pd.read_csv("PLACES_Local_Data_for_Better_Health_County_Data_2022_release_20250320.csv")

df = df[df["Data_Value_Type"].str.contains("Crude", na=False)]


# Rename the city column
df = df.rename(columns={"LocationName": "city"})


# Drop unnecessary columns
df = df.drop(['StateAbbr', 'StateDesc', 'DataSource', 'Category', 'LocationID', 'CategoryID', 'DataValueTypeID', 'Geolocation', 'TotalPopulation', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Data_Value_Footnote_Symbol', 'MeasureId', 'Data_Value_Footnote' ], axis=1)
# First, let's identify the unique cities and measures
unique_cities = df['city'].unique()
unique_measures = df['Short_Question_Text'].unique()

print(f"Number of unique cities: {len(unique_cities)}")
print(f"Number of unique measures: {len(unique_measures)}")

# Create a pivot table with cities as index and measures as columns
new_df = df.pivot_table(
    index='city',
    columns='Measure',
    values='Data_Value',
    aggfunc='mean'  # Use mean if there are duplicate entries
)

# Reset the index to make 'city' a column again
new_df = new_df.reset_index()

# Display the first few rows
list(df['Measure'].values)

Number of unique cities: 1841
Number of unique measures: 30


['Current lack of health insurance among adults aged 18-64 years',
 'Diagnosed diabetes among adults aged >=18 years',
 'Diagnosed diabetes among adults aged >=18 years',
 'Cervical cancer screening among adult women aged 21-65 years',
 'Depression among adults aged >=18 years',
 'Depression among adults aged >=18 years',
 'Obesity among adults aged >=18 years',
 'Arthritis among adults aged >=18 years',
 'Depression among adults aged >=18 years',
 'High blood pressure among adults aged >=18 years',
 'Visits to dentist or dental clinic among adults aged >=18 years',
 'Diagnosed diabetes among adults aged >=18 years',
 'Arthritis among adults aged >=18 years',
 'Arthritis among adults aged >=18 years',
 'High blood pressure among adults aged >=18 years',
 'Arthritis among adults aged >=18 years',
 'Chronic obstructive pulmonary disease among adults aged >=18 years',
 'Visits to doctor for routine checkup within the past year among adults aged >=18 years',
 'Current asthma among adults a

### Read the air quality dataset and clean the data

In [95]:
airdf = pd.read_excel("who_ambient_air_quality_database_version_2024_(v6.1).xlsx", sheet_name="Update 2024 (V6.1)")


# Filter by US
airdf = airdf[airdf["country_name"] == "United States of America"]

# Drop unnnecessary columns
airdf = airdf.drop(['country_name', 'version','reference','web_link', 'population_source','who_ms', 'type_of_stations', 'population', 'latitude', 'longitude', 'iso3', 'who_region', 'pm25_tempcov', 'pm10_tempcov','no2_tempcov'], axis=1)

# Remove state from city names
airdf['city'] = airdf['city'].str.split(' ').str[0]

# Leave only year 2020
airdf = airdf[airdf['year'] == 2020.0]
airdf

Unnamed: 0,city,year,pm10_concentration,pm25_concentration,no2_concentration
99,Aberdeen,2020.0,,,
243,Adrian,2020.0,,8.20,
502,Akron,2020.0,,8.15,
676,Albany,2020.0,,9.10,
688,Albany,2020.0,,6.40,
...,...,...,...,...,...
38850,Yakima,2020.0,,12.30,
39126,York,2020.0,,9.30,14.0
39151,Youngstown,2020.0,,7.65,
39192,Yuba,2020.0,,16.40,11.7


### Merge the datasets

In [96]:
merged_df = pd.merge(new_df, airdf, on='city', how='inner')
merged_df

Unnamed: 0,city,All teeth lost among adults aged >=65 years,Arthritis among adults aged >=18 years,Binge drinking among adults aged >=18 years,Cancer (excluding skin cancer) among adults aged >=18 years,Cervical cancer screening among adult women aged 21-65 years,Cholesterol screening among adults aged >=18 years,Chronic kidney disease among adults aged >=18 years,Chronic obstructive pulmonary disease among adults aged >=18 years,Coronary heart disease among adults aged >=18 years,...,Physical health not good for >=14 days among adults aged >=18 years,Sleeping less than 7 hours among adults aged >=18 years,Stroke among adults aged >=18 years,Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure,Visits to dentist or dental clinic among adults aged >=18 years,Visits to doctor for routine checkup within the past year among adults aged >=18 years,year,pm10_concentration,pm25_concentration,no2_concentration
0,Albany,9.35,19.40,18.75,5.85,80.10,83.35,2.50,5.35,5.15,...,7.95,30.60,2.40,72.35,67.05,71.25,2020.0,,9.10,
1,Albany,9.35,19.40,18.75,5.85,80.10,83.35,2.50,5.35,5.15,...,7.95,30.60,2.40,72.35,67.05,71.25,2020.0,,6.40,
2,Alexandria,6.20,19.50,16.30,5.60,87.60,88.90,2.30,4.00,4.30,...,7.10,34.40,2.20,72.10,74.00,73.40,2020.0,,7.30,
3,Anchorage,9.50,20.30,20.00,5.70,77.90,80.40,2.40,4.60,4.90,...,8.30,30.90,2.50,64.00,64.70,63.40,2020.0,9.767,5.25,
4,Athens,14.70,26.00,18.00,5.60,75.60,78.10,2.90,9.30,7.00,...,11.10,35.80,3.10,71.60,56.90,77.30,2020.0,,8.40,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Worcester,9.40,29.75,14.55,8.25,84.25,89.40,3.20,6.90,7.35,...,9.35,30.80,3.30,78.10,68.55,80.20,2020.0,12.700,8.00,19.0
120,Yakima,13.50,24.20,13.80,6.00,77.90,79.60,3.50,7.50,7.10,...,12.90,31.10,3.60,66.40,59.60,67.60,2020.0,,12.30,
121,York,9.32,27.62,16.82,7.46,83.30,86.62,2.88,6.62,6.66,...,9.24,31.56,2.98,76.24,68.66,76.80,2020.0,,9.30,14.0
122,Yuba,11.70,23.20,18.10,5.90,80.40,83.80,3.00,7.30,6.30,...,11.60,34.40,3.10,68.40,59.30,66.40,2020.0,,16.40,11.7


In [None]:
# Check the actual column names first
print("Merged dataframe columns:", merged_df.columns.tolist())

# Update with the correct column names for pollutants
# For example, these might be the actual names:
pollutants = ['pm10_concentration', 'pm25_concentration', 'no2_concentration']  # Update with actual column names

# Get the disease columns
disease_columns = [col for col in new_df.columns if col != 'city']

# Create a correlation matrix between pollutants and diseases
correlation_matrix = merged_df[pollutants + disease_columns].corr()

# Extract only the relevant part of the correlation matrix - pollutants vs diseases
pollutant_disease_corr = correlation_matrix.loc[pollutants, disease_columns]

# Display the correlation matrix
print("Correlation between air pollutants and health measures:")
pollutant_disease_corr

# Visualize the correlation matrix as a heatmap using only matplotlib
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(14, 10))
im = plt.imshow(pollutant_disease_corr, cmap='coolwarm', vmin=-1, vmax=1)

# Add colorbar
cbar = plt.colorbar(im)
cbar.set_label('Correlation Coefficient')

# Add labels and title
plt.title('Correlation between Air Pollutants and Health Measures')
plt.xticks(np.arange(len(disease_columns)), disease_columns, rotation=90)
plt.yticks(np.arange(len(pollutants)), pollutants)

# Add correlation values as text annotations
for i in range(len(pollutants)):
    for j in range(len(disease_columns)):
        text = plt.text(j, i, f'{pollutant_disease_corr.iloc[i, j]:.2f}',
                      ha="center", va="center", color="black")

plt.tight_layout()
plt.show()

Merged dataframe columns: ['city', 'All teeth lost among adults aged >=65 years', 'Arthritis among adults aged >=18 years', 'Binge drinking among adults aged >=18 years', 'Cancer (excluding skin cancer) among adults aged >=18 years', 'Cervical cancer screening among adult women aged 21-65 years', 'Cholesterol screening among adults aged >=18 years', 'Chronic kidney disease among adults aged >=18 years', 'Chronic obstructive pulmonary disease among adults aged >=18 years', 'Coronary heart disease among adults aged >=18 years', 'Current asthma among adults aged >=18 years', 'Current lack of health insurance among adults aged 18-64 years', 'Current smoking among adults aged >=18 years', 'Depression among adults aged >=18 years', 'Diagnosed diabetes among adults aged >=18 years', 'Fair or poor self-rated health status among adults aged >=18 years', 'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years', 'High blood pressure among adults aged >=18 years', 'Hi

TypeError: 'module' object is not callable