### Chronic Disease Prevalence in the United States
## Trends

In [29]:
# load dataset
# for this we will start with PLACES dataset (PLACES__Local_Data_for_Better_Health__Place_Data_2023_release_20240314.csv)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load dataset
df = pd.read_csv('PLACES__Local_Data_for_Better_Health__Place_Data_2023_release_20240314.csv')

# columns
print(df.columns)

Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'Geolocation', 'LocationID', 'CategoryID', 'MeasureId',
       'DataValueTypeID', 'Short_Question_Text'],
      dtype='object')


In [30]:
# unique 'Year' values
print(df['Year'].unique())

# count of 'Year' values
print(df['Year'].value_counts())

[2020 2021]
Year
2021    1599234
2020     455534
Name: count, dtype: int64


In [31]:
# any missing values percentage
print(df.isnull().sum()/len(df)*100)

Year                           0.000000
StateAbbr                      0.000000
StateDesc                      0.000000
LocationName                   0.000000
DataSource                     0.000000
Category                       0.000000
Measure                        0.000000
Data_Value_Unit                0.000000
Data_Value_Type                0.000000
Data_Value                     1.875589
Data_Value_Footnote_Symbol    98.124411
Data_Value_Footnote           98.124411
Low_Confidence_Limit           1.875589
High_Confidence_Limit          1.875589
TotalPopulation                0.000000
Geolocation                    0.000000
LocationID                     0.000000
CategoryID                     0.000000
MeasureId                      0.000000
DataValueTypeID                0.000000
Short_Question_Text            0.000000
dtype: float64


In [32]:
# define observation columns
obs_cols = ['Year', 'LocationID']

# define dimensions for observations
obs_dims = ['LocationID', 'LocationName', 'StateAbbr', 'StateDesc', 'GeoLocation', 'TotalPopulation']

# define feature columns
feature_cols = ['Category', 'Short_Question_Text', 'Measure', 'Data_Value', 'Data_Value_Unit', 'Data_Value_Type', 'Low_Confidence_Limit', 'High_Confidence_Limit']

In [42]:
# profile 'Categroy', 'Short_Question_Text', 'Measure', 'Data_Value_Unit', 'Data_Value_Type' columns
# group by above columns and count
print(df.groupby(['Category']).size())
print("\n")

# for 'Category' = 'Health Outcomes' group by 'Short_Question_Text' and count
print(df[df['Category'] == 'Health Outcomes'].groupby(['Short_Question_Text']).size())


Category
Disability               386022
Health Outcomes          718680
Health Risk Behaviors    222406
Health Status            165438
Prevention               562222
dtype: int64


Short_Question_Text
All Teeth Lost            56928
Arthritis                 55146
COPD                      55146
Cancer (except skin)      55146
Chronic Kidney Disease    55146
Coronary Heart Disease    55146
Current Asthma            55146
Depression                55146
Diabetes                  55146
High Blood Pressure       55146
High Cholesterol          55146
Obesity                   55146
Stroke                    55146
dtype: int64


In [41]:
# for 'Category' = 'Prevention' group by 'Short_Question_Text' and count
print(df[df['Category'] == 'Prevention'].groupby(['Short_Question_Text']).size())

Short_Question_Text
Annual Checkup                              55146
Cervical Cancer Screening                   56968
Cholesterol Screening                       55146
Colorectal Cancer Screening                 56962
Core preventive services for older men      56892
Core preventive services for older women    56892
Dental Visit                                56968
Health Insurance                            55146
Mammography                                 56956
Taking BP Medication                        55146
dtype: int64


In [22]:
# for Short Question Text = 'Chronic Kidney Disease' get the data
df_ckd = df[df['Short_Question_Text'] == 'Chronic Kidney Disease']

# select cols - Category, Measure, Data_Value_Unit, Data_Value_Type
df_ckd = df_ckd[['Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type', 'Data_Value']]
df_ckd.head()

Unnamed: 0,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value
668,Health Outcomes,Chronic kidney disease among adults aged >=18 ...,%,Crude prevalence,4.0
702,Health Outcomes,Chronic kidney disease among adults aged >=18 ...,%,Crude prevalence,3.4
755,Health Outcomes,Chronic kidney disease among adults aged >=18 ...,%,Crude prevalence,4.2
796,Health Outcomes,Chronic kidney disease among adults aged >=18 ...,%,Crude prevalence,4.4
812,Health Outcomes,Chronic kidney disease among adults aged >=18 ...,%,Crude prevalence,3.3


In [2]:
# distinct values in Data_Value_Type
df['Data_Value_Type'].unique()

array(['Crude prevalence', 'Age-adjusted prevalence'], dtype=object)

In [3]:
# distinct values in Data_Value_Unit
df['Data_Value_Unit'].unique()

array(['%'], dtype=object)

In [4]:
# distinct values in Short_Question_Text
df['Short_Question_Text'].unique()

array(['All Teeth Lost', 'Arthritis', 'Obesity', 'Any Disability',
       'Current Asthma', 'Stroke', 'High Blood Pressure', 'Mammography',
       'Depression', 'Vision Disability', 'Diabetes', 'Binge Drinking',
       'Mobility Disability', 'Hearing Disability',
       'Taking BP Medication', 'Coronary Heart Disease',
       'Cognitive Disability', 'Physical Inactivity',
       'Core preventive services for older men', 'Annual Checkup',
       'Cancer (except skin)', 'Mental Health',
       'Independent Living Disability', 'Dental Visit', 'COPD',
       'Self-care Disability', 'Current Smoking', 'Physical Health',
       'Health Insurance', 'Sleep <7 hours', 'High Cholesterol',
       'Cholesterol Screening', 'Colorectal Cancer Screening',
       'Chronic Kidney Disease', 'Cervical Cancer Screening',
       'General Health', 'Core preventive services for older women'],
      dtype=object)

In [16]:
# for each Short_Question_Text in (Arthritis, Obesity, Current Asthma, High Blood Pressure, Depression, Diabetes, Coronary Heart Disease, Cancer, Chronic Kidney Disease)
# plot the average Data_Value for each year

# Arthritis
df_arthritis = df[df['Short_Question_Text'] == 'Obesity']

# mean of Data_Value for each state
df_arthritis_mean = df_arthritis.groupby(['Year', 'StateAbbr'])['Data_Value'].mean().reset_index()
df_arthritis_mean.head(500)


Unnamed: 0,Year,StateAbbr,Data_Value
0,2021,AK,38.075954
1,2021,AL,40.614073
2,2021,AR,39.620837
3,2021,AZ,36.075155
4,2021,CA,30.668347
5,2021,CO,26.407305
6,2021,CT,29.533803
7,2021,DC,25.05
8,2021,DE,36.332026
9,2021,GA,37.733845


In [6]:


# Obesity
df_obesity = df[df['Short_Question_Text'] == 'Obesity']
df_obesity = df_obesity.groupby('Year').mean()
df_obesity = df_obesity.reset_index()
df_obesity = df_obesity[['Year', 'Data_Value']]
df_obesity = df_obesity.rename(columns={'Data_Value': 'Obesity'})
df_obesity

# Current Asthma
df_asthma = df[df['Short_Question_Text'] == 'Current Asthma']
df_asthma = df_asthma.groupby('Year').mean()
df_asthma = df_asthma.reset_index()
df_asthma = df_asthma[['Year', 'Data_Value']]
df_asthma = df_asthma.rename(columns={'Data_Value': 'Current Asthma'})
df_asthma

# High Blood Pressure
df_blood_pressure = df[df['Short_Question_Text'] == 'High Blood Pressure']
df_blood_pressure = df_blood_pressure.groupby('Year').mean()
df_blood_pressure = df_blood_pressure.reset_index()
df_blood_pressure = df_blood_pressure[['Year', 'Data_Value']]
df_blood_pressure = df_blood_pressure.rename(columns={'Data_Value': 'High Blood Pressure'})
df_blood_pressure

# Depression
df_depression = df[df['Short_Question_Text'] == 'Depression']
df_depression = df_depression.groupby('Year').mean()
df_depression = df_depression.reset_index()
df_depression = df_depression[['Year', 'Data_Value']]
df_depression = df_depression.rename(columns={'Data_Value': 'Depression'})
df_depression

# Diabetes
df_diabetes = df[df['Short_Question_Text'] == 'Diabetes']
df_diabetes = df_diabetes.groupby('Year').mean()
df_diabetes = df_diabetes.reset_index()
df_diabetes = df_diabetes[['Year', 'Data_Value']]
df_diabetes = df_diabetes.rename(columns={'Data_Value': 'Diabetes'})
df_diabetes

# Coronary Heart Disease
df_heart_disease = df[df['Short_Question_Text'] == 'Coronary Heart Disease']
df_heart_disease = df_heart_disease.groupby('Year').mean()
df_heart_disease = df_heart_disease.reset_index()
df_heart_disease = df_heart_disease[['Year', 'Data_Value']]
df_heart_disease = df_heart_disease.rename(columns={'Data_Value': 'Coronary Heart Disease'})
df_heart_disease

# Cancer
df_cancer = df[df['Short_Question_Text'] == 'Cancer']
df_cancer = df_cancer.groupby('Year').mean()
df_cancer = df_cancer.reset_index()
df_cancer = df_cancer[['Year', 'Data_Value']]
df_cancer = df_cancer.rename(columns={'Data_Value': 'Cancer'})
df_cancer

# Chronic Kidney Disease
df_kidney_disease = df[df['Short_Question_Text'] == 'Chronic Kidney Disease']
df_kidney_disease = df_kidney_disease.groupby('Year').mean()
df_kidney_disease = df_kidney_disease.reset_index()
df_kidney_disease = df_kidney_disease[['Year', 'Data_Value']]
df_kidney_disease = df_kidney_disease.rename(columns={'Data_Value': 'Chronic Kidney Disease'})
df_kidney_disease

# merge all dataframes
df_merged = pd.merge(df_arthritis, df_obesity, on='Year')
df_merged = pd.merge(df_merged, df_asthma, on='Year')
df_merged = pd.merge(df_merged, df_blood_pressure, on='Year')
df_merged = pd.merge(df_merged, df_depression, on='Year')
df_merged = pd.merge(df_merged, df_diabetes, on='Year')
df_merged = pd.merge(df_merged, df_heart_disease, on='Year')
df_merged = pd.merge(df_merged, df_cancer, on='Year')
df_merged = pd.merge(df_merged, df_kidney_disease, on='Year')
df_merged

# plot
df_merged.plot(x='Year', y=['Arthritis', 'Obesity', 'Current Asthma', 'High Blood Pressure', 'Depression', 'Diabetes', 'Coronary Heart Disease', 'Cancer', 'Chronic Kidney Disease'], kind='line', figsize=(15, 10))
plt.title('Average Data_Value for each year')
plt.xlabel('Year')
plt.ylabel('Average Data_Value')
plt.show()

TypeError: agg function failed [how->mean,dtype->object]