In [1]:
import pandas as pd
import os

In [2]:
# Move directory from notebooks to main
os.chdir('..')
cwd = os.getcwd()

### Medicare Part D Prescribers by Geography and Drug (2022 flavor)

##### Load and view data

In [3]:
cms_df = pd.read_csv(cwd + f'\\data\\raw\\Medicare_Part_D_Prescribers_by_Geography_and_Drug_2022.csv')

cms_df.head()

  cms_df = pd.read_csv(cwd + f'\\data\\raw\\Medicare_Part_D_Prescribers_by_Geography_and_Drug_2022.csv')


Unnamed: 0,Prscrbr_Geo_Lvl,Prscrbr_Geo_Cd,Prscrbr_Geo_Desc,Brnd_Name,Gnrc_Name,Tot_Prscrbrs,Tot_Clms,Tot_30day_Fills,Tot_Drug_Cst,Tot_Benes,...,GE65_Tot_30day_Fills,GE65_Tot_Drug_Cst,GE65_Bene_Sprsn_Flag,GE65_Tot_Benes,LIS_Bene_Cst_Shr,NonLIS_Bene_Cst_Shr,Opioid_Drug_Flag,Opioid_LA_Drug_Flag,Antbtc_Drug_Flag,Antpsyct_Drug_Flag
0,National,,National,1st Tier Unifine Pentips,"Pen Needle, Diabetic",1061,2501,4473.6,70039.61,1147.0,...,3699.6,56669.52,,933.0,2459.79,13488.17,N,N,N,N
1,National,,National,1st Tier Unifine Pentips Plus,"Pen Needle, Diabetic",1372,3846,6524.3,114601.54,1474.0,...,5390.3,91959.96,,1201.0,3363.09,15703.8,N,N,N,N
2,National,,National,Abacavir,Abacavir Sulfate,3161,24329,30526.9,6950841.5,3453.0,...,17095.9,3686774.74,,1888.0,14365.22,181969.45,N,N,N,N
3,National,,National,Abacavir-Lamivudine,Abacavir Sulfate/Lamivudine,2635,24999,33465.6,11964197.97,3426.0,...,20397.9,6845076.51,,2068.0,20304.76,402641.34,N,N,N,N
4,National,,National,Abacavir-Lamivudine-Zidovudine,Abacavir/Lamivudine/Zidovudine,15,59,61.0,68772.61,15.0,...,40.0,43762.49,*,,1517.19,365.38,N,N,N,N


##### Inspect datatypes

In [4]:
cms_df.dtypes

Prscrbr_Geo_Lvl          object
Prscrbr_Geo_Cd           object
Prscrbr_Geo_Desc         object
Brnd_Name                object
Gnrc_Name                object
Tot_Prscrbrs              int64
Tot_Clms                  int64
Tot_30day_Fills         float64
Tot_Drug_Cst            float64
Tot_Benes               float64
GE65_Sprsn_Flag          object
GE65_Tot_Clms           float64
GE65_Tot_30day_Fills    float64
GE65_Tot_Drug_Cst       float64
GE65_Bene_Sprsn_Flag     object
GE65_Tot_Benes          float64
LIS_Bene_Cst_Shr        float64
NonLIS_Bene_Cst_Shr     float64
Opioid_Drug_Flag         object
Opioid_LA_Drug_Flag      object
Antbtc_Drug_Flag         object
Antpsyct_Drug_Flag       object
dtype: object

##### Convert flag columns to categorical data

In [5]:
# Convert flag columns to categorical data types
flag_columns = ['GE65_Sprsn_Flag', 'GE65_Bene_Sprsn_Flag', 'Opioid_Drug_Flag', 
                'Opioid_LA_Drug_Flag', 'Antbtc_Drug_Flag', 'Antpsyct_Drug_Flag']

for col in flag_columns:
    cms_df[col] = cms_df[col].astype('category')

# Confirm the changes
cms_df.dtypes

Prscrbr_Geo_Lvl           object
Prscrbr_Geo_Cd            object
Prscrbr_Geo_Desc          object
Brnd_Name                 object
Gnrc_Name                 object
Tot_Prscrbrs               int64
Tot_Clms                   int64
Tot_30day_Fills          float64
Tot_Drug_Cst             float64
Tot_Benes                float64
GE65_Sprsn_Flag         category
GE65_Tot_Clms            float64
GE65_Tot_30day_Fills     float64
GE65_Tot_Drug_Cst        float64
GE65_Bene_Sprsn_Flag    category
GE65_Tot_Benes           float64
LIS_Bene_Cst_Shr         float64
NonLIS_Bene_Cst_Shr      float64
Opioid_Drug_Flag        category
Opioid_LA_Drug_Flag     category
Antbtc_Drug_Flag        category
Antpsyct_Drug_Flag      category
dtype: object

##### Identify any missing or non-numerical values in numerical cols

In [6]:
# Check for missing values or anomalies in numerical columns
numerical_columns = ['Tot_Prscrbrs', 'Tot_Clms', 'Tot_30day_Fills', 
                     'Tot_Drug_Cst', 'Tot_Benes', 'GE65_Tot_Clms', 
                     'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst', 
                     'GE65_Tot_Benes', 'LIS_Bene_Cst_Shr', 
                     'NonLIS_Bene_Cst_Shr']

# Check for missing values
missing_values = cms_df[numerical_columns].isnull().sum()

# Check for non-numeric values (shouldn't be any since they are already typed correctly)
non_numeric_values = cms_df[numerical_columns].applymap(lambda x: isinstance(x, (int, float)))

missing_values, non_numeric_values.all().all()

(Tot_Prscrbrs                0
 Tot_Clms                    0
 Tot_30day_Fills             0
 Tot_Drug_Cst                0
 Tot_Benes               21743
 GE65_Tot_Clms           21046
 GE65_Tot_30day_Fills    21046
 GE65_Tot_Drug_Cst       21046
 GE65_Tot_Benes          48879
 LIS_Bene_Cst_Shr            0
 NonLIS_Bene_Cst_Shr         0
 dtype: int64,
 True)

##### Handle missing values
*Given the data, it seems that nulls might indicate no prescriptions or beneficiaries. We will fill null values with 0.*

In [7]:
# Fill missing values with 0 in the relevant numerical columns
cms_df[numerical_columns] = cms_df[numerical_columns].fillna(0)

# Verify that there are no more missing values
missing_values_after_fill = cms_df[numerical_columns].isnull().sum()
missing_values_after_fill

Tot_Prscrbrs            0
Tot_Clms                0
Tot_30day_Fills         0
Tot_Drug_Cst            0
Tot_Benes               0
GE65_Tot_Clms           0
GE65_Tot_30day_Fills    0
GE65_Tot_Drug_Cst       0
GE65_Tot_Benes          0
LIS_Bene_Cst_Shr        0
NonLIS_Bene_Cst_Shr     0
dtype: int64

##### Create standardization for locations for merging data

In [8]:
# Standardizing geographic information
cms_df['GEO_TYPE'] = cms_df['Prscrbr_Geo_Lvl'].map({'National': 'National', 'State': 'State'})

cms_df['GEO_VALUE'] = cms_df.apply(
    lambda row: 'United States' if row['GEO_TYPE'] == 'National' else
                ('Foreign Country' if row['Prscrbr_Geo_Desc'] == 'Foreign Country' else row['Prscrbr_Geo_Desc']),
    axis=1
)

# Update GEO_TYPE for 'Foreign Country'
cms_df.loc[cms_df['GEO_VALUE'] == 'Foreign Country', 'GEO_TYPE'] = 'Foreign Country'

# Display the first few rows to confirm the update
cms_df[['GEO_TYPE', 'GEO_VALUE']].head()

Unnamed: 0,GEO_TYPE,GEO_VALUE
0,National,United States
1,National,United States
2,National,United States
3,National,United States
4,National,United States


##### Drop initial location columns in favor of standardized versions

In [9]:
# Drop the original location-related columns
cms_df_cleaned = cms_df.drop(columns=['Prscrbr_Geo_Lvl', 'Prscrbr_Geo_Cd', 'Prscrbr_Geo_Desc'])

# Display the first few rows of the cleaned dataset to confirm the drop
cms_df_cleaned.head()

Unnamed: 0,Brnd_Name,Gnrc_Name,Tot_Prscrbrs,Tot_Clms,Tot_30day_Fills,Tot_Drug_Cst,Tot_Benes,GE65_Sprsn_Flag,GE65_Tot_Clms,GE65_Tot_30day_Fills,...,GE65_Bene_Sprsn_Flag,GE65_Tot_Benes,LIS_Bene_Cst_Shr,NonLIS_Bene_Cst_Shr,Opioid_Drug_Flag,Opioid_LA_Drug_Flag,Antbtc_Drug_Flag,Antpsyct_Drug_Flag,GEO_TYPE,GEO_VALUE
0,1st Tier Unifine Pentips,"Pen Needle, Diabetic",1061,2501,4473.6,70039.61,1147.0,,2018.0,3699.6,...,,933.0,2459.79,13488.17,N,N,N,N,National,United States
1,1st Tier Unifine Pentips Plus,"Pen Needle, Diabetic",1372,3846,6524.3,114601.54,1474.0,,3085.0,5390.3,...,,1201.0,3363.09,15703.8,N,N,N,N,National,United States
2,Abacavir,Abacavir Sulfate,3161,24329,30526.9,6950841.5,3453.0,,12988.0,17095.9,...,,1888.0,14365.22,181969.45,N,N,N,N,National,United States
3,Abacavir-Lamivudine,Abacavir Sulfate/Lamivudine,2635,24999,33465.6,11964197.97,3426.0,,14523.0,20397.9,...,,2068.0,20304.76,402641.34,N,N,N,N,National,United States
4,Abacavir-Lamivudine-Zidovudine,Abacavir/Lamivudine/Zidovudine,15,59,61.0,68772.61,15.0,,40.0,40.0,...,*,0.0,1517.19,365.38,N,N,N,N,National,United States


##### Filter data on drugs of interest for Chronic Conditions based on CDC data

In [10]:
# Expanded mapping of chronic conditions to drugs
condition_drug_mapping = {
    'Diabetes': ['Insulin', 'Metformin', 'Glipizide', 'Glyburide', 'Glimepiride', 'Sitagliptin', 'Saxagliptin', 'Empagliflozin', 'Dapagliflozin', 'Liraglutide', 'Exenatide'],
    'Hypertension': ['Lisinopril', 'Enalapril', 'Ramipril', 'Losartan', 'Valsartan', 'Olmesartan', 'Metoprolol', 'Atenolol', 'Carvedilol', 'Amlodipine', 'Nifedipine', 'Diltiazem', 'Hydrochlorothiazide', 'Furosemide', 'Spironolactone', 'Doxazosin', 'Prazosin', 'Clonidine', 'Methyldopa'],
    'High Cholesterol': ['Atorvastatin', 'Simvastatin', 'Rosuvastatin', 'Pravastatin', 'Cholestyramine', 'Colestipol', 'Ezetimibe', 'Alirocumab', 'Evolocumab', 'Fenofibrate', 'Gemfibrozil', 'Niacin'],
    'COPD': ['Albuterol', 'Levalbuterol', 'Salmeterol', 'Formoterol', 'Fluticasone', 'Budesonide', 'Beclomethasone', 'Ipratropium', 'Tiotropium', 'Roflumilast', 'Theophylline'],
    'Heart Disease': ['Warfarin', 'Apixaban', 'Rivaroxaban', 'Aspirin', 'Clopidogrel', 'Ticagrelor', 'Metoprolol', 'Carvedilol', 'Lisinopril', 'Enalapril', 'Ramipril', 'Losartan', 'Valsartan', 'Nitroglycerin', 'Isosorbide Mononitrate', 'Atorvastatin', 'Simvastatin'],
    'Arthritis': ['Ibuprofen', 'Naproxen', 'Diclofenac', 'Methotrexate', 'Hydroxychloroquine', 'Sulfasalazine', 'Etanercept', 'Adalimumab', 'Infliximab', 'Prednisone', 'Methylprednisolone'],
    'Asthma': ['Fluticasone', 'Budesonide', 'Beclomethasone', 'Albuterol', 'Salmeterol', 'Formoterol', 'Montelukast', 'Zafirlukast', 'Cromolyn Sodium'],
    'Depression': ['Fluoxetine', 'Sertraline', 'Citalopram', 'Escitalopram', 'Paroxetine', 'Venlafaxine', 'Duloxetine', 'Amitriptyline', 'Nortriptyline', 'Phenelzine', 'Tranylcypromine', 'Bupropion', 'Mirtazapine'],
    'Osteoporosis': ['Alendronate', 'Risedronate', 'Raloxifene', 'Calcitonin', 'Teriparatide', 'Denosumab'],
    'Chronic Kidney Disease (CKD)': ['Lisinopril', 'Ramipril', 'Losartan', 'Valsartan', 'Furosemide', 'Spironolactone', 'Sevelamer', 'Calcium Acetate', 'Epoetin Alfa', 'Darbepoetin Alfa']
}

# Flatten the mapping to create a list of drugs of interest
drugs_of_interest = [drug for drugs in condition_drug_mapping.values() for drug in drugs]

In [11]:
# Filter the CMS dataset to include only the drugs of interest
filtered_cms_df = cms_df_cleaned[
    (cms_df_cleaned['Gnrc_Name'].isin(drugs_of_interest))
]

# Display the first few rows of the filtered dataset
filtered_cms_df

Unnamed: 0,Brnd_Name,Gnrc_Name,Tot_Prscrbrs,Tot_Clms,Tot_30day_Fills,Tot_Drug_Cst,Tot_Benes,GE65_Sprsn_Flag,GE65_Tot_Clms,GE65_Tot_30day_Fills,...,GE65_Bene_Sprsn_Flag,GE65_Tot_Benes,LIS_Bene_Cst_Shr,NonLIS_Bene_Cst_Shr,Opioid_Drug_Flag,Opioid_LA_Drug_Flag,Antbtc_Drug_Flag,Antpsyct_Drug_Flag,GEO_TYPE,GEO_VALUE
16,Accolate,Zafirlukast,106,358,687.2,149766.10,91.0,,300.0,584.2,...,,79.0,2099.02,20150.04,N,N,N,N,National,United States
98,Aldactone,Spironolactone,518,1702,3997.2,535855.22,536.0,,1550.0,3660.5,...,,484.0,2325.48,83346.28,N,N,N,N,National,United States
122,Altace,Ramipril,1046,3762,8400.0,2225508.15,902.0,,3638.0,8113.0,...,,873.0,1945.44,354064.82,N,N,N,N,National,United States
132,Amaryl,Glimepiride,512,1980,4058.5,660714.59,458.0,,1890.0,3877.6,...,,437.0,1290.90,80094.66,N,N,N,N,National,United States
241,Atenolol,Atenolol,306359,7109333,17860514.5,67541477.89,1715128.0,,6507052.0,16609448.7,...,,1586916.0,1324966.94,18747970.55,N,N,N,N,National,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115323,Simvastatin,Simvastatin,23,695,1571.9,7063.54,186.0,,648.0,1486.9,...,,173.0,149.55,1795.28,N,N,N,N,Foreign Country,Foreign Country
115331,Spironolactone,Spironolactone,24,250,472.3,2506.43,81.0,,217.0,411.9,...,,70.0,21.82,619.00,N,N,N,N,Foreign Country,Foreign Country
115335,Sulfasalazine,Sulfasalazine,2,25,25.1,627.66,0.0,,25.0,25.1,...,*,0.0,0.00,158.00,N,N,N,N,Foreign Country,Foreign Country
115372,Valsartan,Valsartan,13,72,151.1,2231.36,28.0,,53.0,114.1,...,#,0.0,11.70,640.83,N,N,N,N,Foreign Country,Foreign Country


##### Remove national rows (not of interest in this analysis)

In [12]:
# Remove rows where GEO_TYPE is 'National' or 'Foreign Country'
cms_df_cleaned = cms_df_cleaned[~cms_df_cleaned['GEO_TYPE'].isin(['National', 'Foreign Country'])]

# Display the first few rows of the filtered CMS dataset to confirm
cms_df_cleaned.head()

Unnamed: 0,Brnd_Name,Gnrc_Name,Tot_Prscrbrs,Tot_Clms,Tot_30day_Fills,Tot_Drug_Cst,Tot_Benes,GE65_Sprsn_Flag,GE65_Tot_Clms,GE65_Tot_30day_Fills,...,GE65_Bene_Sprsn_Flag,GE65_Tot_Benes,LIS_Bene_Cst_Shr,NonLIS_Bene_Cst_Shr,Opioid_Drug_Flag,Opioid_LA_Drug_Flag,Antbtc_Drug_Flag,Antpsyct_Drug_Flag,GEO_TYPE,GEO_VALUE
3584,1st Tier Unifine Pentips Plus,"Pen Needle, Diabetic",10,22,44.3,801.55,0.0,#,0.0,0.0,...,*,0.0,21.7,51.52,N,N,N,N,State,Alabama
3585,Abacavir,Abacavir Sulfate,48,455,496.0,133761.31,56.0,,171.0,188.0,...,,20.0,745.42,2163.21,N,N,N,N,State,Alabama
3586,Abacavir-Lamivudine,Abacavir Sulfate/Lamivudine,30,228,254.0,95155.34,29.0,,70.0,76.0,...,*,0.0,790.95,4041.79,N,N,N,N,State,Alabama
3587,Abilify,Aripiprazole,14,77,103.0,78308.72,11.0,,22.0,28.0,...,*,0.0,322.6,360.0,N,N,N,Y,State,Alabama
3588,Abilify Maintena,Aripiprazole,253,5363,5372.1,14021217.83,689.0,,697.0,699.1,...,,88.0,10878.7,33224.34,N,N,N,Y,State,Alabama


##### Filter to data of interest

In [13]:
# Define the columns to keep
columns_to_keep = [
    'GEO_TYPE', 
    'GEO_VALUE',
    'Brnd_Name',
    'Gnrc_Name', 
    'Tot_Clms', 
    'Tot_Drug_Cst', 
    'Tot_Benes'
]

# Filter the CMS dataset to include only the relevant columns
cms_filtered_df = cms_df_cleaned[columns_to_keep]

# Display the first few rows of the filtered CMS dataset
cms_filtered_df

Unnamed: 0,GEO_TYPE,GEO_VALUE,Brnd_Name,Gnrc_Name,Tot_Clms,Tot_Drug_Cst,Tot_Benes
3584,State,Alabama,1st Tier Unifine Pentips Plus,"Pen Needle, Diabetic",22,801.55,0.0
3585,State,Alabama,Abacavir,Abacavir Sulfate,455,133761.31,56.0
3586,State,Alabama,Abacavir-Lamivudine,Abacavir Sulfate/Lamivudine,228,95155.34,29.0
3587,State,Alabama,Abilify,Aripiprazole,77,78308.72,11.0
3588,State,Alabama,Abilify Maintena,Aripiprazole,5363,14021217.83,689.0
...,...,...,...,...,...,...,...
114931,State,Unknown,Vyvanse,Lisdexamfetamine Dimesylate,25,10839.65,0.0
114932,State,Unknown,Warfarin Sodium,Warfarin Sodium,183,1614.41,33.0
114933,State,Unknown,Xarelto,Rivaroxaban,237,189565.65,69.0
114934,State,Unknown,Xtandi,Enzalutamide,19,260344.55,0.0


##### Send to final dataframe

In [23]:
# Save the processed CMS data to a CSV file
cms_filtered_df.to_csv(cwd + f'\\data\\processed\\processed_Medicare_Prescriptions_data.csv', index=False)

---
---

### CDC PLACES Local Data for Better Health (2023 flavor)

##### Load data

In [15]:
cdc_df = pd.read_csv(cwd + f'\\data\\raw\\PLACES__Local_Data_for_Better_Health__County_Data_2023_release_20240811.csv')

cdc_df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,...,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
0,2021,AR,Arkansas,Conway,BRFSS,Health Outcomes,Arthritis among adults aged >=18 years,%,Crude prevalence,33.9,...,,29.1,38.8,20873,5029,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis,POINT (-92.6892479 35.265702)
1,2020,AZ,Arizona,Maricopa,BRFSS,Prevention,Mammography use among women aged 50-74 years,%,Crude prevalence,69.0,...,,65.5,72.3,4496588,4013,PREVENT,MAMMOUSE,CrdPrv,Mammography,POINT (-112.4989296 33.3451756)
2,2021,AR,Arkansas,Jackson,BRFSS,Health Outcomes,Obesity among adults aged >=18 years,%,Crude prevalence,40.1,...,,32.5,47.8,16811,5067,HLTHOUT,OBESITY,CrdPrv,Obesity,POINT (-91.2232051 35.5964674)
3,2020,CA,California,Alameda,BRFSS,Health Outcomes,All teeth lost among adults aged >=65 years,%,Crude prevalence,6.7,...,,4.8,9.2,1648556,6001,HLTHOUT,TEETHLOST,CrdPrv,All Teeth Lost,POINT (-121.912488 37.6471385)
4,2021,AR,Arkansas,Ashley,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,Crude prevalence,25.4,...,,21.3,30.0,18674,5003,HLTHOUT,DEPRESSION,CrdPrv,Depression,POINT (-91.7722672 33.1908354)


##### Filter to only 2021 (most recent)

In [16]:
# Filter the CDC dataset to include only records from 2021
cdc_df_2021 = cdc_df[cdc_df['Year'] == 2021]

##### Standardize geo information

In [17]:
# Standardize geographic information in the CDC dataset
cdc_df_2021['GEO_TYPE'] = cdc_df_2021['StateDesc'].apply(lambda x: 'National' if x == 'United States' else 'State')
cdc_df_2021['GEO_VALUE'] = cdc_df_2021['StateDesc'].apply(lambda x: 'United States' if x == 'United States' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdc_df_2021['GEO_TYPE'] = cdc_df_2021['StateDesc'].apply(lambda x: 'National' if x == 'United States' else 'State')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdc_df_2021['GEO_VALUE'] = cdc_df_2021['StateDesc'].apply(lambda x: 'United States' if x == 'United States' else x)


In [18]:
# Aggregate county-level data to state level using a weighted average
cdc_state_df = cdc_df_2021[cdc_df_2021['GEO_TYPE'] == 'State'].copy()

In [19]:
# Calculate weighted average for each state and condition
cdc_state_aggregated = cdc_state_df.groupby(['StateDesc', 'Measure']).apply(
    lambda x: pd.Series({
        'Weighted_Average': (x['Data_Value'] * x['TotalPopulation']).sum() / x['TotalPopulation'].sum(),
        'Total_Population': x['TotalPopulation'].sum()
    })
).reset_index()

# Drop unnecessary columns, keeping only those of interest
columns_to_keep = ['StateDesc', 'Measure', 'GEO_TYPE', 'GEO_VALUE']
cdc_state_df_cleaned = cdc_state_df[columns_to_keep]

# Merge the aggregated results back with the cleaned data
cdc_state_aggregated = pd.merge(
    cdc_state_aggregated, 
    cdc_state_df_cleaned,
    on=['StateDesc', 'Measure'], 
    how='left'
).drop_duplicates()

# Drop the 'StateDesc' column after merging
cdc_state_aggregated = cdc_state_aggregated.drop(columns=['StateDesc'])


##### Filter to Measures of interest

In [20]:
# Define the relevant measures and their shorthand names
measure_shorthand_mapping = {
    'Diagnosed diabetes among adults aged >=18 years': 'Diabetes',
    'High blood pressure among adults aged >=18 years': 'Hypertension',
    'High cholesterol among adults aged >=18 years who have been screened in the past 5 years': 'High_Cholesterol',
    'Chronic obstructive pulmonary disease among adults aged >=18 years': 'COPD',
    'Coronary heart disease among adults aged >=18 years': 'Heart_Disease',
    'Arthritis among adults aged >=18 years': 'Arthritis',
    'Current asthma among adults aged >=18 years': 'Asthma',
    'Depression among adults aged >=18 years': 'Depression',
    'Chronic kidney disease among adults aged >=18 years': 'CKD'
}

# Filter the CDC dataset to include only the relevant measures and add the shorthand column
cdc_filtered_df = cdc_state_aggregated[cdc_state_aggregated['Measure'].isin(measure_shorthand_mapping.keys())]

# Add the shorthand column
cdc_filtered_df['Measure_Short'] = cdc_filtered_df['Measure'].map(measure_shorthand_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdc_filtered_df['Measure_Short'] = cdc_filtered_df['Measure'].map(measure_shorthand_mapping)


##### Re-order cols to final

In [21]:
# Reorder the columns as specified
cdc_filtered_df = cdc_filtered_df[['GEO_TYPE', 'GEO_VALUE', 'Measure', 'Measure_Short', 'Weighted_Average', 'Total_Population']]

# Display the first few rows of the reordered CDC dataset
cdc_filtered_df.head()


Unnamed: 0,GEO_TYPE,GEO_VALUE,Measure,Measure_Short,Weighted_Average,Total_Population
134,State,Alabama,Arthritis among adults aged >=18 years,Arthritis,29.802001,10079754.0
670,State,Alabama,Chronic kidney disease among adults aged >=18 ...,CKD,3.272161,10079754.0
804,State,Alabama,Chronic obstructive pulmonary disease among ad...,COPD,7.756373,10079754.0
1072,State,Alabama,Coronary heart disease among adults aged >=18 ...,Heart_Disease,6.619037,10079754.0
1206,State,Alabama,Current asthma among adults aged >=18 years,Asthma,10.423398,10079754.0


##### Send to dataframe

In [22]:
# Save the processed CDC data to a CSV file
cdc_filtered_df.to_csv(cwd + f'/data/processed/processed_CDC_PLACES_data.csv', index=False)