In [235]:
import requests
import subprocess
import sys
import os
from urllib.request import urlretrieve
from zipfile import ZipFile

# Function to download the file from Google Drive
def download_file_from_google_drive(file_id, destination):
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    response = requests.get(url)
    if response.status_code == 200:
        with open(destination, 'wb') as f:
            f.write(response.content)
        print(f"{destination} downloaded successfully.")
    else:
        print(f"Failed to download {destination}.")

# Download the requirements.txt file from Google Drive
file_id = "1e8vmGO8Z9g-QoDW9N5u9jACtx_qCVuM5"
destination = "requirements.txt"
download_file_from_google_drive(file_id, destination)

# Function to install missing packages
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Read the requirements.txt file and store libraries with their versions
libraries = {}

with open('requirements.txt', 'r') as f:
    for line in f:
        # Each line in the file is in the form 'library==version'
        if '==' in line:
            lib, version = line.strip().split('==')
            libraries[lib] = version

# Standard libraries that do not require installation
standard_libraries = [
    'os', 'zipfile', 'urllib', 'csv', 'math'
]

# Check third-party libraries and install them if missing
for lib, version in libraries.items():
    try:
        __import__(lib)
        print(f"{lib} is already installed.")
    except ImportError:
        print(f"{lib} not found. Installing...")
        install_package(f"{lib}=={version}")

# Confirm standard libraries are available (they should be, as they are part of Python)
for lib in standard_libraries:
    try:
        __import__(lib)
        print(f"Standard library {lib} is available.")
    except ImportError:
        print(f"Standard library {lib} not found, but it should be included in the Python standard library.")

# Import the third-party libraries with shorthand notations
import requests
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import t

requirements.txt downloaded successfully.
requests is already installed.
pandas is already installed.
numpy is already installed.
altair is already installed.
vega_datasets is already installed.
scikit-learn not found. Installing...
seaborn is already installed.
matplotlib is already installed.
statsmodels is already installed.
scipy is already installed.
Standard library os is available.
Standard library zipfile is available.
Standard library urllib is available.
Standard library csv is available.
Standard library math is available.


In [236]:
def download_file(download_f, filename, force=False):
    """
    Downloads a file using a dynamic function.

    :param download_f: Function to be used for the actual download operation
    :param filename: File name for saving
    :param force: Whether to overwrite an existing file
    """
    if force or not os.path.exists(filename):
        download_f(filename)

In [237]:
def download_cdc_survey(filename):
    """
    Downloads the cdc survey dataset.

    :param filename: File name for saving
    """
    def download(filename):
        """
        Actual download logic for the personal cdc survey dataset

        :param filename: File name for saving
        """
        url = 'https://www.cdc.gov/brfss/annual_data/2022/files/LLCP2022ASC.zip'
        urlretrieve(url, filename)
    download_file(download, filename)
    with ZipFile(filename, 'r') as zObject:
        zObject.extractall(path='./cdc_survey_unzipped/')


In [238]:
def download_heart_disease_mortality_by_county(filename):
    """
    Downloads the mortality dataset.

    :param filename: File name for saving
    """
    def download(filename):
        """
        Actual download logic for the personal income dataset

        :param filename: File name for saving
        """
        url = 'https://data.cdc.gov/resource/jiwm-ppbh.csv?$limit=1000000'
        urlretrieve(url, filename)
    download_file(download, filename)


In [239]:
def download_covid_dataset(filename):
    """
    Downloads the CDC COVID-19 mortality dataset.

    :param filename: File name for saving
    """
    def download(filename):
        """
        Actual download logic for the CDC COVID-19 dataset.

        :param filename: File name for saving
        """
        url = 'https://data.cdc.gov/resource/hk9y-quqm.csv?$limit=1000000'
        urlretrieve(url, filename)

    download_file(download, filename)

In [240]:
heart_disease_mortality_by_county_csv_filename = 'mortality.csv'
download_heart_disease_mortality_by_county(heart_disease_mortality_by_county_csv_filename)

cdc_survey_zip_filename = 'cdc_survey.zip'
download_cdc_survey(cdc_survey_zip_filename)

covid_dataset_csv_filename = 'covid_mortality.csv'
download_covid_dataset(covid_dataset_csv_filename)

Add later for the parsing of the COVID dataset

In [241]:
state_code_to_abbr = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO', 9: 'CT', 10: 'DE', 11: 'DC', 12: 'FL', 13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN', 19: 'IA', 20: 'KS',
    21: 'KY', 22: 'LA', 23: 'ME', 24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN', 28: 'MS', 29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH', 34: 'NJ', 35: 'NM', 36: 'NY',
    37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA', 44: 'RI', 45: 'SC', 46: 'SD', 47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA', 53: 'WA', 54: 'WV', 55: 'WI', 56: 'WY'
    }

def parse_cdc_survey():
  """
  Reads the CDC survey input file and parses the contents using ASCII format.

  :return: CDC survey dataframe.
  """
  rows = []
  with open('./cdc_survey_unzipped/LLCP2022.ASC ', 'r') as f:
    count = 0
    for line in f:
      row = {}
      state_code = int(line[:2])
      interview_completed = line[31:35] == '1100'
      if interview_completed and state_code in state_code_to_abbr:
        row['State Abbr.'] = state_code_to_abbr[state_code]
        row['Interview Year'] = line[22:26]

        age_older_18_landline = line[67]
        if age_older_18_landline == '1':
          row['Is Adult Landline'] = True
        elif age_older_18_landline == '2':
          row['Is Adult Landline'] = False

        gender = line[1979]
        if gender == '1':
          row['Gender'] = 'Male'
        elif gender == '2':
          row['Gender'] = 'Female'

        num_of_adults_landline = line[69:71].strip()
        if num_of_adults_landline:
          row['Num of Adults Landline'] = int(num_of_adults_landline)


        age_older_18_cell = line[80]
        if age_older_18_cell == '1':
          row['Is Adult Cell'] = True

        num_of_adults_cell = line[88:90].strip()
        if num_of_adults_cell and int(num_of_adults_cell) <= 76:
          row['Num of Adults Cell'] = int(num_of_adults_cell)

        general_health = line[100].strip()
        if general_health == '1':
          row['General Health'] = 'Excellent'
        elif general_health == '2':
          row['General Health'] = 'Very good'
        elif general_health == '3':
          row['General Health'] = 'Good'
        elif general_health == '4':
          row['General Health'] = 'Fair'
        elif general_health == '5':
          row['General Health'] = 'Poor'

        health_care_access = line[107:109].strip()
        if health_care_access:
          health_care_access_code = int(health_care_access)
          if health_care_access_code == 1:
            row['Health Care Access'] = 'from_employer'
          elif health_care_access_code == 2:
            row['Health Care Access'] = 'from_own_pocket'
          elif health_care_access_code == 3:
            row['Health Care Access'] = 'medicare'
          elif health_care_access_code == 4:
            row['Health Care Access'] = 'medigap'
          elif health_care_access_code == 5:
            row['Health Care Access'] = 'medicaid'
          elif health_care_access_code == 6:
            row['Health Care Access'] = 'children_health_insurance_program'
          elif health_care_access_code == 7:
            row['Health Care Access'] = 'military'
          elif health_care_access_code == 8:
            row['Health Care Access'] = 'indian_health'
          elif health_care_access_code == 9:
            row['Health Care Access'] = 'state_sponsored'
          elif health_care_access_code == 10:
            row['Health Care Access'] = 'other_government'
          elif health_care_access_code == 88:
            row['Health Care Access'] = 'no_coverage'

        could_afford_doctor = line[110].strip()
        if could_afford_doctor:
          could_afford_doctor_code = int(could_afford_doctor)
          if could_afford_doctor_code == 1:
            row['Could Afford Doctor'] = True
          elif could_afford_doctor_code == 2:
            row['Could Afford Doctor'] = False

        how_many_years_since_last_checkup = line[111].strip()
        if how_many_years_since_last_checkup:
          how_many_years_since_last_checkup_code = int(how_many_years_since_last_checkup)
          if how_many_years_since_last_checkup_code == 1:
            row['Years Since Last Checkup'] = 'within_past_year'
          elif how_many_years_since_last_checkup_code == 2:
            row['Years Since Last Checkup'] = 'within_past_two_years'
          elif how_many_years_since_last_checkup_code == 3:
            row['Years Since Last Checkup'] = 'within_past_five_years'
          elif how_many_years_since_last_checkup_code == 4:
            row['Years Since Last Checkup'] = 'five_or_more_years'

        exercise_in_past_30_days = line[112].strip()
        if exercise_in_past_30_days:
          exercise_in_past_30_days_code = int(exercise_in_past_30_days)
          if exercise_in_past_30_days_code == 1:
            row['Exercise in Past 30 Days'] = True
          elif exercise_in_past_30_days_code == 2:
            row['Exercise in Past 30 Days'] = False

        hours_of_sleeping = line[113:115].strip()
        if hours_of_sleeping:
          hours_of_sleeping_code = int(hours_of_sleeping)
          if hours_of_sleeping_code <= 24:
            row['Hours of Sleeping'] = hours_of_sleeping_code

        how_many_years_since_last_dentist_visit = line[115].strip()
        if how_many_years_since_last_dentist_visit:
          how_many_years_since_last_dentist_visit_code = int(how_many_years_since_last_dentist_visit)
          if how_many_years_since_last_dentist_visit_code == 1:
            row['Years Since Last Dentist Visit'] = 'within_past_year'
          elif how_many_years_since_last_dentist_visit_code == 2:
            row['Years Since Last Dentist Visit'] = 'within_past_two_years'
          elif how_many_years_since_last_dentist_visit_code == 3:
            row['Years Since Last Dentist Visit'] = 'within_past_five_years'
          elif how_many_years_since_last_dentist_visit_code == 4:
            row['Years Since Last Dentist Visit'] = 'five_or_more_years'

        heart_attack = line[117].strip()
        if heart_attack:
          if heart_attack == '1':
            row['Heart Attack'] = True
          elif heart_attack == '2':
            row['Heart Attack'] = False

        heart_disease = line[118].strip()
        if heart_disease:
          if heart_disease == '1':
            row['Heart Disease'] = True
          elif heart_disease == '2':
            row['Heart Disease'] = False

        stroke = line[119].strip()
        if stroke:
          if stroke == '1':
            row['Stroke'] = True
          elif stroke == '2':
            row['Stroke'] = False

        asthma = line[1908].strip()
        if asthma:
          if asthma == '1':
            row['Asthma'] = False
          elif asthma == '2':
            row['Asthma'] = True

        asthma_in_effect = line[1909].strip()
        if asthma_in_effect:
          if asthma_in_effect == '2':
            row['Asthma in Effect'] = True
          elif asthma_in_effect == '1':
            row['Asthma in Effect'] = False

        depression = line[125].strip()
        if depression:
          if depression == '1':
            row['Depression'] = True
          elif depression == '2':
            row['Depression'] = False

        kidney_disease = line[126].strip()
        if kidney_disease:
          if kidney_disease == '1':
            row['Kidney Disease'] = True
          elif kidney_disease == '2':
            row['Kidney Disease'] = False

        arthritis = line[1911].strip()
        if arthritis:
          if arthritis == '1':
            row['Arthritis'] = True
          elif arthritis == '2':
            row['Arthritis'] = False

        diabetes = line[128].strip()
        if diabetes:
          if diabetes == '1':
            row['Diabetes'] = True
          elif diabetes in ['2', '3', '4']:
            row['Diabetes'] = False

        weight_in_pounds = line[188:192].strip()
        if weight_in_pounds:
          weight_in_pounds_code = int(weight_in_pounds)
          if weight_in_pounds_code >= 50 and weight_in_pounds_code <= 776:
            row['Weight in Pounds'] = weight_in_pounds_code

        height_in_inches = line[1986:1989].strip()
        if height_in_inches:
          height_in_inches_code = int(height_in_inches)
          row['Height in Inches'] = height_in_inches_code


        smoking = line[223].strip()
        if smoking:
          if smoking == '1':
            row['Smoking'] = 'every_day'
          elif smoking == '2':
            row['Smoking'] = 'some_days'
          elif smoking == '3':
            row['Smoking'] = 'never'

        cigarettes_per_day = line[232:235].strip()
        if cigarettes_per_day:
          cigarettes_per_day_code = int(cigarettes_per_day)
          if cigarettes_per_day_code <= 300:
            row['Cigarettes per Day'] = cigarettes_per_day_code

        drinks_last_30_day = line[241:243].strip()
        if drinks_last_30_day:
          drinks_last_30_day_code = int(drinks_last_30_day)
          if drinks_last_30_day_code <= 76:
            row['Drinks in Last 30 Days'] = drinks_last_30_day_code

        pre_diabetes = line[269].strip()
        if pre_diabetes:
          if pre_diabetes == '1':
            row['Prediabetes'] = True
          elif pre_diabetes in ['2', '3']:
            row['Prediabetes'] = False

        diabetes = line[270].strip()
        if diabetes:
          if diabetes in ['1', '2']:
            row['Diabetes'] = True

        shortness_of_breath = line[305].strip()
        if shortness_of_breath:
          if shortness_of_breath == '1':
            row['Shortness of Breath'] = True
          elif shortness_of_breath == '2':
            row['Shortness of Breath'] = False

        life_satisfaction = line[360].strip()
        if life_satisfaction:
          if life_satisfaction == '1':
            row['Life Satisfaction'] = 'very_satisfied'
          elif life_satisfaction == '2':
            row['Life Satisfaction'] = 'satisfied'
          elif life_satisfaction == '3':
            row['Life Satisfaction'] = 'dissatisfied'
          elif life_satisfaction == '4':
            row['Life Satisfaction'] = 'very_dissatisfied'

        ethnicity = line[1975].strip()
        if ethnicity:
          if ethnicity == '1':
            row['Ethnicity'] = 'White'
          elif ethnicity == '2':
            row['Ethnicity'] = 'Black'
          elif ethnicity == '3':
            row['Ethnicity'] = 'American Indian and Alaskan Native'
          elif ethnicity in ['4', '5']:
            row['Ethnicity'] = 'Asian and Pacific Islander'
          elif ethnicity == '8':
            row['Ethnicity'] = 'Hispanic'
          elif ethnicity == '7':
            row['Ethnicity'] = 'Multiracial, non-Hispanic'

        metropolitan = line[1401].strip()
        if metropolitan:
          if metropolitan == '1':
            row['Metropolitan'] = True
          elif metropolitan == '2':
            row['Metropolitan'] = False

        age_group = line[1985].strip()
        if age_group:
          if age_group == '1':
            row['Age Group'] = 'between_18_and_24'
          elif age_group == '2':
            row['Age Group'] = 'between_25_and_34'
          elif age_group == '3':
            row['Age Group'] = 'between_35_and_44'
          elif age_group == '4':
            row['Age Group'] = 'between_45_and_54'
          elif age_group == '5':
            row['Age Group'] = 'between_55_and_64'
          elif age_group == '6':
            row['Age Group'] = 'older_than_65'

        bmi_category = line[2001].strip()
        if bmi_category:
          if bmi_category == '1':
            row['BMI Category'] = 'underweight'
          elif bmi_category == '2':
            row['BMI Category'] = 'normal_weight'
          elif bmi_category == '3':
            row['BMI Category'] = 'over_weight'
          elif bmi_category == '4':
            row['BMI Category'] = 'obese'

        education = line[2004].strip()
        if education:
          if education == '1':
            row['Education'] = 'did_not_graduate_high_school'
          elif education == '2':
            row['Education'] = 'graduated_high_school'
          elif education == '3':
            row['Education'] = 'attended_college'
          elif education == '4':
            row['Education'] = 'graduated_college'

        income_level = line[2005].strip()
        if income_level:
          if income_level == '1':
            row['Income Level'] = 'less_than_15K'
          elif income_level == '2':
            row['Income Level'] = 'between_15K_and_25K'
          elif income_level == '3':
            row['Income Level'] = 'between_25K_and_35K'
          elif income_level == '4':
            row['Income Level'] = 'between_35K_and_50K'
          elif income_level == '5':
            row['Income Level'] = 'between_50K_and_100K'
          elif income_level == '6':
            row['Income Level'] = 'between_100K_and_200K'
          elif income_level == '7':
            row['Income Level'] = 'more_than_200K'

        heavy_alcohol_consumption = line[2047].strip()
        if heavy_alcohol_consumption:
          if heavy_alcohol_consumption == '1':
            row['Heavy Alcohol Consumption'] = False
          elif heavy_alcohol_consumption == '2':
            row['Heavy Alcohol Consumption'] = True

        rows.append(row)

    df = pd.DataFrame.from_dict(rows).astype({'Is Adult Landline': bool, 'Could Afford Doctor': bool, 'Exercise in Past 30 Days': bool, 'Heart Attack': bool, 'Heart Disease': bool,
                                              'Stroke': bool, 'Asthma': bool, 'Asthma in Effect': bool, 'Depression': bool, 'Kidney Disease': bool, 'Arthritis': bool,
                                              'Diabetes': bool, 'Metropolitan': bool, 'Heavy Alcohol Consumption': bool, 'Prediabetes': bool, 'Shortness of Breath': bool
                                              }
                                             )

    return df

def parse_mortality():
    """
    Reads the mortality input file and parses the contents using CSV format.

    :return: Mortality dataframe.
    """
    df = pd.read_csv('mortality.csv')
    return df

def parse_covid_dataset():
    """
    Reads the COVID input file and parses the contents with additional cleaning steps.

    :return: Cleaned COVID dataset as a pandas dataframe.
    """
    # Step 1: Read the COVID dataset
    df = pd.read_csv('covid_mortality.csv')

    # Step 2: Ensure the relevant columns are present
    relevant_columns = [
        'state', 'condition_group', 'condition', 'age_group',
        'covid_19_deaths', 'number_of_mentions', 'start_date', 'end_date', 'data_as_of', 'year', 'month', 'flag', 'group'
    ]

    if not all(col in df.columns for col in relevant_columns):
        raise ValueError("One or more required columns are missing from the dataset.")

    # Step 3: Filter out rows where 'state' is missing or irrelevant
    df_filtered = df[df['state'].notna()]

    # Step 4: Clean and standardize state names if necessary (e.g., convert full names to abbreviations)
    df_filtered['state'] = df_filtered['state'].str.upper()

    # Step 5: Handle missing data for 'covid_19_deaths' and 'number_of_mentions'
    df_filtered['covid_19_deaths'] = df_filtered['covid_19_deaths'].fillna(0).astype(int)
    df_filtered['number_of_mentions'] = df_filtered['number_of_mentions'].fillna(0).astype(int)

    # Step 6: Remove rows where the state is 'UNITED STATES'
    df_filtered = df_filtered[df_filtered['state'] != 'UNITED STATES']

    # Step 7: Drop irrelevant columns
    df_filtered = df_filtered.drop(columns=['data_as_of', 'year', 'month', 'flag', 'number_of_mentions'])

    # Step 8: Ensure 'start_date' and 'end_date' are in datetime format
    df_filtered['start_date'] = pd.to_datetime(df_filtered['start_date'])
    df_filtered['end_date'] = pd.to_datetime(df_filtered['end_date'])

    # Step 9: Calculate the time difference in days, months, and years
    days_difference = (df_filtered['end_date'] - df_filtered['start_date']).dt.days
    months_difference = days_difference / 30.44
    years_difference = days_difference / 365.25  # Accounts for leap years

    # Step 10: Create new columns for daily, monthly, and yearly averages of covid_19_deaths
    df_filtered['daily_avg'] = df_filtered['covid_19_deaths'] / days_difference
    df_filtered['monthly_avg'] = df_filtered['covid_19_deaths'] / months_difference
    df_filtered['yearly_avg'] = df_filtered['covid_19_deaths'] / years_difference

    # Step 11: Drop rows where 'age_group' is 'Not stated' or 'All Ages'
    df_filtered = df_filtered[~df_filtered['age_group'].isin(['Not stated', 'All Ages'])]

    # Step 12: Drop rows where the condition is 'COVID-19' (as we focus on pre-existing conditions)
    df_filtered = df_filtered[df_filtered['condition'] != 'COVID-19']

    # Step 13: Drop the columns 'start_date', 'end_date', 'covid_19_deaths', and 'group'
    df_filtered = df_filtered.drop(columns=['start_date', 'end_date', 'covid_19_deaths', 'group'])

    # Step 14: Reset the index
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

Loading of the Parsed BRFSS Survey

In [242]:
df_cdc_survey = parse_cdc_survey()
df_cdc_survey.head(20)

Unnamed: 0,State Abbr.,Interview Year,Is Adult Landline,Gender,Num of Adults Landline,General Health,Could Afford Doctor,Years Since Last Checkup,Exercise in Past 30 Days,Hours of Sleeping,...,BMI Category,Income Level,Smoking,Cigarettes per Day,Drinks in Last 30 Days,Prediabetes,Years Since Last Dentist Visit,Is Adult Cell,Num of Adults Cell,Shortness of Breath
0,AL,2022,True,Female,2.0,Very good,False,within_past_year,False,8.0,...,,,,,,True,,,,True
1,AL,2022,True,Female,2.0,Excellent,False,,False,6.0,...,over_weight,between_25K_and_35K,,,,True,,,,True
2,AL,2022,True,Female,1.0,Very good,False,within_past_year,True,5.0,...,over_weight,between_100K_and_200K,,,,True,,,,True
3,AL,2022,True,Female,3.0,Excellent,False,within_past_year,True,7.0,...,normal_weight,,some_days,2.0,,True,,,,True
4,AL,2022,True,Female,2.0,Fair,False,within_past_year,True,9.0,...,normal_weight,between_25K_and_35K,,,2.0,True,,,,True
5,AL,2022,True,Male,1.0,Poor,False,within_past_year,False,7.0,...,over_weight,,,,,True,,,,True
6,AL,2022,True,Female,1.0,Very good,False,within_past_year,True,7.0,...,normal_weight,between_50K_and_100K,never,35.0,2.0,True,,,,True
7,AL,2022,True,Female,1.0,Good,False,within_past_year,False,8.0,...,over_weight,between_50K_and_100K,,,,True,,,,True
8,AL,2022,True,Female,2.0,Good,False,within_past_year,True,6.0,...,,between_50K_and_100K,never,5.0,,True,,,,True
9,AL,2022,True,Female,2.0,Good,False,within_past_year,True,7.0,...,over_weight,between_50K_and_100K,,,1.0,True,,,,True


Loading of the Parsed Mortality Dataset

In [243]:
df_mortality = parse_mortality()
df_mortality.head()

Unnamed: 0,year,locationabbr,locationdesc,geographiclevel,datasource,class,topic,data_value,data_value_unit,data_value_type,data_value_footnote_symbol,data_value_footnote,stratificationcategory1,stratification1,stratificationcategory2,stratification2,topicid,locationid,y_lat,x_lon
0,2019,AK,Aleutians East,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,182.4,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",,,Gender,Overall,Race/Ethnicity,Overall,T2,2013,55.442393,-161.95993
1,2019,AK,Aleutians West,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,172.6,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",,,Gender,Overall,Race/Ethnicity,Overall,T2,2016,53.648343,-166.917533
2,2019,AK,Anchorage,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,255.6,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",,,Gender,Overall,Race/Ethnicity,Overall,T2,2020,61.159145,-149.104559
3,2019,AK,Bethel,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,343.4,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",,,Gender,Overall,Race/Ethnicity,Overall,T2,2050,60.923648,-159.753262
4,2019,AK,Bristol Bay,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",~,Insufficient Data,Gender,Overall,Race/Ethnicity,Overall,T2,2060,58.753496,-156.694662


Mortality dataset has statistics at both state and county levels. We will first focus our efforts on the counties.

25841 values (44.5%) from the data_value are missing in the mortality dataset at county level. We need imputation to avoid losing valuable information.

In [244]:
df_mortality_county = df_mortality[df_mortality['geographiclevel'] == 'County'].copy()

#Commented out for conciseness. Uncomment to see number of missing values per column.
#df_mortality_county.isnull().sum()

We will impute missing data by averaging the values from four closest neigbors. We first convert the lat and lon coordinates into radians because we will use those shortly for imputation.

In [245]:
for col in df_mortality_county[["y_lat", "x_lon"]]:
    rad = np.deg2rad(df_mortality_county[col].values)
    df_mortality_county[f'{col}_rad'] = rad

df_mortality_county_without_nan = df_mortality_county[df_mortality_county['data_value'].notna()].copy()

Ball tree is a data structure that can handle geospatial data for efficient search. With the help of a ball tree, we can reduce the search time for all missing data from
$O(n^2)$ down to
$O(nlogn)$

Ball trees work on radian data and that is why we did that conversion above.

In [246]:
ball = BallTree(df_mortality_county_without_nan[['y_lat_rad', 'x_lon_rad']].values, metric='haversine')

The function below finds the four nearest neigbour counties and averages the data from them.

In [247]:
def impute_from_neigbours(row):
    """
    Imputes missing information from the closest neigbors.

    :param row: Current row of the dataset to be processed.
    """
    if np.isnan(row['data_value']):
        _, indices = ball.query([row[['y_lat_rad', 'x_lon_rad']].values], k=4)
        row['data_value'] = df_mortality_county_without_nan.iloc[indices[0]]['data_value'].mean()
    return row

In [248]:
df_mortality_county_imputed = df_mortality_county.apply(impute_from_neigbours, axis=1)

In [249]:
df_covid = parse_covid_dataset()
df_covid.head()

Unnamed: 0,state,condition_group,condition,icd10_codes,age_group,daily_avg,monthly_avg,yearly_avg
0,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.014695,0.447318,5.367377
1,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,0.07568,2.303688,27.641991
2,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,0.169728,5.166525,61.993204
3,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,0.402645,12.256517,147.066128
4,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,0.876561,26.682528,320.164034


**New York City is treated as a seperate entitiy, for the time being lets leave it to see if we can explore Metro differences**

In [250]:
df_cdc_survey.shape

(344751, 38)

In [251]:
df_mortality.shape

(59094, 20)

In [252]:
df_covid.shape

(466400, 8)

In [253]:
print(df_cdc_survey['State Abbr.'].unique())
print(df_mortality['locationabbr'].unique())
print(df_covid['state'].unique())


['AL' 'AK' 'AZ' 'AR' 'CA' 'CO' 'CT' 'DE' 'DC' 'FL' 'GA' 'HI' 'ID' 'IL'
 'IN' 'IA' 'KS' 'KY' 'LA' 'ME' 'MD' 'MA' 'MI' 'MN' 'MS' 'MO' 'MT' 'NE'
 'NV' 'NH' 'NJ' 'NM' 'NY' 'NC' 'ND' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD'
 'TN' 'TX' 'UT' 'VT' 'VA' 'WA' 'WV' 'WI' 'WY']
['AK' 'AL' 'AR' 'AS' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'GU' 'HI'
 'IA' 'ID' 'IL' 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MP'
 'MS' 'MT' 'NC' 'ND' 'NE' 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA'
 'PR' 'RI' 'SC' 'SD' 'TN' 'TX' 'US' 'UT' 'VA' 'VI' 'VT' 'WA' 'WI' 'WV'
 'WY']
['ALABAMA' 'ALASKA' 'ARIZONA' 'ARKANSAS' 'CALIFORNIA' 'COLORADO'
 'CONNECTICUT' 'DELAWARE' 'DISTRICT OF COLUMBIA' 'FLORIDA' 'GEORGIA'
 'HAWAII' 'IDAHO' 'ILLINOIS' 'INDIANA' 'IOWA' 'KANSAS' 'KENTUCKY'
 'LOUISIANA' 'MAINE' 'MARYLAND' 'MASSACHUSETTS' 'MICHIGAN' 'MINNESOTA'
 'MISSISSIPPI' 'MISSOURI' 'MONTANA' 'NEBRASKA' 'NEVADA' 'NEW HAMPSHIRE'
 'NEW JERSEY' 'NEW MEXICO' 'NEW YORK' 'NEW YORK CITY' 'NORTH CAROLINA'
 'NORTH DAKOTA' 'OHIO' 'OKLAHOMA' '

In [254]:
# Dictionary to map state names to abbreviations
state_name_to_abbr = {
    'ALABAMA': 'AL', 'ALASKA': 'AK', 'ARIZONA': 'AZ', 'ARKANSAS': 'AR', 
    'CALIFORNIA': 'CA', 'COLORADO': 'CO', 'CONNECTICUT': 'CT', 'DELAWARE': 'DE', 
    'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'GEORGIA': 'GA', 'HAWAII': 'HI', 
    'IDAHO': 'ID', 'ILLINOIS': 'IL', 'INDIANA': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 
    'KENTUCKY': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 
    'MASSACHUSETTS': 'MA', 'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 
    'MISSOURI': 'MO', 'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 
    'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK': 'NY', 
    'NEW YORK CITY': 'NYC', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'OHIO': 'OH', 
    'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA', 'RHODE ISLAND': 'RI', 
    'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN', 'TEXAS': 'TX', 
    'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 
    'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY', 'PUERTO RICO': 'PR'
}

# Map state names to abbreviations in df_covid
df_covid['state_abbr'] = df_covid['state'].map(state_name_to_abbr)

print(df_covid[['state', 'state_abbr']].head())


     state state_abbr
0  ALABAMA         AL
1  ALABAMA         AL
2  ALABAMA         AL
3  ALABAMA         AL
4  ALABAMA         AL


In [255]:
df_covid.head(5)

Unnamed: 0,state,condition_group,condition,icd10_codes,age_group,daily_avg,monthly_avg,yearly_avg,state_abbr
0,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.014695,0.447318,5.367377,AL
1,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,0.07568,2.303688,27.641991,AL
2,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,0.169728,5.166525,61.993204,AL
3,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,0.402645,12.256517,147.066128,AL
4,ALABAMA,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,0.876561,26.682528,320.164034,AL


In [256]:
print(df_cdc_survey.dtypes)



State Abbr.                        object
Interview Year                     object
Is Adult Landline                    bool
Gender                             object
Num of Adults Landline            float64
General Health                     object
Could Afford Doctor                  bool
Years Since Last Checkup           object
Exercise in Past 30 Days             bool
Hours of Sleeping                 float64
Heart Attack                         bool
Heart Disease                        bool
Stroke                               bool
Asthma                               bool
Asthma in Effect                     bool
Depression                           bool
Kidney Disease                       bool
Arthritis                            bool
Diabetes                             bool
Life Satisfaction                  object
Ethnicity                          object
Metropolitan                         bool
Age Group                          object
Education                         

In [257]:
print(df_mortality_county_imputed.dtypes)

year                            int64
locationabbr                   object
locationdesc                   object
geographiclevel                object
datasource                     object
class                          object
topic                          object
data_value                    float64
data_value_unit                object
data_value_type                object
data_value_footnote_symbol     object
data_value_footnote            object
stratificationcategory1        object
stratification1                object
stratificationcategory2        object
stratification2                object
topicid                        object
locationid                      int64
y_lat                         float64
x_lon                         float64
y_lat_rad                     float64
x_lon_rad                     float64
dtype: object


In [258]:
print(df_covid.dtypes)

state               object
condition_group     object
condition           object
icd10_codes         object
age_group           object
daily_avg          float64
monthly_avg        float64
yearly_avg         float64
state_abbr          object
dtype: object


In [259]:
bmi_values = df_cdc_survey['BMI Category'].unique()
print(bmi_values)

[nan 'over_weight' 'normal_weight' 'obese' 'underweight']


# Mapping of CDC Survey

In [260]:
# Map General Health
df_cdc_survey['General Health'] = df_cdc_survey['General Health'].map({
    'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1
})

# Map Smoking 
df_cdc_survey['Smoking'] = df_cdc_survey['Smoking'].map({
    'never': 0,      # Non-smoker
    'some_days': 1,  # Occasional smoker
    'every_day': 2,  # Daily smoker
})

# Map True to 1 and False to 0 for 'Exercise in Past 30 Days'
df_cdc_survey['Exercise in Past 30 Days'] = df_cdc_survey['Exercise in Past 30 Days'].map({
    True: 1,
    False: 0
})

# Map categorical 'Years Since Last Checkup' values to numeric
df_cdc_survey['Years Since Last Checkup'] = df_cdc_survey['Years Since Last Checkup'].map({
    'within_past_year': 1,
    'within_past_two_years': 2,
    'within_past_five_years': 3,
    'five_or_more_years': 5
})

# Map categorical BMI values to numeric
df_cdc_survey['BMI Category'] = df_cdc_survey['BMI Category'].map({
    'underweight': 0,
    'normal_weight': 1,
    'over_weight': 2,
    'obese': 3
})

# Replace NaN values in 'Cigarettes per Day' and 'Drinks in Last 30 Days' with 0
# This is so we can get a average
df_cdc_survey['Cigarettes per Day'] = df_cdc_survey['Cigarettes per Day'].fillna(0)
df_cdc_survey['Drinks in Last 30 Days'] = df_cdc_survey['Drinks in Last 30 Days'].fillna(0)

# This is the unsupervised agg for clustering on the state level

In [266]:
# Aggregating df_cdc_survey
df_cdc_survey_agg_unsup = df_cdc_survey.groupby('State Abbr.').agg({
    'General Health': 'mean', 
    'Exercise in Past 30 Days': 'mean', 
    'Smoking': 'mean', 
    'Shortness of Breath': 'sum', 
    'Hours of Sleeping': 'mean',
    'BMI Category': 'mean',
    'Years Since Last Checkup': 'mean',
    'Cigarettes per Day': 'mean',
    'Drinks in Last 30 Days': 'mean'
}).reset_index()


In [267]:
# Aggregating df_mortality_county_imputed
df_mortality_agg_unsup = df_mortality_county_imputed.groupby('locationabbr').agg({
    'data_value': 'mean',  # Average heart disease mortality rate in a year
}).reset_index()

# Display the result
print(df_mortality_agg.head())

  locationabbr  data_value      y_lat       x_lon
0           AK  271.923180  60.293348 -147.694487
1           AL  463.608375  32.889358  -86.708843
2           AR  449.265481  34.914731  -92.428500
3           AS   83.631944 -14.301754 -170.719474
4           AZ  282.399537  33.681965 -111.460627


In [268]:
# Aggregating df_covid by state
df_covid_agg_unsup = df_covid.groupby('state_abbr').agg({
    'yearly_avg': 'mean'  # Average yearly cases
}).reset_index()

# Display the result
print(df_covid_agg.head())

  state_abbr  daily_avg  monthly_avg  yearly_avg
0         AK   0.004181     0.127283    1.527266
1         AL   0.183162     5.575465   66.900089
2         AR   0.114804     3.494626   41.932062
3         AZ   0.317130     9.653451  115.831893
4         CA   1.473746    44.860838  538.285840


In [269]:
# Merge df_cdc_survey_agg_unsup with df_mortality_agg_unsup
df_unsup = df_cdc_survey_agg_unsup.merge(df_mortality_agg_unsup, left_on='State Abbr.', right_on='locationabbr', how='inner')

# Merge the result with df_covid_agg_unsup
df_unsup = df_unsup.merge(df_covid_agg_unsup, left_on='State Abbr.', right_on='state_abbr', how='inner')

# Drop duplicate columns for state
df_unsup.drop(columns=['locationabbr', 'state_abbr'], inplace=True)

df_unsup.head(5)


Unnamed: 0,State Abbr.,General Health,Exercise in Past 30 Days,Smoking,Shortness of Breath,Hours of Sleeping,BMI Category,Years Since Last Checkup,Cigarettes per Day,Drinks in Last 30 Days,data_value,yearly_avg
0,AK,3.456126,0.780876,0.553381,5020,7.054147,2.005106,1.578323,5.894223,1.194223,271.92318,1.527266
1,AL,3.2087,0.687758,0.59201,3872,6.998162,2.133351,1.262841,5.818182,0.817665,463.608375,66.900089
2,AR,3.159434,0.662369,0.614639,4111,7.052125,2.082006,1.296665,7.329847,0.940161,449.265481,41.932062
3,AZ,3.370771,0.755157,0.486242,7417,7.087786,1.984653,1.434741,5.742618,1.091142,282.399537,115.831893
4,CA,3.454979,0.800115,0.422476,6949,7.012274,1.907335,1.474276,4.070514,1.214563,290.34181,538.28584
