### Ignore Warning Messages from Python

In [8]:
import warnings
# Install a blanket 'ignore' filter so that warnings raised by later cells are
# not displayed.
warnings.filterwarnings(action='ignore')

### Importing Libraries

In [9]:
import numpy as np
import pandas as pd

### Load Data

In [10]:
# Read the comma-separated source file into the working frame `data`.
data = pd.read_csv(
    filepath_or_buffer='../data/US_counties_COVID19_health_weather_data.csv',
    sep=',',
)

### Clean & Prepare Data

In [11]:
# Encode the 'yes'/'no' stay-at-home flags as 1/0. Series.map leaves any value
# that is not a key of the mapping (including missing values) as NaN.
data['stay_at_home_announced'] = data['stay_at_home_announced'].map({'yes': 1, 'no': 0})
data['stay_at_home_effective'] = data['stay_at_home_effective'].map({'yes': 1, 'no': 0})

# Encode the boolean water-violation flag as 1/0 the same way. No explicit NaN
# key is needed: a plain dict lookup on NaN only succeeds when the value is the
# very same object as the key (NaN != NaN), so it can raise KeyError for
# missing values, whereas Series.map simply yields NaN for them.
d = {True: 1, False: 0}
data['presence_of_water_violation'] = data['presence_of_water_violation'].map(d)

In [12]:
# Names of the columns treated as floating-point valued.
# The second entry is 'lon' (it previously read 'log'): 'lon' is the name the
# drop list further down passes to DataFrame.drop, which raises on labels that
# are not present in the frame.
floating = ['lat',
            'lon',
            'area_sqmi',
            'population_density_per_sqmi',
            'years_of_potential_life_lost_rate',
            'percent_fair_or_poor_health',
            'average_number_of_physically_unhealthy_days',
            'average_number_of_mentally_unhealthy_days',
            'percent_low_birthweight',
            'percent_smokers',
            'percent_adults_with_obesity',
            'food_environment_index',
            'percent_physically_inactive',
            'percent_with_access_to_exercise_opportunities',
            'percent_excessive_drinking',
            'percent_driving_deaths_with_alcohol_involvement',
            'chlamydia_rate',
            'teen_birth_rate',
            'percent_uninsured',
            'primary_care_physicians_rate',
            'dentist_rate',
            'mental_health_provider_rate',
            'preventable_hospitalization_rate',
            'percent_with_annual_mammogram',
            'percent_vaccinated',
            'high_school_graduation_rate',
            'percent_some_college',
            'percent_unemployed_CHR',
            'percent_children_in_poverty',
            'eightieth_percentile_income',
            'twentieth_percentile_income',
            'income_ratio',
            'percent_single_parent_households_CHR',
            'social_association_rate',
            'annual_average_violent_crimes',
            'violent_crime_rate',
            'injury_death_rate',
            'average_daily_pm2_5',
            'percent_severe_housing_problems',
            'severe_housing_cost_burden',
            'overcrowding',
            'inadequate_facilities',
            'percent_drive_alone_to_work',
            'percent_long_commute_drives_alone',
            'life_expectancy',
            'age_adjusted_death_rate',
            'child_mortality_rate',
            'infant_mortality_rate',
            'percent_frequent_physical_distress',
            'percent_frequent_mental_distress',
            'percent_adults_with_diabetes',
            'hiv_prevalence_rate',
            'percent_food_insecure',
            'num_limited_access',
            'percent_limited_access_to_healthy_foods',
            'drug_overdose_mortality_rate',
            'motor_vehicle_mortality_rate',
            'percent_insufficient_sleep',
            'percent_uninsured_2',
            'percent_uninsured_3',
            'other_primary_care_provider_rate',
            'percent_disconnected_youth',
            'average_grade_performance',
            'average_grade_performance_2',
            'median_household_income',
            'percent_enrolled_in_free_or_reduced_lunch',
            'segregation_index',
            'segregation_index_2',
            'homicide_rate',
            'suicide_rate_age_adjusted',
            'firearm_fatalities_rate',
            'juvenile_arrest_rate',
            'average_traffic_volume_per_meter_of_major_roadways',
            'percent_homeowners',
            'percent_severe_housing_cost_burden',
            'percent_less_than_18_years_of_age',
            'percent_65_and_over',
            'percent_black',
            'percent_american_indian_alaska_native',
            'percent_asian',
            'percent_native_hawaiian_other_pacific_islander',
            'percent_hispanic',
            'percent_non_hispanic_white',
            'percent_not_proficient_in_english',
            'percent_female',
            'percent_rural',
            'per_capita_income',
            'percent_below_poverty',
            'percent_unemployed_CDC',
            'percent_no_highschool_diploma',
            'percent_age_65_and_older',
            'percent_age_17_and_younger',
            'percent_disabled',
            'percent_single_parent_households_CDC',
            'percent_minorities',
            'percent_limited_english_abilities',
            'percent_multi_unit_housing',
            'percent_mobile_homes',
            'percent_overcrowding',
            'percent_no_vehicle',
            'percent_institutionalized_in_group_quarters',
            'percentile_rank_below_poverty',
            'percentile_rank_unemployed',
            'percentile_rank_per_capita_income',
            'percentile_rank_no_highschool_diploma',
            'percentile_rank_socioeconomic_theme',
            'percentile_rank_age_65_and_older',
            'percentile_rank_age_17_and_younger',
            'percentile_rank_disabled',
            'percentile_rank_single_parent_households',
            'percentile_rank_household_comp_disability_theme',
            'percentile_rank_minorities',
            'percentile_rank_limited_english_abilities',
            'percentile_rank_minority_status_and_language_theme',
            'percentile_rank_multi_unit_housing',
            'percentile_rank_mobile_homes',
            'percentile_rank_overcrowding',
            'percentile_rank_no_vehicle',
            'percentile_rank_institutionalized_in_group_quarters',
            'percentile_rank_housing_and_transportation',
            'percentile_rank_social_vulnerability',
            'presence_of_water_violation',
            'km_to_closest_station',
            'ELEV_M',
            'station_id',
            'mean_temp',
            'min_temp',
            'max_temp',
            'dewpoint',
            'sea_level_pressure',
            'station_pressure',
            'visibility',
            'wind_speed',
            'max_wind_speed',
            'wind_gust',
            'precipitation',
            'mean_temp_3d_avg',
            'mean_temp_5d_avg',
            'mean_temp_10d_avg',
            'mean_temp_15d_avg',
            'max_temp_3d_avg',
            'max_temp_5d_avg',
            'max_temp_10d_avg',
            'max_temp_15d_avg',
            'min_temp_3d_avg',
            'min_temp_5d_avg',
            'min_temp_10d_avg',
            'min_temp_15d_avg',
            'dewpoint_3d_avg',
            'dewpoint_5d_avg',
            'dewpoint_10d_avg',
            'dewpoint_15d_avg'
          ]

# Names of the columns treated as integer-valued. This list is iterated further
# down in the same cell for the float64 cast.
integers = [
    "cases",
    "deaths",
    "stay_at_home_announced",
    "stay_at_home_effective",
    "total_population",
    "num_deaths",
    "num_alcohol_impaired_driving_deaths",
    "num_driving_deaths",
    "num_chlamydia_cases",
    "num_uninsured",
    "num_primary_care_physicians",
    "num_dentists",
    "num_mental_health_providers",
    "population",
    "num_unemployed_CHR",
    "labor_force",
    "num_single_parent_households_CHR",
    "num_households_CHR",
    "num_associations",
    "num_injury_deaths",
    "num_workers_who_drive_alone",
    "num_deaths_2",
    "num_deaths_3",
    "num_deaths_4",
    "num_hiv_cases",
    "num_food_insecure",
    "num_drug_overdose_deaths",
    "num_motor_vehicle_deaths",
    "num_uninsured_2",
    "num_uninsured_3",
    "num_deaths_5",
    "num_firearm_fatalities",
    "num_homeowners",
    "num_households_with_severe_cost_burden",
    "population_2",
    "num_black",
    "num_american_indian_alaska_native",
    "num_asian",
    "num_native_hawaiian_other_pacific_islander",
    "num_hispanic",
    "num_non_hispanic_white",
    "num_not_proficient_in_english",
    "num_rural",
    "num_housing_units",
    "num_households_CDC",
    "num_below_poverty",
    "num_unemployed_CDC",
    "num_no_highschool_diploma",
    "num_age_65_and_older",
    "num_age_17_and_younger",
    "num_disabled",
    "num_single_parent_households_CDC",
    "num_minorities",
    "num_limited_english_abilities",
    "num_multi_unit_housing",
    "num_mobile_homes",
    "num_overcrowding",
    "num_households_with_no_vehicle",
    "num_institutionalized_in_group_quarters",
    "fog",
    "rain",
    "snow",
    "hail",
    "thunder",
    "tornado",
]

# Names of the columns treated as text. This list is iterated further down in
# the same cell for the 'str' cast.
strings = [
    "fips",
    "date",
    "county",
    "state",
    "station_name",
    "CALL",
    "precip_flag",
    "date_stay_at_home_announced",
    "date_stay_at_home_effective",
]

# Apply the dtype casts described by the `strings` and `integers` lists.
# DataFrame.astype returns a new frame and leaves `data` untouched, so the
# result must be assigned back for the cast to take effect. One call per dtype
# group also avoids copying the whole frame once per column.
# NOTE(review): casting to 'str' can turn missing values into literal text
# (e.g. 'nan') depending on the pandas version; confirm nothing downstream
# relies on these text columns keeping their nulls.
data = data.astype({name: 'str' for name in strings})

# Cannot convert nulls to ints, so the integer-like columns are stored as
# float64. Columns that were read as int64 therefore become floats from here on.
data = data.astype({name: 'float64' for name in integers})

In [13]:
# Columns that are left out of the frame handed to the imputation step.
to_drop = [
    "fips",
    "date",
    "station_name",
    "CALL",
    "precip_flag",
    "date_stay_at_home_announced",
    "date_stay_at_home_effective",
    "lat",
    "lon",
    "station_id",
    "area_sqmi",
    "km_to_closest_station",
    "county",
    "state",
    "stay_at_home_announced",
    "stay_at_home_effective",
    "years_of_potential_life_lost_rate",
]

# Iterating a DataFrame yields its column labels; every label containing the
# substring 'avg' is added to the drop list as well.
to_drop.extend(name for name in data if 'avg' in name)

# Build the imputation input as a new frame; `data` itself is not modified.
for_imputation = data.drop(columns=to_drop)

### Save Data for Imputation to CSV

In [14]:
# Write the imputation input to disk without the index column.
for_imputation.to_csv(
    path_or_buf="../data/data_to_impute.csv",
    index=False,
)