In [126]:
import os
import sys

import numpy as np
import pandas as pd
import scipy as sp

import seaborn as sns
import matplotlib.pyplot as plt

# Project root, expressed relative to the current working directory (one level up).
PROJ_ROOT = os.pardir

# Show the resolved absolute location so a wrong working directory is obvious.
print(os.path.abspath(PROJ_ROOT))

D:\DANIELA\Projects\WIDS\covid-outcomes-research


In [127]:
#Let your audience know when this notebook was run, and with which packages. Useful when you are not sharing the notebook as
#executable code.

!pip install watermark



In [128]:
# Register the `%watermark` magic used in the next cell.
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [129]:
# Record provenance for this run: author (-a), date and time (-d, -t),
# Python/IPython versions (-v) and the versions of the listed packages (-p).
%watermark -a "Daniela Alvarez Zegarra" -d -t -v -p numpy,pandas

Author: Daniela Alvarez Zegarra

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 8.2.0

numpy : 1.20.3
pandas: 1.4.1



In [130]:
# Show the built-in help of the watermark magic to see which other details it
# can report. This is an interactive aid only and produces no analysis output.

%watermark?

In [131]:
# Let's print our requirements.
# A context manager is used so the file handle is closed as soon as the
# contents have been read.

with open(os.path.join(PROJ_ROOT, 'requirements.txt')) as requirements_file:
    print(requirements_file.read())

# local package
-e .

# external requirements
click
Sphinx
coverage
awscli
flake8
python-dotenv>=0.5.1
jupyter
ipython
numpy
pandas
matplotlib
scikit-learn
scipy
pytest
nbdime
runipy
seaborn



## Loading the Data

In [132]:
# Every input file used below lives in <PROJ_ROOT>/data/raw.
raw_data_dir = os.path.join(PROJ_ROOT, "data", "raw")

# Main dataset.
wids_path = os.path.join(raw_data_dir, "dataset.csv")

# Complementary county-level sources.
area_path = os.path.join(raw_data_dir, "county_name_area.csv")
deaths_path = os.path.join(
    raw_data_dir, "county_deaths_and_cases_jan_to_may_2020.csv"
)
medical_conditions_path = os.path.join(
    raw_data_dir, "medical_conditions_risky_for_covid_by_county.csv"
)
health_rankings_path = os.path.join(
    raw_data_dir, "us-county-health-rankings-2020.csv"
)
hospital_capacity_path = os.path.join(
    raw_data_dir, "Patient_Impact_and_Hospital_Capacity_by_Facility.csv"
)


In [133]:
# Load the main dataset (the CSV that `wids_path` points to) into `df`.
# NOTE(review): the file carries a saved index column that shows up as
# `Unnamed: 0`. Later cells select columns by position, so it is kept as-is
# here; dropping it would shift every positional index used below.

df = pd.read_csv(wids_path)

In [134]:
# Overview of the freshly loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 93 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   Unnamed: 0                                                                     3140 non-null   int64  
 1   fips                                                                           3140 non-null   int64  
 2   TOT_POP                                                                        3140 non-null   int64  
 3   0-9                                                                            3140 non-null   int64  
 4   0-9 y/o % of total pop                                                         3140 non-null   float64
 5   10-19                                                                          3140 non-null   int64  
 6   10-19 y/o % of total pop

In [135]:
# We will drop all columns related to temperature because we don't find them relevant.
# They are selected by position (columns 63 through 76), so this cell must run
# exactly once on the freshly loaded frame: re-running it would remove the
# next 14 columns instead.

df = df.drop(columns=df.columns[63:77])

In [136]:
#df = df.drop(df.iloc[:, cols_to_drop],axis = 1)

In [137]:
# Re-check the schema after the drop; the column count should be 14 lower than before.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 79 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   Unnamed: 0                                                                     3140 non-null   int64  
 1   fips                                                                           3140 non-null   int64  
 2   TOT_POP                                                                        3140 non-null   int64  
 3   0-9                                                                            3140 non-null   int64  
 4   0-9 y/o % of total pop                                                         3140 non-null   float64
 5   10-19                                                                          3140 non-null   int64  
 6   10-19 y/o % of total pop

In [138]:
# Preview the first rows of the main frame.
df.head()

Unnamed: 0.1,Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,...,Total nurse practitioners (2019),Total physician assistants (2019),Total Hospitals (2019),Internal Medicine Primary Care (2019),Family Medicine/General Practice Primary Care (2019),Total Specialist Physicians (2019),ICU Beds_x,Total Population,Population Aged 60+,Percent of Population Aged 60+
0,0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,...,28.859137,6.085786,1.148905,25.992561,21.249061,72.142154,6.0,55036,10523,19.1
1,1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,...,113.162114,23.863512,4.505074,101.92173,83.321572,282.882982,51.0,203360,53519,26.3
2,2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,...,12.914231,2.72334,0.514126,11.631462,9.508784,32.283033,5.0,26201,6150,23.5
3,3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,...,11.626493,2.451783,0.46286,10.471635,8.560619,29.063942,0.0,22580,4773,21.1
4,4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,...,30.021267,6.330854,1.195171,27.039257,22.10474,75.047251,6.0,57667,13600,23.6


In [139]:
# Positions of the percentage columns summarised in the next cell:
# every second column from 4 through 32, plus column 78.
perc_cols_index = [*range(4, 34, 2), 78]

In [140]:
# Summary statistics for the columns expressed as percentages: age groups,
# races and the share of the population aged 60+.

df.iloc[:,perc_cols_index].describe()

Unnamed: 0,0-9 y/o % of total pop,10-19 y/o % of total pop,20-29 y/o % of total pop,30-39 y/o % of total pop,40-49 y/o % of total pop,50-59 y/o % of total pop,60-69 y/o % of total pop,70-79 y/o % of total pop,80+ y/o % of total pop,% White-alone,% Black-alone,% NA/AI-alone,% Asian-alone,% Hawaiian/PI-alone,% Two or more races,Percent of Population Aged 60+
count,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0
mean,11.871051,12.694609,12.283979,11.751535,11.556685,13.526733,13.182225,8.399382,4.733801,84.494109,9.342405,2.344741,1.554548,0.141007,2.12319,24.736943
std,2.124081,1.815044,3.126297,1.696599,1.372935,1.481692,2.532498,2.214915,1.535028,16.348987,14.478294,7.72199,2.857088,0.967946,1.544463,5.564871
min,0.0,0.0,0.0,6.092789,2.631579,5.316361,3.444403,2.199551,0.0,3.841985,0.0,0.0,0.0,0.0,0.0,5.8
25%,10.594639,11.674504,10.496774,10.689322,10.685197,12.738561,11.621232,7.032275,3.738648,79.620473,0.867069,0.384294,0.465659,0.031896,1.357557,21.2
50%,11.802727,12.687422,11.772649,11.580861,11.618372,13.621339,13.013409,8.208162,4.565338,91.410189,2.54544,0.641143,0.739372,0.062024,1.78972,24.4
75%,12.95184,13.659282,13.18226,12.639379,12.386083,14.429115,14.467254,9.445777,5.484143,95.521727,10.852053,1.331416,1.433411,0.118429,2.40997,27.8
max,25.460677,23.304372,37.570198,22.225129,19.430369,20.454545,27.272727,31.327959,22.727273,99.043785,86.069762,92.5152,42.95231,48.863636,30.304056,64.2


We can see that, for every age group, the mean and the median share of the population are almost the same across counties.

## Visualizing different datasets

### DF1 : COUNTY AREAS

In [141]:
# County area table: state, county, population, land area (km) and density.
df1 = pd.read_csv(area_path)

In [142]:
# Inspect column names, non-null counts and dtypes of the area table.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3094 entries, 0 to 3093
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         3094 non-null   object 
 1   county        3094 non-null   object 
 2   population    3094 non-null   int64  
 3   land_area_km  3094 non-null   int64  
 4   density_km    3094 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 121.0+ KB


### DF2 : DEATHS AND CASES, JANUARY TO MAY 2020

In [143]:
# Case and death counts with `date`, `county`, `state` and `fips` columns.
df2 = pd.read_csv(deaths_path)

In [144]:
# Inspect the schema. NOTE(review): `fips` contains missing values and is read
# as float64, so it needs cleaning before it can be used as a join key.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141403 entries, 0 to 141402
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    141403 non-null  object 
 1   county  141403 non-null  object 
 2   state   141403 non-null  object 
 3   fips    139802 non-null  float64
 4   cases   141403 non-null  int64  
 5   deaths  141403 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 6.5+ MB


### DF3 : MEDICAL CONDITIONS

In [145]:
# Per-county medical-condition table (prevalence, 95% CI bounds and counts).
df3 = pd.read_csv(medical_conditions_path)

In [146]:
# Inspect the schema; the column positions shown here are used by the next cell.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   COUNTY_NAME                  3142 non-null   object 
 1   STATE_NAME                   3142 non-null   object 
 2   STATE_FIPS                   3142 non-null   int64  
 3   CNTY_FIPS                    3142 non-null   int64  
 4   FIPS                         3142 non-null   int64  
 5   county_pop2018_18 and older  3142 non-null   int64  
 6   anycondition_prevalence      3142 non-null   float64
 7   anycondition_Lower 95% CI    3142 non-null   float64
 8   anycondition_Upper 95% CI    3142 non-null   float64
 9   anycondition_number          3142 non-null   int64  
 10  Obesity_prevalence           3142 non-null   float64
 11  Obesity_Lower 95% CI         3142 non-null   float64
 12  Obesity_Upper 95% CI         3142 non-null   float64
 13  Obesity_number    

In [147]:
# Keep only the columns that will be merged into the main frame: drop, by
# position, the state/county FIPS parts, the 95% CI bounds and the last column.

cols_drop_df3 = [2, 3, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31]

# Translate the positions into labels so `drop` receives column names.
new_df3 = df3.drop(columns=df3.columns[cols_drop_df3])

In [148]:
# Preview the reduced medical-conditions table.
new_df3.head()

Unnamed: 0,COUNTY_NAME,STATE_NAME,FIPS,county_pop2018_18 and older,anycondition_prevalence,anycondition_number,Obesity_prevalence,Obesity_number,Heart disease_prevalence,Heart disease_number,COPD_prevalence,COPD_number,diabetes_prevalence,diabetes_number,CKD_prevalence,CKD_number,Urban_rural_code
0,Autauga,Alabama,1001,42438,47.6,20181,35.8,15193,7.9,3345,8.6,3644,12.9,5462,3.1,1326,3
1,Baldwin,Alabama,1003,170912,40.2,68790,29.7,50761,7.8,13414,8.6,14692,12.0,20520,3.2,5479,4
2,Barbour,Alabama,1005,19689,57.5,11325,40.7,8013,11.0,2159,12.1,2373,19.7,3870,4.5,887,6
3,Bibb,Alabama,1007,17813,51.6,9190,38.7,6894,8.6,1533,10.0,1789,14.1,2511,3.3,595,2
4,Blount,Alabama,1009,44448,46.3,20584,34.0,15112,9.2,4101,10.5,4661,13.5,6017,3.4,1507,2


In [149]:
# Match the key name used by the main frame (`fips`) so the two can be merged.
# Re-assigning the result avoids mutating the frame in place.
new_df3 = new_df3.rename(columns={'FIPS': 'fips'})

In [150]:
# Merge WiDS data with medical conditions data.
# The left join keeps every row of the main frame. `validate` makes pandas
# raise a MergeError if `fips` is duplicated in `new_df3`, which would otherwise
# silently duplicate rows of `df`.
# NOTE: `df` is overwritten, so running this cell twice merges the same
# columns again (pandas then adds `_x`/`_y` suffixes).

df = df.merge(new_df3, on='fips', how='left', validate='many_to_one')

In [156]:
# Preview the merged frame; the medical-condition columns are appended on the right.
df.head()

Unnamed: 0.1,Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,...,Obesity_number,Heart disease_prevalence,Heart disease_number,COPD_prevalence,COPD_number,diabetes_prevalence,diabetes_number,CKD_prevalence,CKD_number,Urban_rural_code
0,0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,...,15193,7.9,3345,8.6,3644,12.9,5462,3.1,1326,3
1,1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,...,50761,7.8,13414,8.6,14692,12.0,20520,3.2,5479,4
2,2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,...,8013,11.0,2159,12.1,2373,19.7,3870,4.5,887,6
3,3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,...,6894,8.6,1533,10.0,1789,14.1,2511,3.3,595,2
4,4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,...,15112,9.2,4101,10.5,4661,13.5,6017,3.4,1507,2


### DF4 : Health rankings

In [151]:
# County health rankings table, keyed by `fips`.
df4 = pd.read_csv(health_rankings_path)

In [152]:
# Preview the first rows. NOTE(review): the first row (fips 1000, no county)
# looks like a state-level aggregate; confirm and filter such rows out before
# merging on `fips`.
df4.head()

Unnamed: 0,fips,state,county,num_deaths,years_of_potential_life_lost_rate,95percent_ci_low,95percent_ci_high,quartile,ypll_rate_aian,ypll_rate_aian_95percent_ci_low,...,percent_hispanic,num_non_hispanic_white,percent_non_hispanic_white,num_not_proficient_in_english,percent_not_proficient_in_english,95percent_ci_low_39,95percent_ci_high_39,percent_female,num_rural,percent_rural
0,1000,Alabama,,81791.0,9942.794666,9840.535949,10045.053384,,,,...,4.443264,3197324,65.413428,48517,1.061048,1.006759,1.115337,51.633032,1957932.0,40.963183
1,1001,Alabama,Autauga,791.0,8128.59119,7283.340731,8973.841649,1.0,,,...,2.965774,41316,74.308016,426,0.820225,0.347891,1.292558,51.448715,22921.0,42.002162
2,1003,Alabama,Baldwin,2967.0,7354.12253,6918.55427,7789.69079,1.0,,,...,4.646779,181201,83.111337,1068,0.543517,0.347271,0.739763,51.538377,77060.0,42.279099
3,1005,Alabama,Barbour,472.0,10253.573403,8782.217281,11724.929524,2.0,,,...,4.276355,11356,45.641252,398,1.631683,0.824903,2.438462,47.216752,18613.0,67.789635
4,1007,Alabama,Bibb,471.0,11977.539484,10344.064842,13611.014126,3.0,,,...,2.625,16708,74.589286,57,0.26821,0.0,0.807504,46.78125,15663.0,68.352607


### DF5: HOSPITAL CAPACITY 2020 - 2022 per week

In [153]:
# Facility-level hospital capacity records, one `collection_week` per row.
# NOTE(review): this is by far the largest input; consider `usecols=` once the
# needed columns are known.
df5 = pd.read_csv(hospital_capacity_path)

In [154]:
# Preview the first rows of the hospital capacity table.
df5.head()

Unnamed: 0,hospital_pk,collection_week,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_pediatric_covid_confirmed_unknown_7_day_sum,staffed_icu_pediatric_patients_confirmed_covid_7_day_avg,staffed_icu_pediatric_patients_confirmed_covid_7_day_coverage,staffed_icu_pediatric_patients_confirmed_covid_7_day_sum,staffed_pediatric_icu_bed_occupancy_7_day_avg,staffed_pediatric_icu_bed_occupancy_7_day_coverage,staffed_pediatric_icu_bed_occupancy_7_day_sum,total_staffed_pediatric_icu_beds_7_day_avg,total_staffed_pediatric_icu_beds_7_day_coverage,total_staffed_pediatric_icu_beds_7_day_sum
0,171347,2022/04/29,KS,171347,SHERIDAN COUNTY HOSPITAL,826 18TH STREET,HOXIE,67740.0,Critical Access Hospitals,20179.0,...,0.0,0.0,4,0.0,0.0,4,0.0,0.0,4,0.0
1,370002,2022/04/29,OK,370002,ALLIANCEHEALTH WOODWARD,900 17TH STREET,WOODWARD,73801.0,Short Term,40153.0,...,0.0,0.0,7,0.0,0.0,7,0.0,0.0,7,0.0
2,501311,2022/04/29,WA,501311,EAST ADAMS RURAL HOSPITAL,903 SOUTH ADAMS,RITZVILLE,99169.0,Critical Access Hospitals,53001.0,...,0.0,0.0,7,0.0,0.0,7,0.0,0.0,7,0.0
3,40161,2022/04/22,AR,40161,ARKANSAS HEART HOSPITAL-ENCORE,1901 ENCORE WAY,BRYANT,72022.0,Short Term,5125.0,...,0.0,0.0,7,0.0,0.0,7,0.0,0.0,7,0.0
4,370225,2022/04/22,OK,370225,"SUMMIT MEDICAL CENTER, LLC",1800 SOUTH RENAISSANCE BOULEVARD,EDMOND,73013.0,Short Term,40109.0,...,0.0,0.0,7,0.0,0.0,7,0.0,0.0,7,0.0


In [155]:
# Summarise size, dtypes and memory footprint of the hospital capacity table.
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459883 entries, 0 to 459882
Columns: 128 entries, hospital_pk to total_staffed_pediatric_icu_beds_7_day_sum
dtypes: bool(2), float64(87), int64(29), object(10)
memory usage: 443.0+ MB
