In [3]:
import pandas as pd
import sqlite3
from sodapy import Socrata

# ETL of [Asthma data](https://chronicdata.cdc.gov/Chronic-Disease-Indicators/U-S-Chronic-Disease-Indicators-Asthma/us8e-ubyj)

In [11]:
# Instruction from the CDC website to extract data
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)

# Return as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get_all("us8e-ubyj")

# Convert to pandas DataFrame
asthma_df = pd.DataFrame.from_records(results)



In [208]:
asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64714,2011,2011,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...
64715,2016,2016,TX,Texas,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,MRC,%,,,****,Sample size of denominator and/or age group fo...
64716,2019,2019,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,OTH,%,,,****,Sample size of denominator and/or age group fo...
64717,2014,2014,PR,Puerto Rico,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Crude Prevalence,,,...,AST,AST6_2,CRDPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...


In [209]:
asthma_df.columns

Index(['yearstart', 'yearend', 'locationabbr', 'locationdesc', 'datasource',
       'topic', 'question', 'datavaluetype', 'datavalue', 'datavaluealt',
       'stratificationcategory1', 'stratification1', 'locationid', 'topicid',
       'questionid', 'datavaluetypeid', 'stratificationcategoryid1',
       'stratificationid1', 'datavalueunit', 'lowconfidencelimit',
       'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'],
      dtype='object')

In [248]:
asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma'],
      dtype=object)

In [249]:
questions_OI = ['Asthma mortality rate','Emergency department visit rate for asthma','Hospitalizations for asthma',\
               'Current asthma prevalence among adults aged >= 18 years','Asthma prevalence among women aged 18-44 years']


In [264]:
questionids_list=[]
for question in questions_OI:
    questionids_list += list(asthma_df[asthma_df['question']==question]['questionid'].unique())
print(questionids_list)

['AST4_1', 'AST2_1', 'AST3_1', 'AST1_1', 'AST1_2']


In [265]:
# filtering the dataset for the questions of interest
filtered_asthma_df = asthma_df[asthma_df['questionid'].isin(questionids_list)]
filtered_asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,2019,ID,Idaho,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,7.9,...,AST,AST1_1,AGEADJPREV,GENDER,GENM,%,6.4,9.7,,
37548,2017,2017,GA,Georgia,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,12.2,...,AST,AST1_2,CRDPREV,RACE,WHT,%,8.8,16.8,,
37549,2013,2013,FL,Florida,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,10,...,AST,AST1_2,CRDPREV,OVERALL,OVR,%,8.6,11.5,,
37550,2014,2014,MO,Missouri,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,9.7,...,AST,AST1_1,CRDPREV,OVERALL,OVR,%,8.7,10.8,,


In [266]:
#checking if column 'yearstart' and 'yearend' are the same
print(f"Are the columns datavalue and datavaluealt end the same?\n{filtered_asthma_df['datavalue'].equals(filtered_asthma_df['datavaluealt'])}")
print(f"Are the columns yearstart and year end the same?\n{filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend'])}")
#if these columns are the same drop one of the duplicates and other not useful columns
if filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend']):
    columns_to_drop = ['yearend',\
                        'topic',\
                        'datavaluealt',\
                        'topicid',\
                        'datavaluetypeid',\
                        'stratificationcategoryid1',\
                        'stratificationid1',\
                        'lowconfidencelimit',\
                       'highconfidencelimit',\
                       'datavaluefootnotesymbol',\
                      'datavaluefootnote']
    filtered_asthma_df=filtered_asthma_df.drop(columns=columns_to_drop)
    print(f"The following columns {columns_to_drop} were dropped")

Are the columns datavalue and datavaluealt end the same?
True
Are the columns yearstart and year end the same?
True
The following columns ['yearend', 'topic', 'datavaluealt', 'topicid', 'datavaluetypeid', 'stratificationcategoryid1', 'stratificationid1', 'lowconfidencelimit', 'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'] were dropped


In [267]:
filtered_asthma_df=filtered_asthma_df.rename(columns={"yearstart": "year","locationabbr":"state_id","locationdesc":"state"})

In [268]:
#checking for missing data
filtered_asthma_df.isnull().sum()


year                           0
state_id                       0
state                          0
datasource                     0
question                       0
datavaluetype                  0
datavalue                  17259
stratificationcategory1        0
stratification1                0
locationid                     0
questionid                     0
datavalueunit               7548
dtype: int64

In [269]:
# removing missing data values
filtered_asthma_df=filtered_asthma_df[filtered_asthma_df['datavalue'].isna()==False]
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [270]:
# checking that all the missing data value were correctly removed
filtered_asthma_df.isnull().sum()

year                          0
state_id                      0
state                         0
datasource                    0
question                      0
datavaluetype                 0
datavalue                     0
stratificationcategory1       0
stratification1               0
locationid                    0
questionid                    0
datavalueunit              2827
dtype: int64

In [271]:
filtered_asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years'], dtype=object)

In [272]:
# drop territories (PR, GU, VI) and nation-wide data (US)
state_to_drop = ['PR','GU','US','VI']
filtered_asthma_df = filtered_asthma_df[~filtered_asthma_df['state_id'].isin(state_to_drop)]

filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [273]:
#checking for duplicates
filtered_asthma_df=filtered_asthma_df.drop_duplicates()
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [279]:
#checking that there isn't more than one data value entry for the same year and state.
duplicateRowsDF = filtered_asthma_df[filtered_asthma_df.duplicated(subset=['year','state_id','state','datasource','question','datavaluetype','datavaluetype','stratificationcategory1','stratification1','locationid','questionid','datavalueunit'], keep=False)]
duplicateRowsDF


Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit


In [282]:
#keeping only the Overall values
filtered_asthma_df = filtered_asthma_df[filtered_asthma_df['stratificationcategory1'] == 'Overall']
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
6,2011,HI,Hawaii,NVSS,Asthma mortality rate,Number,26,Overall,Overall,15,AST4_1,
7,2013,HI,Hawaii,NVSS,Asthma mortality rate,Number,24,Overall,Overall,15,AST4_1,
12,2012,TX,Texas,NVSS,Asthma mortality rate,Number,206,Overall,Overall,48,AST4_1,
14,2015,WA,Washington,NVSS,Asthma mortality rate,Number,88,Overall,Overall,53,AST4_1,
21,2019,NY,New York,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12,Overall,Overall,36,AST1_2,%
...,...,...,...,...,...,...,...,...,...,...,...,...
37524,2013,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.7,Overall,Overall,16,AST1_1,%
37536,2019,NC,North Carolina,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.2,Overall,Overall,37,AST1_1,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


# ETL of [Asthma data](https://chronicdata.cdc.gov/Chronic-Disease-Indicators/U-S-Chronic-Disease-Indicators-Asthma/us8e-ubyj)

In [None]:
# final table needs to have: year, state_id, air_quality_indexes

In [None]:
#connecting to the database
conn = sqlite3.connect('health_air.db')

In [283]:
#function to add a table to the database
def add_table_to_db(table_name, df, connection_db):
    cur = conn.cursor()
    df.to_sql(name=table_name, if_exists='replace', con=conn)
    return(cur.execute("""SELECT * FROM asthma;""").fetchall())

In [284]:
# adding the table asthma to the database and controlling that there are data recorded.
add_table_to_db('asthma',filtered_asthma_df)

[(6,
  '2011',
  'HI',
  'Hawaii',
  'NVSS',
  'Asthma mortality rate',
  'Number',
  '26',
  'Overall',
  'Overall',
  '15',
  'AST4_1',
  None),
 (7,
  '2013',
  'HI',
  'Hawaii',
  'NVSS',
  'Asthma mortality rate',
  'Number',
  '24',
  'Overall',
  'Overall',
  '15',
  'AST4_1',
  None),
 (12,
  '2012',
  'TX',
  'Texas',
  'NVSS',
  'Asthma mortality rate',
  'Number',
  '206',
  'Overall',
  'Overall',
  '48',
  'AST4_1',
  None),
 (14,
  '2015',
  'WA',
  'Washington',
  'NVSS',
  'Asthma mortality rate',
  'Number',
  '88',
  'Overall',
  'Overall',
  '53',
  'AST4_1',
  None),
 (21,
  '2019',
  'NY',
  'New York',
  'BRFSS',
  'Asthma prevalence among women aged 18-44 years',
  'Crude Prevalence',
  '12',
  'Overall',
  'Overall',
  '36',
  'AST1_2',
  '%'),
 (22,
  '2011',
  'VA',
  'Virginia',
  'BRFSS',
  'Asthma prevalence among women aged 18-44 years',
  'Crude Prevalence',
  '11.9',
  'Overall',
  'Overall',
  '51',
  'AST1_2',
  '%'),
 (24,
  '2014',
  'GA',
  'Georgia

In [285]:
conn.close()