In [294]:
import pandas as pd
import sqlite3
from sodapy import Socrata
from config import userID, key
import requests

In [292]:
# Open a connection to the SQLite database file 'health_air.db'.
# `conn` is passed to add_table_to_db later in the notebook and closed in the last cell.
conn = sqlite3.connect('health_air.db')

In [288]:
#function to add a table to the database
def add_table_to_db(table_name, df, connection_db):
    """Write ``df`` to a SQLite table and return the rows now stored in it.

    Args:
        table_name: Name of the destination table. An existing table with
            this name is replaced.
        df: pandas DataFrame to persist. Its index is written as a column
            (the ``to_sql`` default).
        connection_db: Open ``sqlite3`` connection used for both the write
            and the read-back.

    Returns:
        list[tuple]: Every row of ``table_name`` after the write.
    """
    # Use the connection that was passed in rather than a notebook-level
    # global, so the helper works with any database handle.
    df.to_sql(name=table_name, if_exists='replace', con=connection_db)
    # Identifiers cannot be bound as SQL parameters, so quote the table name
    # (doubling any embedded double quotes) before interpolating it.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    cur = connection_db.cursor()
    # Read back from the table that was just written, not a hard-coded one.
    return cur.execute(f"SELECT * FROM {quoted_name};").fetchall()

# ETL of [Asthma data](https://chronicdata.cdc.gov/Chronic-Disease-Indicators/U-S-Chronic-Disease-Indicators-Asthma/us8e-ubyj)

In [11]:
# Extraction snippet following the instructions on the CDC dataset page.
# The client is unauthenticated (works with public data sets only): note 'None'
# in place of the application token, and no username or password.
client = Socrata("chronicdata.cdc.gov", None)

# Request every record of dataset "us8e-ubyj"; sodapy converts the JSON
# returned by the API into Python dictionaries.
# NOTE(review): `get_all` presumably pages through the whole dataset, so this
# cell can be slow — confirm against the sodapy documentation.
results = client.get_all("us8e-ubyj")

# Convert the records to a pandas DataFrame (one row per record).
asthma_df = pd.DataFrame.from_records(results)



In [208]:
# Display the raw asthma frame (the notebook shows only its first and last rows).
asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64714,2011,2011,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...
64715,2016,2016,TX,Texas,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,MRC,%,,,****,Sample size of denominator and/or age group fo...
64716,2019,2019,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,OTH,%,,,****,Sample size of denominator and/or age group fo...
64717,2014,2014,PR,Puerto Rico,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Crude Prevalence,,,...,AST,AST6_2,CRDPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...


In [209]:
# List the column names available in the raw dataset.
asthma_df.columns

Index(['yearstart', 'yearend', 'locationabbr', 'locationdesc', 'datasource',
       'topic', 'question', 'datavaluetype', 'datavalue', 'datavaluealt',
       'stratificationcategory1', 'stratification1', 'locationid', 'topicid',
       'questionid', 'datavaluetypeid', 'stratificationcategoryid1',
       'stratificationid1', 'datavalueunit', 'lowconfidencelimit',
       'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'],
      dtype='object')

In [248]:
# Enumerate the distinct indicator questions present in the raw dataset.
asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma'],
      dtype=object)

In [249]:
# Questions of interest: the asthma outcome indicators kept for the analysis
# (the vaccination questions are left out).
questions_OI = [
    'Asthma mortality rate',
    'Emergency department visit rate for asthma',
    'Hospitalizations for asthma',
    'Current asthma prevalence among adults aged >= 18 years',
    'Asthma prevalence among women aged 18-44 years',
]


In [264]:
# Collect the question ids matching each question of interest, in the order
# the questions are listed.
questionids_list = []
for question in questions_OI:
    ids_for_question = asthma_df.loc[asthma_df['question'] == question, 'questionid'].unique()
    questionids_list.extend(ids_for_question.tolist())
print(questionids_list)

['AST4_1', 'AST2_1', 'AST3_1', 'AST1_1', 'AST1_2']


In [265]:
# Keep only the rows whose question id belongs to the questions of interest.
is_question_of_interest = asthma_df['questionid'].isin(questionids_list)
filtered_asthma_df = asthma_df.loc[is_question_of_interest]
filtered_asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,2019,ID,Idaho,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,7.9,...,AST,AST1_1,AGEADJPREV,GENDER,GENM,%,6.4,9.7,,
37548,2017,2017,GA,Georgia,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,12.2,...,AST,AST1_2,CRDPREV,RACE,WHT,%,8.8,16.8,,
37549,2013,2013,FL,Florida,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,10,...,AST,AST1_2,CRDPREV,OVERALL,OVR,%,8.6,11.5,,
37550,2014,2014,MO,Missouri,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,9.7,...,AST,AST1_1,CRDPREV,OVERALL,OVR,%,8.7,10.8,,


In [266]:
# Check whether each pair of look-alike columns really carries the same
# information before treating one of them as redundant.
values_match = filtered_asthma_df['datavalue'].equals(filtered_asthma_df['datavaluealt'])
years_match = filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend'])
print(f"Are the columns datavalue and datavaluealt end the same?\n{values_match}")
print(f"Are the columns yearstart and year end the same?\n{years_match}")

# A duplicate column is dropped only when ITS OWN equality check passed
# (previously 'datavaluealt' was dropped based on the year check alone).
# The columns not used by the analysis are always dropped, so later cells see
# the same schema regardless of the outcome of the checks.
is_redundant = {'yearend': years_match, 'datavaluealt': values_match}
columns_to_drop = [
    column
    for column in [
        'yearend',
        'topic',
        'datavaluealt',
        'topicid',
        'datavaluetypeid',
        'stratificationcategoryid1',
        'stratificationid1',
        'lowconfidencelimit',
        'highconfidencelimit',
        'datavaluefootnotesymbol',
        'datavaluefootnote',
    ]
    if is_redundant.get(column, True)
]
filtered_asthma_df = filtered_asthma_df.drop(columns=columns_to_drop)
print(f"The following columns {columns_to_drop} were dropped")

Are the columns datavalue and datavaluealt end the same?
True
Are the columns yearstart and year end the same?
True
The following columns ['yearend', 'topic', 'datavaluealt', 'topicid', 'datavaluetypeid', 'stratificationcategoryid1', 'stratificationid1', 'lowconfidencelimit', 'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'] were dropped


In [267]:
# Give the key columns the shorter names used in the database tables.
filtered_asthma_df = filtered_asthma_df.rename(
    columns={
        "yearstart": "year",
        "locationabbr": "state_id",
        "locationdesc": "state",
    }
)

In [268]:
# Count the missing values in each column.
filtered_asthma_df.isnull().sum()


year                           0
state_id                       0
state                          0
datasource                     0
question                       0
datavaluetype                  0
datavalue                  17259
stratificationcategory1        0
stratification1                0
locationid                     0
questionid                     0
datavalueunit               7548
dtype: int64

In [269]:
# Discard the rows that have no reported data value.
has_value = filtered_asthma_df['datavalue'].notna()
filtered_asthma_df = filtered_asthma_df[has_value]
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [270]:
# Verify that no missing data values remain in 'datavalue'
# (missing 'datavalueunit' entries are left in place).
filtered_asthma_df.isnull().sum()

year                          0
state_id                      0
state                         0
datasource                    0
question                      0
datavaluetype                 0
datavalue                     0
stratificationcategory1       0
stratification1               0
locationid                    0
questionid                    0
datavalueunit              2827
dtype: int64

In [271]:
# Confirm which questions remain after the filtering steps above.
filtered_asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years'], dtype=object)

In [272]:
# Exclude the territories (PR, GU, VI) and the nation-wide aggregate (US).
state_to_drop = ['PR','GU','US','VI']
is_kept_location = ~filtered_asthma_df['state_id'].isin(state_to_drop)
filtered_asthma_df = filtered_asthma_df.loc[is_kept_location]

filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [273]:
# Remove rows that are exact duplicates across every column
# (this drops them; it does not merely check for them).
filtered_asthma_df=filtered_asthma_df.drop_duplicates()
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [279]:
# Sanity check: list any rows that share every descriptive column (everything
# except 'datavalue'), i.e. cases where the same year / state / question /
# stratification carries more than one data value. An empty result is expected.
duplicateRowsDF = filtered_asthma_df[
    filtered_asthma_df.duplicated(
        subset=[
            'year',
            'state_id',
            'state',
            'datasource',
            'question',
            'datavaluetype',
            'stratificationcategory1',
            'stratification1',
            'locationid',
            'questionid',
            'datavalueunit',
        ],
        keep=False,
    )
]
duplicateRowsDF

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit


In [282]:
# Retain only the 'Overall' stratification (drops the gender and
# race/ethnicity breakdowns).
is_overall = filtered_asthma_df['stratificationcategory1'].eq('Overall')
filtered_asthma_df = filtered_asthma_df.loc[is_overall]
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
6,2011,HI,Hawaii,NVSS,Asthma mortality rate,Number,26,Overall,Overall,15,AST4_1,
7,2013,HI,Hawaii,NVSS,Asthma mortality rate,Number,24,Overall,Overall,15,AST4_1,
12,2012,TX,Texas,NVSS,Asthma mortality rate,Number,206,Overall,Overall,48,AST4_1,
14,2015,WA,Washington,NVSS,Asthma mortality rate,Number,88,Overall,Overall,53,AST4_1,
21,2019,NY,New York,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12,Overall,Overall,36,AST1_2,%
...,...,...,...,...,...,...,...,...,...,...,...,...
37524,2013,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.7,Overall,Overall,16,AST1_1,%
37536,2019,NC,North Carolina,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.2,Overall,Overall,37,AST1_1,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [286]:
# Final trim: keep only year, state_id, datavaluetype, datavalue and
# datavalueunit for the database table.
# NOTE(review): 'question' and 'questionid' are removed here, so rows coming
# from different questions that share a value type (e.g. the two prevalence
# questions, both 'Crude Prevalence' in '%') can no longer be told apart in
# the stored table — confirm this is intended.
filtered_asthma_df = filtered_asthma_df.drop(
    columns=[
        'state',
        'datasource',
        'question',
        'stratificationcategory1',
        'stratification1',
        'locationid',
        'questionid',
    ]
)
filtered_asthma_df

Unnamed: 0,year,state_id,datavaluetype,datavalue,datavalueunit
6,2011,HI,Number,26,
7,2013,HI,Number,24,
12,2012,TX,Number,206,
14,2015,WA,Number,88,
21,2019,NY,Crude Prevalence,12,%
...,...,...,...,...,...
37524,2013,ID,Age-adjusted Prevalence,8.7,%
37536,2019,NC,Age-adjusted Prevalence,8.2,%
37549,2013,FL,Crude Prevalence,10,%
37550,2014,MO,Crude Prevalence,9.7,%


In [290]:
# Write the cleaned asthma frame to the 'asthma' table; the helper returns the
# rows it reads back, which confirms that data were actually recorded.
add_table_to_db('asthma',filtered_asthma_df, conn)

[(6, '2011', 'HI', 'Number', '26', None),
 (7, '2013', 'HI', 'Number', '24', None),
 (12, '2012', 'TX', 'Number', '206', None),
 (14, '2015', 'WA', 'Number', '88', None),
 (21, '2019', 'NY', 'Crude Prevalence', '12', '%'),
 (22, '2011', 'VA', 'Crude Prevalence', '11.9', '%'),
 (24, '2014', 'GA', 'Number', '96', None),
 (31, '2013', 'IN', 'Number', '69', None),
 (34, '2012', 'CA', 'Number', '384', None),
 (41, '2013', 'NC', 'Number', '112', None),
 (42, '2013', 'WA', 'Number', '87', None),
 (45, '2016', 'NY', 'Number', '273', None),
 (58, '2017', 'IA', 'Number', '32', None),
 (69, '2017', 'AR', 'Crude Rate', '10.7', 'cases per 1,000,000'),
 (74, '2013', 'AL', 'Crude Rate', '8.7', 'cases per 1,000,000'),
 (81, '2015', 'CT', 'Crude Rate', '13.4', 'cases per 1,000,000'),
 (83, '2011', 'CO', 'Crude Rate', '8.2', 'cases per 1,000,000'),
 (84, '2014', 'IL', 'Age-adjusted Rate', '12.2', 'cases per 1,000,000'),
 (85, '2015', 'IA', 'Crude Rate', '13.1', 'cases per 1,000,000'),
 (92, '2013', 'CT'

# ETL of [Air Data](https://www.epa.gov/outdoor-air-quality-data)

In [None]:
# final table needs to have: year, state_id, air_quality_indexes

In [299]:
#filtered_asthma_df['year'].unique()

array(['2011', '2013', '2012', '2015', '2019', '2014', '2016', '2017',
       '2010', '2018'], dtype=object)

In [332]:
# NOTE(review): this cell is commented out although its recorded output (two
# 200 status codes) is still shown — presumably disabled to avoid re-querying
# the EPA AQS API; confirm, then either restore it or delete it.
# As written it only requests the years 2011 and 2012 for a single hard-coded
# state code (state=37), using the credentials imported from `config`.
#res_list = []
#for year in ['2011','2012']:
#    url= f'https://aqs.epa.gov/data/api/annualData/byState?email={userID}&key={key}&param=81102,81104&bdate={year}0101&edate={year}1231&state=37'
#    r = requests.get(url)
#    print(r.status_code)
#    res_list += [r.json()]

200
200


In [345]:
# NOTE(review): disabled cell that builds one DataFrame per API response and
# concatenates them. The concat keeps each response's own index, so index
# values repeat in `air_df` (visible in the recorded outputs further down);
# consider `ignore_index=True` if this cell is restored.
#air_df_list = []

#for res in res_list:
#    air_df_list += [pd.DataFrame(res['Data'])]
#air_df = pd.concat(air_df_list)
#air_df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,fiftieth_percentile,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,37,117,1,81102,1,35.81066,-76.9063,WGS84,PM10 Total 0-10um STP,24 HOUR,...,9.0,9.0,Jamesville School,1210 Hayes Street,North Carolina,Martin,Jamesville,,,2020-05-21
1,37,123,1,81102,1,35.263165,-79.836636,NAD83,PM10 Total 0-10um STP,24 HOUR,...,12.0,6.0,Candor: EPA CASTNet Site,126 PERRY DRIVE,North Carolina,Montgomery,Candor,,,2020-05-21
2,37,183,14,81102,7,35.856111,-78.574167,WGS84,PM10 Total 0-10um STP,24 HOUR,...,14.0,7.0,Millbrook School,3801 SPRING FOREST RD.,North Carolina,Wake,Raleigh,39580.0,"Raleigh, NC",2020-05-21
3,37,111,4,81102,1,35.687406,-81.993808,WGS84,PM10 Total 0-10um STP,24 HOUR,...,14.0,6.0,Marion Sch.,676 State Street,North Carolina,McDowell,Marion,32000.0,"Marion, NC",2020-05-21
4,37,35,4,81102,2,35.728889,-81.365556,WGS84,PM10 Total 0-10um STP,24 HOUR,...,13.0,7.0,Hickory Water Tower,1650 1ST STREET,North Carolina,Catawba,Hickory,25860.0,"Hickory-Lenoir-Morganton, NC",2020-05-21
5,37,35,4,81102,1,35.728889,-81.365556,WGS84,PM10 Total 0-10um STP,24 HOUR,...,13.0,5.0,Hickory Water Tower,1650 1ST STREET,North Carolina,Catawba,Hickory,25860.0,"Hickory-Lenoir-Morganton, NC",2020-05-21
6,37,81,13,81102,1,36.109006,-79.802314,NAD83,PM10 Total 0-10um STP,24 HOUR,...,9.0,5.0,Mendenhall School,205 WILOUGHBY BLVD,North Carolina,Guilford,Greensboro,24660.0,"Greensboro-High Point, NC",2020-05-21
7,37,51,9,81102,1,35.041416,-78.953112,WGS84,PM10 Total 0-10um STP,24 HOUR,...,13.0,6.0,William Owen School,4533 RAEFORD RD,North Carolina,Cumberland,Fayetteville,22180.0,"Fayetteville, NC",2020-05-21
8,37,63,15,81102,7,36.032955,-78.904037,NAD83,PM10 Total 0-10um STP,24 HOUR,...,14.0,8.0,Durham Armory,801 STADIUM DRIVE,North Carolina,Durham,Durham,20500.0,"Durham-Chapel Hill, NC",2020-05-21
9,37,119,3,81102,2,35.251717,-80.824717,WGS84,PM10 Total 0-10um STP,24 HOUR,...,21.0,9.0,#11 Fire Station,FIRE STA #11 620 WEST 28TH STREET,North Carolina,Mecklenburg,Charlotte,16740.0,"Charlotte-Concord-Gastonia, NC-SC",2020-05-21


In [346]:
#air_df['parameter'].unique()

array(['PM10 Total 0-10um STP'], dtype=object)

In [347]:
#air_df.columns

Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'sample_duration',
       'pollutant_standard', 'metric_used', 'method', 'year',
       'units_of_measure', 'event_type', 'observation_count',
       'observation_percent', 'validity_indicator', 'valid_day_count',
       'required_day_count', 'exceptional_data_count',
       'null_observation_count', 'primary_exceedance_count',
       'secondary_exceedance_count', 'certification_indicator',
       'arithmetic_mean', 'standard_deviation', 'first_max_value',
       'first_max_datetime', 'second_max_value', 'second_max_datetime',
       'third_max_value', 'third_max_datetime', 'fourth_max_value',
       'fourth_max_datetime', 'first_max_nonoverlap_value',
       'first_max_n_o_datetime', 'second_max_nonoverlap_value',
       'second_max_n_o_datetime', 'ninety_ninth_percentile',
       'ninety_eighth_percentile', 'ninety_fifth_percentile',
       'ninetieth_perc

In [349]:
#air_df['observation_percent']

0       2.0
1      85.0
2      98.0
3      80.0
4      93.0
5      97.0
6      93.0
7     100.0
8      91.0
9      90.0
10     95.0
11     98.0
12     97.0
13     98.0
14     79.0
15     51.0
16     80.0
17     51.0
0      98.0
1      98.0
2      98.0
3      96.0
4      96.0
5      97.0
6      98.0
7      95.0
8     100.0
9      90.0
10     82.0
11     84.0
12     16.0
13     98.0
14     16.0
15    100.0
Name: observation_percent, dtype: float64

In [291]:
# Close the connection to the SQLite database opened at the top of the notebook.
conn.close()