In [1]:
import glob
import pandas as pd
import sqlite3
from pymongo import MongoClient
from sodapy import Socrata
import requests
from math import ceil
import glob

In [2]:
#connecting to the sqlite database
conn = sqlite3.connect('health_air.db')
#connecting to the MongoDB Database
mongo_client = MongoClient()
#creating the MongoDB
mydb = mongo_client['health_air_mongo']

In [3]:
#function to add a table to the database
def add_table_to_db(table_name, df, connection_db):
    cur = conn.cursor()
    df.to_sql(name=table_name, if_exists='replace', con=conn)
    return(cur.execute(f"""SELECT * FROM {table_name};""").fetchall())

In [4]:
def add_collection_to_mongo (collection_name, df, mongo_db_conn = mydb):
    mongo_db_conn[collection_name].insert_many(df.to_dict('records'))
    return print(f'{collection_name} stored in the MongoDB database named {mydb}')

# ETL of [Asthma data](https://chronicdata.cdc.gov/Chronic-Disease-Indicators/U-S-Chronic-Disease-Indicators-Asthma/us8e-ubyj)

In [5]:
# Instruction from the CDC website to extract data
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)

# Return as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get_all("us8e-ubyj")

# Convert to pandas DataFrame
asthma_df = pd.DataFrame.from_records(results)



In [6]:
asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64714,2011,2011,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...
64715,2016,2016,TX,Texas,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,MRC,%,,,****,Sample size of denominator and/or age group fo...
64716,2019,2019,WI,Wisconsin,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Age-adjusted Prevalence,,,...,AST,AST6_2,AGEADJPREV,RACE,OTH,%,,,****,Sample size of denominator and/or age group fo...
64717,2014,2014,PR,Puerto Rico,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,Crude Prevalence,,,...,AST,AST6_2,CRDPREV,GENDER,GENM,%,,,****,Sample size of denominator and/or age group fo...


In [7]:
#printing the names of the dataframe columns
asthma_df.columns

Index(['yearstart', 'yearend', 'locationabbr', 'locationdesc', 'datasource',
       'topic', 'question', 'datavaluetype', 'datavalue', 'datavaluealt',
       'stratificationcategory1', 'stratification1', 'locationid', 'topicid',
       'questionid', 'datavaluetypeid', 'stratificationcategoryid1',
       'stratificationid1', 'datavalueunit', 'lowconfidencelimit',
       'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'],
      dtype='object')

In [8]:
# printing the question the data are answering
asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma'],
      dtype=object)

In [9]:
#list containing the questions of interest for our project
questions_OI = ['Asthma mortality rate',\
                'Emergency department visit rate for asthma',\
                'Hospitalizations for asthma',\
                'Current asthma prevalence among adults aged >= 18 years',\
                'Asthma prevalence among women aged 18-44 years']


In [10]:
#retrieving the unique ID per question so that we address eventual spelling mistakes
questionids_list=[]
for question in questions_OI:
    questionids_list += list(asthma_df[asthma_df['question']==question]['questionid'].unique())
print(questionids_list)

['AST4_1', 'AST2_1', 'AST3_1', 'AST1_1', 'AST1_2']


In [11]:
# filtering the dataset for the questions of interest
filtered_asthma_df = asthma_df[asthma_df['questionid'].isin(questionids_list)]
filtered_asthma_df

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,topic,question,datavaluetype,datavalue,datavaluealt,...,topicid,questionid,datavaluetypeid,stratificationcategoryid1,stratificationid1,datavalueunit,lowconfidencelimit,highconfidencelimit,datavaluefootnotesymbol,datavaluefootnote
0,2012,2012,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,37,37,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
1,2014,2014,AL,Alabama,NVSS,Asthma,Asthma mortality rate,Number,22,22,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
2,2011,2011,AZ,Arizona,NVSS,Asthma,Asthma mortality rate,Number,29,29,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
3,2015,2015,CT,Connecticut,NVSS,Asthma,Asthma mortality rate,Number,34,34,...,AST,AST4_1,NMBR,GENDER,GENF,,,,,
4,2011,2011,FL,Florida,NVSS,Asthma,Asthma mortality rate,Number,54,54,...,AST,AST4_1,NMBR,GENDER,GENM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,2019,ID,Idaho,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,7.9,...,AST,AST1_1,AGEADJPREV,GENDER,GENM,%,6.4,9.7,,
37548,2017,2017,GA,Georgia,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,12.2,...,AST,AST1_2,CRDPREV,RACE,WHT,%,8.8,16.8,,
37549,2013,2013,FL,Florida,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,10,...,AST,AST1_2,CRDPREV,OVERALL,OVR,%,8.6,11.5,,
37550,2014,2014,MO,Missouri,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,9.7,...,AST,AST1_1,CRDPREV,OVERALL,OVR,%,8.7,10.8,,


In [12]:
#checking if column 'yearstart' and 'yearend' are the same
print(f"Are the columns datavalue and datavaluealt end the same?\n{filtered_asthma_df['datavalue'].equals(filtered_asthma_df['datavaluealt'])}")
print(f"Are the columns yearstart and year end the same?\n{filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend'])}")
#if these columns are the same drop one of the duplicates and other not useful columns
if filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend']):
    columns_to_drop = ['yearend',\
                        'topic',\
                        'datavaluealt',\
                        'topicid',\
                        'datavaluetypeid',\
                        'stratificationcategoryid1',\
                        'stratificationid1',\
                        'lowconfidencelimit',\
                       'highconfidencelimit',\
                       'datavaluefootnotesymbol',\
                      'datavaluefootnote']
    filtered_asthma_df=filtered_asthma_df.drop(columns=columns_to_drop)
    print(f"The following columns {columns_to_drop} were dropped")

Are the columns datavalue and datavaluealt end the same?
True
Are the columns yearstart and year end the same?
True
The following columns ['yearend', 'topic', 'datavaluealt', 'topicid', 'datavaluetypeid', 'stratificationcategoryid1', 'stratificationid1', 'lowconfidencelimit', 'highconfidencelimit', 'datavaluefootnotesymbol', 'datavaluefootnote'] were dropped


In [13]:
#renaming some columns
filtered_asthma_df=filtered_asthma_df.rename(columns={"yearstart": "year","locationabbr":"state_id","locationdesc":"state"})

In [14]:
#checking for missing data
filtered_asthma_df.isnull().sum()


year                           0
state_id                       0
state                          0
datasource                     0
question                       0
datavaluetype                  0
datavalue                  17261
stratificationcategory1        0
stratification1                0
locationid                     0
questionid                     0
datavalueunit               7548
dtype: int64

In [15]:
# removing missing data values
filtered_asthma_df=filtered_asthma_df[filtered_asthma_df['datavalue'].isna()==False]
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [16]:
# checking that all the missing data value were correctly removed
filtered_asthma_df.isnull().sum()

year                          0
state_id                      0
state                         0
datasource                    0
question                      0
datavaluetype                 0
datavalue                     0
stratificationcategory1       0
stratification1               0
locationid                    0
questionid                    0
datavalueunit              2827
dtype: int64

In [17]:
filtered_asthma_df['question'].unique()

array(['Asthma mortality rate',
       'Emergency department visit rate for asthma',
       'Hospitalizations for asthma',
       'Current asthma prevalence among adults aged >= 18 years',
       'Asthma prevalence among women aged 18-44 years'], dtype=object)

In [18]:
# drop territories (PR, GU, VI) and nation-wide data (US)
state_to_drop = ['PR','GU','US','VI']
filtered_asthma_df = filtered_asthma_df[~filtered_asthma_df['state_id'].isin(state_to_drop)]

filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [19]:
#checking for duplicates
filtered_asthma_df=filtered_asthma_df.drop_duplicates()
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
0,2012,AL,Alabama,NVSS,Asthma mortality rate,Number,37,Gender,Female,1,AST4_1,
1,2014,AL,Alabama,NVSS,Asthma mortality rate,Number,22,Gender,Male,1,AST4_1,
2,2011,AZ,Arizona,NVSS,Asthma mortality rate,Number,29,Gender,Male,4,AST4_1,
3,2015,CT,Connecticut,NVSS,Asthma mortality rate,Number,34,Gender,Female,9,AST4_1,
4,2011,FL,Florida,NVSS,Asthma mortality rate,Number,54,Gender,Male,12,AST4_1,
...,...,...,...,...,...,...,...,...,...,...,...,...
37547,2019,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,7.9,Gender,Male,16,AST1_1,%
37548,2017,GA,Georgia,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12.2,Race/Ethnicity,"White, non-Hispanic",13,AST1_2,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [20]:
#checking that there isn't more than one data value entry for the same year and state.
duplicateRowsDF = filtered_asthma_df[filtered_asthma_df.duplicated(subset=['year','state_id','state','datasource','question','datavaluetype','datavaluetype','stratificationcategory1','stratification1','locationid','questionid','datavalueunit'], keep=False)]
duplicateRowsDF

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit


In [21]:
#keeping only the Overall values
filtered_asthma_df = filtered_asthma_df[filtered_asthma_df['stratificationcategory1'] == 'Overall']
filtered_asthma_df

Unnamed: 0,year,state_id,state,datasource,question,datavaluetype,datavalue,stratificationcategory1,stratification1,locationid,questionid,datavalueunit
6,2011,HI,Hawaii,NVSS,Asthma mortality rate,Number,26,Overall,Overall,15,AST4_1,
7,2013,HI,Hawaii,NVSS,Asthma mortality rate,Number,24,Overall,Overall,15,AST4_1,
12,2012,TX,Texas,NVSS,Asthma mortality rate,Number,206,Overall,Overall,48,AST4_1,
14,2015,WA,Washington,NVSS,Asthma mortality rate,Number,88,Overall,Overall,53,AST4_1,
21,2019,NY,New York,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,12,Overall,Overall,36,AST1_2,%
...,...,...,...,...,...,...,...,...,...,...,...,...
37524,2013,ID,Idaho,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.7,Overall,Overall,16,AST1_1,%
37536,2019,NC,North Carolina,BRFSS,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.2,Overall,Overall,37,AST1_1,%
37549,2013,FL,Florida,BRFSS,Asthma prevalence among women aged 18-44 years,Crude Prevalence,10,Overall,Overall,12,AST1_2,%
37550,2014,MO,Missouri,BRFSS,Current asthma prevalence among adults aged >=...,Crude Prevalence,9.7,Overall,Overall,29,AST1_1,%


In [22]:
# final dropping of unnecessary columns
filtered_asthma_df=filtered_asthma_df.drop(columns=['state',\
                                                    'datasource',\
                                                    'question',\
                                                    'stratificationcategory1',\
                                                    'stratification1',\
                                                    'locationid',\
                                                    'questionid'])
filtered_asthma_df

Unnamed: 0,year,state_id,datavaluetype,datavalue,datavalueunit
6,2011,HI,Number,26,
7,2013,HI,Number,24,
12,2012,TX,Number,206,
14,2015,WA,Number,88,
21,2019,NY,Crude Prevalence,12,%
...,...,...,...,...,...
37524,2013,ID,Age-adjusted Prevalence,8.7,%
37536,2019,NC,Age-adjusted Prevalence,8.2,%
37549,2013,FL,Crude Prevalence,10,%
37550,2014,MO,Crude Prevalence,9.7,%


In [23]:
# adding collection about asthma to MongoDB
add_collection_to_mongo('asthma',filtered_asthma_df)

asthma stored in the MongoDB database named Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'health_air_mongo')


In [24]:
# adding the asthma table to the database and controlling that there are data recorded.
add_table_to_db('asthma',filtered_asthma_df, conn)

[(6, '2011', 'HI', 'Number', '26', None),
 (7, '2013', 'HI', 'Number', '24', None),
 (12, '2012', 'TX', 'Number', '206', None),
 (14, '2015', 'WA', 'Number', '88', None),
 (21, '2019', 'NY', 'Crude Prevalence', '12', '%'),
 (22, '2011', 'VA', 'Crude Prevalence', '11.9', '%'),
 (24, '2014', 'GA', 'Number', '96', None),
 (31, '2013', 'IN', 'Number', '69', None),
 (34, '2012', 'CA', 'Number', '384', None),
 (41, '2013', 'NC', 'Number', '112', None),
 (42, '2013', 'WA', 'Number', '87', None),
 (45, '2016', 'NY', 'Number', '273', None),
 (58, '2017', 'IA', 'Number', '32', None),
 (69, '2017', 'AR', 'Crude Rate', '10.7', 'cases per 1,000,000'),
 (74, '2013', 'AL', 'Crude Rate', '8.7', 'cases per 1,000,000'),
 (81, '2015', 'CT', 'Crude Rate', '13.4', 'cases per 1,000,000'),
 (83, '2011', 'CO', 'Crude Rate', '8.2', 'cases per 1,000,000'),
 (84, '2014', 'IL', 'Age-adjusted Rate', '12.2', 'cases per 1,000,000'),
 (85, '2015', 'IA', 'Crude Rate', '13.1', 'cases per 1,000,000'),
 (92, '2013', 'CT'

# ETL of [Air Data](https://www.epa.gov/outdoor-air-quality-data)

In [25]:
#finding the name of all the files in the air quality folder
all_files = glob.glob('./Air_quality_csv/*.csv')

air_list = []
#reading all the CSV files
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    air_list.append(df)
#creating the concatenated dataframe
air_df = pd.concat(air_list, axis=0, ignore_index=True)


In [26]:
air_df

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10
0,Alabama,Baldwin,2018,270,245,25,0,0,0,0,97,50,35,0,0,214,0,56,0
1,Alabama,Clay,2018,110,103,7,0,0,0,0,64,45,27,0,0,0,0,110,0
2,Alabama,Colbert,2018,277,251,26,0,0,0,0,93,50,35,0,0,209,0,68,0
3,Alabama,DeKalb,2018,350,316,34,0,0,0,0,84,50,35,0,0,317,0,33,0
4,Alabama,Elmore,2018,222,203,19,0,0,0,0,71,49,33,0,0,222,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10659,Wyoming,Sublette,2010,365,250,113,2,0,0,0,112,67,47,0,2,347,0,16,0
10660,Wyoming,Sweetwater,2010,365,254,108,3,0,0,0,112,66,45,0,6,313,1,7,38
10661,Wyoming,Teton,2010,365,281,84,0,0,0,0,90,61,44,4,0,355,0,4,2
10662,Wyoming,Uinta,2010,364,286,76,2,0,0,0,135,61,44,0,3,350,0,0,11


In [27]:
# there is the need to aggregate the data by state and year
# The output is the average number of days that were over the law limits per parameter of interest (i.e. PMx)
# The average median and maximum API per state in a year is evaluated as well.
aggregate_air_df = air_df.groupby(by=['State','Year']).mean()
air_df = aggregate_air_df.reset_index()

In [28]:
#printing the names of the dataframe columns
air_df.columns

Index(['State', 'Year', 'Days with AQI', 'Good Days', 'Moderate Days',
       'Unhealthy for Sensitive Groups Days', 'Unhealthy Days',
       'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10'],
      dtype='object')

In [29]:
#dropping not useful column 
columns_to_drop = ['Days with AQI',\
                   'Good Days',\
                   'Moderate Days',\
                   'Unhealthy for Sensitive Groups Days',\
                   'Unhealthy Days',\
                   'Very Unhealthy Days',\
                   'Hazardous Days',\
                   '90th Percentile AQI',
                   'Days CO',\
                   'Days NO2',\
                   'Days SO2',]
air_df=air_df.drop(columns=columns_to_drop)

In [30]:
air_df

Unnamed: 0,State,Year,Max AQI,Median AQI,Days Ozone,Days PM2.5,Days PM10
0,Alabama,2010,112.263158,46.157895,131.000000,142.105263,8.894737
1,Alabama,2011,111.526316,42.789474,158.631579,84.894737,4.736842
2,Alabama,2012,110.235294,40.176471,174.529412,85.117647,3.529412
3,Alabama,2013,86.055556,37.166667,173.166667,84.055556,3.222222
4,Alabama,2014,93.333333,38.166667,173.055556,83.611111,3.277778
...,...,...,...,...,...,...,...
535,Wyoming,2015,110.263158,34.789474,235.894737,29.315789,42.052632
536,Wyoming,2016,95.944444,37.277778,266.555556,16.055556,44.444444
537,Wyoming,2017,169.411765,38.470588,259.352941,18.882353,42.705882
538,Wyoming,2018,107.777778,39.555556,269.444444,14.722222,41.666667


In [31]:
#converting the number of days in integers because as float they do not make too much sense
for column in air_df.columns:
    if air_df[column].dtype == float:
        air_df[column]= air_df[column].astype('int64')

In [32]:
# adding collection about asthma to sqlite
add_table_to_db('air', air_df, conn)

  sql.to_sql(


[(0, 'Alabama', 2010, 112, 46, 131, 142, 8),
 (1, 'Alabama', 2011, 111, 42, 158, 84, 4),
 (2, 'Alabama', 2012, 110, 40, 174, 85, 3),
 (3, 'Alabama', 2013, 86, 37, 173, 84, 3),
 (4, 'Alabama', 2014, 93, 38, 173, 83, 3),
 (5, 'Alabama', 2015, 105, 39, 137, 145, 3),
 (6, 'Alabama', 2016, 111, 39, 156, 116, 3),
 (7, 'Alabama', 2017, 104, 38, 153, 125, 3),
 (8, 'Alabama', 2018, 99, 36, 159, 131, 0),
 (9, 'Alabama', 2019, 88, 38, 163, 124, 0),
 (10, 'Alaska', 2010, 88, 19, 59, 191, 9),
 (11, 'Alaska', 2011, 98, 22, 75, 176, 20),
 (12, 'Alaska', 2012, 96, 22, 95, 152, 51),
 (13, 'Alaska', 2013, 108, 20, 52, 176, 52),
 (14, 'Alaska', 2014, 133, 23, 68, 163, 38),
 (15, 'Alaska', 2015, 99, 23, 78, 157, 17),
 (16, 'Alaska', 2016, 88, 24, 105, 150, 28),
 (17, 'Alaska', 2017, 92, 22, 98, 158, 25),
 (18, 'Alaska', 2018, 84, 22, 87, 166, 24),
 (19, 'Alaska', 2019, 155, 19, 53, 182, 37),
 (20, 'Arizona', 2010, 238, 46, 180, 22, 79),
 (21, 'Arizona', 2011, 314, 47, 167, 27, 106),
 (22, 'Arizona', 2012,

In [33]:
#removing the point from the key for handling the dataset in MongoDb without issues
air_df=air_df.rename(columns={"Days PM2.5": "Days PM2_5"})
air_df

Unnamed: 0,State,Year,Max AQI,Median AQI,Days Ozone,Days PM2_5,Days PM10
0,Alabama,2010,112,46,131,142,8
1,Alabama,2011,111,42,158,84,4
2,Alabama,2012,110,40,174,85,3
3,Alabama,2013,86,37,173,84,3
4,Alabama,2014,93,38,173,83,3
...,...,...,...,...,...,...,...
535,Wyoming,2015,110,34,235,29,42
536,Wyoming,2016,95,37,266,16,44
537,Wyoming,2017,169,38,259,18,42
538,Wyoming,2018,107,39,269,14,41


In [34]:
# adding collection about asthma to MongoDB
add_collection_to_mongo('air',air_df)

air stored in the MongoDB database named Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'health_air_mongo')


In [35]:
#closing the connections to sqlite and MongoDB
conn.close()
mongo_client.close()