In [2]:
import sqlite3
import pandas as pd
import numpy as np
from IPython.display import display

In [3]:
# Problem Formulation
# TODO: write the problem formulation.

In [15]:
# Data Acquisition
# Dataset #1: Respiratory-related hospitalizations data - CDC
# Dataset #2: Air Quality Index data - EPA (one CSV file per calendar year)
# Both datasets are publicly available and downloadable as CSV files.

# The CDC file is large. low_memory=False makes pandas parse it in a single
# pass instead of in chunks, which avoids mixed-dtype column inference.
# NOTE(review): the CDC file is read from the notebook's working directory
# while the AQI files come from ../data/raw/AQI - confirm this is intended.
df_cdc = pd.read_csv('CDC-Hospitalizations.csv', low_memory=False)

# Load every yearly AQI file the same way and tag each row with its source
# year. The AQI data is logged daily, so an explicit year column is handy later.
# NOTE: this is a light transformation of the raw data; confirm it is
# acceptable for Milestone 1.
aqi_years = (2025, 2024, 2023, 2022, 2021)
aqi_by_year = {
    year: pd.read_csv(f'../data/raw/AQI/daily_aqi_by_county_{year}.csv').assign(year=year)
    for year in aqi_years
}

# Per-year names are kept because the database-loading cell refers to them.
df_aqi_2025 = aqi_by_year[2025]
df_aqi_2024 = aqi_by_year[2024]
df_aqi_2023 = aqi_by_year[2023]
df_aqi_2022 = aqi_by_year[2022]
df_aqi_2021 = aqi_by_year[2021]

# Preview the CDC data plus the newest and oldest AQI years.
print("Respiratory-related Hospitalizations Dataset - CDC")
display(df_cdc.head(5))
print("\nAir Quality Index 2025 Dataset - EPA\n")
display(df_aqi_2025.head(5))
print("\nAir Quality Index 2021 Dataset - EPA\n")
display(df_aqi_2021.head(5))

Respiratory-related Hospitalizations Dataset - CDC


Unnamed: 0,Week Ending Date,Geographic aggregation,Number of Inpatient Beds,Number of Adult Inpatient Beds,Number of Pediatric Inpatient beds,Number of Inpatient Beds Occupied,Number of Adult Inpatient Beds Occupied,Number of Pediatric Inpatient Beds Occupied,Number of ICU Beds,Number of Adult ICU Beds,...,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Adult Patients with Influenza in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Pediatric Patients with Influenza in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Patients with RSV in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Adult Patients with RSV in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Pediatric Patients with RSV in the ICU from Prior Week,Respiratory Virus Season,Cumulative Seasonal Total Confirmed New COVID-19 Admissions,Cumulative Seasonal Total Confirmed New Influenza Admissions,Cumulative Seasonal Total Confirmed New RSV Admissions,"Cumulative Seasonal Total Confirmed New Respiratory Admissions (COVID-19, Influenza, and RSV)"
0,2025-10-04,AK,1417,1237,180,1084,962,122,189,113,...,0.0,0.0,0.0,0.0,0.0,2025-2026,5,2,1,8
1,2025-10-11,AK,1404,1230,174,1050,944,106,182,105,...,0.0,0.0,0.0,0.0,0.0,2025-2026,9,2,1,12
2,2025-10-18,AK,1412,1229,183,1051,943,108,189,108,...,-4.17,-4.17,-4.17,-4.17,-4.17,2025-2026,14,2,2,18
3,2025-10-25,AK,1401,1215,186,1064,945,119,198,114,...,4.17,4.17,4.17,4.17,4.17,2025-2026,16,3,2,21
4,2025-11-01,AK,1425,1234,190,1078,946,131,192,106,...,0.0,0.0,0.0,0.0,0.0,2025-2026,26,3,5,34



Air Quality Index 2025 Dataset - EPA



Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,year
0,Alabama,Baldwin,1,3,2025-01-01,20,Good,PM2.5,01-003-0010,1,2025
1,Alabama,Baldwin,1,3,2025-01-02,37,Good,PM2.5,01-003-0010,1,2025
2,Alabama,Baldwin,1,3,2025-01-03,52,Moderate,PM2.5,01-003-0010,1,2025
3,Alabama,Baldwin,1,3,2025-01-04,31,Good,PM2.5,01-003-0010,1,2025
4,Alabama,Baldwin,1,3,2025-01-05,31,Good,PM2.5,01-003-0010,1,2025



Air Quality Index 2021 Dataset - EPA



Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,year
0,Alabama,Baldwin,1,3,2021-01-01,36,Good,PM2.5,01-003-0010,1,2021
1,Alabama,Baldwin,1,3,2021-01-07,32,Good,PM2.5,01-003-0010,1,2021
2,Alabama,Baldwin,1,3,2021-01-13,55,Moderate,PM2.5,01-003-0010,1,2021
3,Alabama,Baldwin,1,3,2021-01-16,28,Good,PM2.5,01-003-0010,1,2021
4,Alabama,Baldwin,1,3,2021-01-19,58,Moderate,PM2.5,01-003-0010,1,2021


In [17]:
# Data Acquisition II
# Store both datasets in one SQLite database file so they can be queried with SQL.

# Trimmed CDC table. This is just a test of consolidating the wide raw CDC
# table into a smaller, cleaner format; only two columns are kept for now.
create_cdc_table_sql = '''
CREATE TABLE IF NOT EXISTS cdc_data (
    week TEXT,
    state_name TEXT
)
'''

# Copies the two columns of interest out of the raw CDC table.
populate_cdc_table_sql = '''
INSERT INTO cdc_data (week, state_name)
SELECT
    "Week Ending Date" as week,
    "Geographic aggregation" as state_name
FROM cdc_raw_data
'''

# Manual schema for the AQI data.
create_aqi_table_sql = '''
CREATE TABLE IF NOT EXISTS aqi_data (
    state_name TEXT,
    county_name TEXT,
    date TEXT,
    aqi INTEGER,
    category TEXT,
    defining_parameter TEXT,
    defining_site TEXT,
    sites_reporting INTEGER,
    year INTEGER
)
'''

# Source CSV column -> aqi_data column. Columns not listed here are dropped.
aqi_column_names = {
    'State Name': 'state_name',
    'county Name': 'county_name',
    'Date': 'date',
    'AQI': 'aqi',
    'Category': 'category',
    'Defining Parameter': 'defining_parameter',
    'Defining Site': 'defining_site',
    'Number of Sites Reporting': 'sites_reporting',
    'year': 'year',
}

conn = sqlite3.connect('combined_datasets.db')
try:
    cursor = conn.cursor()

    # Start from a clean slate so re-running this cell never appends duplicate
    # rows. 'cdc_raw_data' is the table actually created below; 'cdc_data_raw'
    # and 'epa_data' are never created in this notebook and are presumably
    # leftovers from earlier table names, so they are still cleaned up.
    # Table names are fixed literals, so interpolating them is safe.
    for table_name in ('cdc_raw_data', 'cdc_data_raw', 'cdc_data', 'epa_data', 'aqi_data'):
        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")

    # The raw CDC dataset is very wide, so it is loaded as-is and pandas
    # infers the schema; the hand-written schema is only used for cdc_data.
    df_cdc.to_sql('cdc_raw_data', conn, if_exists='replace', index=False)

    cursor.execute(create_cdc_table_sql)
    conn.commit()

    cursor.execute(populate_cdc_table_sql)
    conn.commit()
    print(pd.read_sql("SELECT * FROM cdc_data LIMIT 5", conn))

    # Adding AQI dataset to the database w/ manual schema
    cursor.execute(create_aqi_table_sql)
    conn.commit()

    # Stack all years into one frame, then keep and rename only the columns
    # that exist in the aqi_data schema.
    df_aqi_combined = pd.concat([df_aqi_2025, df_aqi_2024, df_aqi_2023, df_aqi_2022, df_aqi_2021], ignore_index=True)
    df_aqi_combined_filtered = df_aqi_combined[list(aqi_column_names)].rename(columns=aqi_column_names)

    # 'append' keeps the hand-written schema instead of letting pandas replace it.
    df_aqi_combined_filtered.to_sql('aqi_data', conn, if_exists='append', index=False)
    print(pd.read_sql("SELECT * FROM aqi_data LIMIT 5", conn))
finally:
    # Always release the database file, even if a statement above fails.
    conn.close()

         week state_name
0  2025-10-04         AK
1  2025-10-11         AK
2  2025-10-18         AK
3  2025-10-25         AK
4  2025-11-01         AK
  state_name county_name        date  aqi  category defining_parameter  \
0    Alabama     Baldwin  2025-01-01   20      Good              PM2.5   
1    Alabama     Baldwin  2025-01-02   37      Good              PM2.5   
2    Alabama     Baldwin  2025-01-03   52  Moderate              PM2.5   
3    Alabama     Baldwin  2025-01-04   31      Good              PM2.5   
4    Alabama     Baldwin  2025-01-05   31      Good              PM2.5   

  defining_site  sites_reporting  year  
0   01-003-0010                1  2025  
1   01-003-0010                1  2025  
2   01-003-0010                1  2025  
3   01-003-0010                1  2025  
4   01-003-0010                1  2025  


In [18]:
# Check to see if database file is accurate:
# list the tables it contains and peek at the first rows of each cleaned table.

# Names of all tables stored in the database file.
tables_query = """
SELECT name
FROM sqlite_master
WHERE type='table';
"""

# First two rows of the trimmed CDC table.
cdc_query = """
SELECT *
FROM cdc_data
LIMIT 2
"""

# First two rows of the AQI table.
aqi_query = """
SELECT *
FROM aqi_data
LIMIT 2
"""

conn = sqlite3.connect("combined_datasets.db")
try:
    tables = pd.read_sql(tables_query, conn)
    print(tables, "\n")

    cdc_test = pd.read_sql(cdc_query, conn)
    print(cdc_test, "\n")

    aqi_test = pd.read_sql(aqi_query, conn)
    print(aqi_test)
finally:
    # Close the connection even if one of the queries fails.
    conn.close()

           name
0  cdc_raw_data
1      cdc_data
2      aqi_data 

         week state_name
0  2025-10-04         AK
1  2025-10-11         AK 

  state_name county_name        date  aqi category defining_parameter  \
0    Alabama     Baldwin  2025-01-01   20     Good              PM2.5   
1    Alabama     Baldwin  2025-01-02   37     Good              PM2.5   

  defining_site  sites_reporting  year  
0   01-003-0010                1  2025  
1   01-003-0010                1  2025  


In [5]:
# Data Exploration
# TODO: add the data exploration.