In [12]:
import sqlite3
import pandas as pd
import numpy as np
from IPython.display import display

In [22]:
# Data Acquisition
# Dataset #1: Respiratory-related hospitalizations data - CDC
# Dataset #2: Air Quality Index Data - EPA
# Both datasets are publically available and downloadable as CSV files

# Since cdc dataset is so large, turn off low memory to ensure it loads correctly
df_cdc = pd.DataFrame(pd.read_csv('CDC-Hospitalizations.csv', low_memory=False))
df_epa = pd.DataFrame(pd.read_csv('EPA-AQI-County-2025.csv'))

print("Respiratory-related Hospitalizations Dataset - CDC")
display(df_cdc.head(5))
print("\nAir Quality Index Dataset - EPA\n")
display(df_epa.head(5))

Respiratory-related Hospitalizations Dataset - CDC


Unnamed: 0,Week Ending Date,Geographic aggregation,Number of Inpatient Beds,Number of Adult Inpatient Beds,Number of Pediatric Inpatient beds,Number of Inpatient Beds Occupied,Number of Adult Inpatient Beds Occupied,Number of Pediatric Inpatient Beds Occupied,Number of ICU Beds,Number of Adult ICU Beds,...,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Adult Patients with Influenza in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Pediatric Patients with Influenza in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Patients with RSV in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Adult Patients with RSV in the ICU from Prior Week,Absolute Change in the Percent Hospitals Reporting Percent Hospitalized Pediatric Patients with RSV in the ICU from Prior Week,Respiratory Virus Season,Cumulative Seasonal Total Confirmed New COVID-19 Admissions,Cumulative Seasonal Total Confirmed New Influenza Admissions,Cumulative Seasonal Total Confirmed New RSV Admissions,"Cumulative Seasonal Total Confirmed New Respiratory Admissions (COVID-19, Influenza, and RSV)"
0,2025-10-04,AK,1417,1237,180,1084,962,122,189,113,...,0.0,0.0,0.0,0.0,0.0,2025-2026,5,2,1,8
1,2025-10-11,AK,1404,1230,174,1050,944,106,182,105,...,0.0,0.0,0.0,0.0,0.0,2025-2026,9,2,1,12
2,2025-10-18,AK,1412,1229,183,1051,943,108,189,108,...,-4.17,-4.17,-4.17,-4.17,-4.17,2025-2026,14,2,2,18
3,2025-10-25,AK,1401,1215,186,1064,945,119,198,114,...,4.17,4.17,4.17,4.17,4.17,2025-2026,16,3,2,21
4,2025-11-01,AK,1425,1234,190,1078,946,131,192,106,...,0.0,0.0,0.0,0.0,0.0,2025-2026,26,3,5,34



Air Quality Index Dataset - EPA



Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,Alabama,Baldwin,1,3,2025-01-01,20,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2025-01-02,37,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2025-01-03,52,Moderate,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2025-01-04,31,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2025-01-05,31,Good,PM2.5,01-003-0010,1


In [60]:
# Data Acquisition II
conn = sqlite3.connect('combined_datasets.db')
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS cdc_data_raw")
cursor.execute("DROP TABLE IF EXISTS cdc_data")
cursor.execute("DROP TABLE IF EXISTS aqi_data")

# Adding CDC dataset to the database
# Since dataset is so large, manual schema will be d
df_cdc.to_sql('cdc_raw_data', conn, if_exists='replace', index=False)
# print(pd.read_sql("SELECT * FROM cdc_raw_data LIMIT 5", conn))

# Just a test to consolidate data from main cdc table to small, cleaner format
cursor.execute('''
CREATE TABLE IF NOT EXISTS cdc_data (
    week TEXT,
    state_name TEXT
)
''')
conn.commit()

cursor.execute('''
INSERT INTO cdc_data (week, state_name)
SELECT
    "Week Ending Date" as week,
    "Geographic aggregation" as state_name
FROM cdc_raw_data
''')
conn.commit()
print(pd.read_sql("SELECT * FROM cdc_data LIMIT 5", conn))

# Adding EPA dataset to the database w/ manual schema
cursor.execute('''
CREATE TABLE IF NOT EXISTS epa_data (
    state_name TEXT NOT NULL,
    county_name TEXT NOT NULL,
    date TEXT,
    aqi INTEGER,
    category TEXT,
    defining_parameter TEXT,
    defining_site TEXT,
    sites_reporting INTEGER
)
''')
conn.commit()

df_epa_filtered = df_epa[['State Name', 'county Name', 'Date', 'AQI', 'Category', 'Defining Parameter', 'Defining Site', 'Number of Sites Reporting']]
df_epa_filtered.columns = ['state_name', 'county_name', 'date', 'aqi', 'category', 'defining_parameter', 'defining_site', 'sites_reporting']
df_epa_filtered.to_sql('epa_data', conn, if_exists='append', index=False)
print(pd.read_sql("SELECT * FROM epa_data LIMIT 5", conn))

conn.close()

         week state_name
0  2025-10-04         AK
1  2025-10-11         AK
2  2025-10-18         AK
3  2025-10-25         AK
4  2025-11-01         AK
   id state_name county_name        date  aqi  category defining_parameter  \
0   1    Alabama     Baldwin  2025-01-01   20      Good              PM2.5   
1   2    Alabama     Baldwin  2025-01-02   37      Good              PM2.5   
2   3    Alabama     Baldwin  2025-01-03   52  Moderate              PM2.5   
3   4    Alabama     Baldwin  2025-01-04   31      Good              PM2.5   
4   5    Alabama     Baldwin  2025-01-05   31      Good              PM2.5   

  defining_site  sites_reporting  
0   01-003-0010                1  
1   01-003-0010                1  
2   01-003-0010                1  
3   01-003-0010                1  
4   01-003-0010                1  


In [63]:
# Check to see if database file is accurate
conn = sqlite3.connect("combined_datasets.db")

tables = pd.read_sql("""
SELECT name
FROM sqlite_master
WHERE type='table';
""", conn)

print(tables, "\n")

cdc_test = pd.read_sql("""
SELECT *
FROM cdc_data
LIMIT 2
""", conn)

print(cdc_test, "\n")

epa_test = pd.read_sql("""
SELECT *
FROM epa_data
LIMIT 2
""", conn)

print(epa_test)

conn.close()

              name
0  sqlite_sequence
1         epa_data
2     cdc_raw_data
3         cdc_data 

         week state_name
0  2025-10-04         AK
1  2025-10-11         AK 

   id state_name county_name        date  aqi category defining_parameter  \
0   1    Alabama     Baldwin  2025-01-01   20     Good              PM2.5   
1   2    Alabama     Baldwin  2025-01-02   37     Good              PM2.5   

  defining_site  sites_reporting  
0   01-003-0010                1  
1   01-003-0010                1  


In [None]:
# Data Exploration
# todo