In [1]:
import sqlite3
import pandas as pd
import numpy as np
from IPython.display import display
import glob

In [None]:
# Data Acquisition
# Dataset #1: Respiratory-related hospitalizations data - CDC
# Dataset #2: Air Quality Index Data - EPA
# Both datasets are publicly available and downloadable as CSV files

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
# low_memory=False makes pandas parse the large CDC file in a single pass instead of
# in internal chunks, which avoids mixed-dtype inference within a column.
df_cdc = pd.read_csv('CDC-Hospitalizations.csv', low_memory=False)
df_epa = pd.read_csv('EPA-AQI-County-2025.csv')

# Preview the first rows of each dataset to sanity-check the load.
print("Respiratory-related Hospitalizations Dataset - CDC")
display(df_cdc.head(5))
print("\nAir Quality Index Dataset - EPA\n")
display(df_epa.head(5))

In [None]:
# Data Acquisition II
# Load both DataFrames (df_cdc, df_epa) into a local SQLite database file.
conn = sqlite3.connect('combined_datasets.db')
try:
    cursor = conn.cursor()

    # Start from a clean slate so re-running this cell never duplicates rows.
    # cdc_raw_data / cdc_data / epa_data are the tables this cell creates below.
    # cdc_data_raw / aqi_data are names this cell has always dropped (presumably
    # left over from an earlier version of the notebook); kept for compatibility.
    for table_name in ('cdc_raw_data', 'cdc_data', 'epa_data', 'cdc_data_raw', 'aqi_data'):
        cursor.execute(f'DROP TABLE IF EXISTS "{table_name}"')
    conn.commit()

    # Adding CDC dataset to the database
    # The dataset is large, so the raw table's schema is inferred by pandas (to_sql)
    # rather than written by hand.
    df_cdc.to_sql('cdc_raw_data', conn, if_exists='replace', index=False)

    # Just a test to consolidate data from main cdc table to small, cleaner format
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS cdc_data (
        week TEXT,
        state_name TEXT
    )
    ''')
    conn.commit()

    cursor.execute('''
    INSERT INTO cdc_data (week, state_name)
    SELECT
        "Week Ending Date" as week,
        "Geographic aggregation" as state_name
    FROM cdc_raw_data
    ''')
    conn.commit()
    print(pd.read_sql("SELECT * FROM cdc_data LIMIT 5", conn))

    # Adding EPA dataset to the database w/ manual schema
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS epa_data (
        state_name TEXT NOT NULL,
        county_name TEXT NOT NULL,
        date TEXT,
        aqi INTEGER,
        category TEXT,
        defining_parameter TEXT,
        defining_site TEXT,
        sites_reporting INTEGER
    )
    ''')
    conn.commit()

    # Map each source CSV column to its database column name. Renaming by name
    # (instead of assigning a positional list to .columns) keeps the pairing
    # explicit and cannot silently mislabel a column if the order changes.
    # NOTE(review): 'county Name' is lower-case here — confirm it matches the CSV header.
    epa_column_map = {
        'State Name': 'state_name',
        'county Name': 'county_name',
        'Date': 'date',
        'AQI': 'aqi',
        'Category': 'category',
        'Defining Parameter': 'defining_parameter',
        'Defining Site': 'defining_site',
        'Number of Sites Reporting': 'sites_reporting',
    }
    df_epa_filtered = df_epa[list(epa_column_map)].rename(columns=epa_column_map)

    # 'append' keeps the hand-written schema above; the table was dropped at the
    # top of the cell, so appending does not accumulate duplicates across runs.
    df_epa_filtered.to_sql('epa_data', conn, if_exists='append', index=False)
    print(pd.read_sql("SELECT * FROM epa_data LIMIT 5", conn))
finally:
    # Always release the database file, even if a statement above failed.
    conn.close()

In [None]:
# Check to see if database file is accurate:
# list the tables it contains and preview two rows from each cleaned table.
conn = sqlite3.connect("combined_datasets.db")
try:
    # sqlite_master is SQLite's catalog; type='table' filters out indexes and views.
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables, "\n")

    cdc_test = pd.read_sql("SELECT * FROM cdc_data LIMIT 2", conn)
    print(cdc_test, "\n")

    epa_test = pd.read_sql("SELECT * FROM epa_data LIMIT 2", conn)
    print(epa_test)
finally:
    # Close the connection even when a query fails (e.g. a table is missing),
    # so a failed check does not leave the database file open.
    conn.close()

In [None]:
# Data Exploration
# TODO: data exploration has not been implemented yet