In [2]:
import sqlite3
import pandas as pd
import numpy as np
from IPython.display import display
import glob

In [10]:
# Get the path to CDC raw data
CDC_PATH = "../data/raw/CDC/cdc_raw.csv"

# Grab the raw data from the .csv file with low_memory=False because of the large size
cdc_raw = pd.read_csv(CDC_PATH, low_memory=False)

# Only grab the relevant columns we need
cdc_m1 = cdc_raw[[
    "Week Ending Date",
    "Geographic aggregation",
    "Total COVID-19 Admissions",
    "Total Influenza Admissions",
    "Total RSV Admissions"
]].copy()

# Rename the columns
cdc_m1 = cdc_m1.rename(columns={
    "Week Ending Date": "week",
    "Geographic aggregation": "state",
    "Total COVID-19 Admissions": "covid_admissions",
    "Total Influenza Admissions": "influenza_admissions",
    "Total RSV Admissions": "rsv_admissions"
})

# Update the week column from str to datetime
cdc_m1["week"] = pd.to_datetime(cdc_m1["week"])

# Change the admission cols to numeric dtype
for c in ["covid_admissions", "influenza_admissions", "rsv_admissions"]:
    cdc_m1[c] = pd.to_numeric(cdc_m1[c], errors="coerce")
    
# Create a new column for the total respiratory-related hospitalizations
cdc_m1["total_admissions"] = (
    cdc_m1["covid_admissions"].fillna(0)
    + cdc_m1["influenza_admissions"].fillna(0)
    + cdc_m1["rsv_admissions"].fillna(0)
)

cdc_m1.to_csv("../data/processed/cdc_m1.csv", index=False)

cdc_m1.head()    

Unnamed: 0,week,state,covid_admissions,influenza_admissions,rsv_admissions,total_admissions
0,2025-10-04,AK,5.0,2.0,1.0,8.0
1,2025-10-11,AK,4.0,0.0,0.0,4.0
2,2025-10-18,AK,5.0,0.0,1.0,6.0
3,2025-10-25,AK,2.0,1.0,0.0,3.0
4,2025-11-01,AK,10.0,0.0,3.0,13.0


In [8]:
import glob
import pandas as pd

# Gather all the aqi daily .csv files together
aqi_files = glob.glob("../data/raw/AQI/*.csv")
if not aqi_files:
    raise FileNotFoundError("No AQI daily .csv files found in the data directory.")
# Concat the aqi_files together
aqi_daily_raw = pd.concat((pd.read_csv(f) for f in aqi_files), ignore_index=True)

# Show the aqi raw cols, MIGHT DELETE THIS LATER
aqi_daily_raw.columns[:30]

Index(['State Name', 'county Name', 'State Code', 'County Code', 'Date', 'AQI',
       'Category', 'Defining Parameter', 'Defining Site',
       'Number of Sites Reporting'],
      dtype='str')

In [14]:
# Copy only the relevant columns
aqi_daily = aqi_daily_raw[[
    "Date",
    "State Name",
    "AQI"
]].copy()

# Rename the columns for consistency
aqi_daily = aqi_daily.rename(columns={
    "Date": "date",
    "State Name": "state",
    "AQI": "aqi"
})

# Update the date col to datetime dtype
aqi_daily["date"] = pd.to_datetime(aqi_daily["date"])
aqi_daily["aqi"] = pd.to_numeric(aqi_daily["aqi"], errors="coerce")

# Create week-ending dates that end on Saturday
aqi_daily["week"] = aqi_daily["date"].dt.to_period("W-SAT").apply(lambda r: r.end_time)

# Aggregate daily county AQI to weekly state AQI summaries
aqi_m1 = (
    aqi_daily.groupby(["state", "week"], as_index=False).agg(
        aqi_mean=("aqi", "mean"),
        aqi_p90=("aqi", lambda s: s.quantile(0.90)),
        aqi_max=("aqi", "max"),
        days_reported=("aqi", "count")
    )
)

# Save the processed data to the following directory and file
aqi_m1.to_csv("../data/processed/aqi_m1.csv", index=False)
aqi_m1.head()


Unnamed: 0,state,week,aqi_mean,aqi_p90,aqi_max,days_reported
0,Alabama,2021-01-02 23:59:59.999999,33.285714,53.0,55,21
1,Alabama,2021-01-09 23:59:59.999999,42.246154,56.0,63,65
2,Alabama,2021-01-16 23:59:59.999999,45.522388,61.0,72,67
3,Alabama,2021-01-23 23:59:59.999999,39.5,59.2,67,60
4,Alabama,2021-01-30 23:59:59.999999,36.457627,56.6,67,59


In [20]:
US_STATE_ABBR = {
    "Alabama":"AL","Alaska":"AK","Arizona":"AZ","Arkansas":"AR","California":"CA","Colorado":"CO",
    "Connecticut":"CT","Delaware":"DE","District of Columbia":"DC","Florida":"FL","Georgia":"GA",
    "Hawaii":"HI","Idaho":"ID","Illinois":"IL","Indiana":"IN","Iowa":"IA","Kansas":"KS","Kentucky":"KY",
    "Louisiana":"LA","Maine":"ME","Maryland":"MD","Massachusetts":"MA","Michigan":"MI","Minnesota":"MN",
    "Mississippi":"MS","Missouri":"MO","Montana":"MT","Nebraska":"NE","Nevada":"NV","New Hampshire":"NH",
    "New Jersey":"NJ","New Mexico":"NM","New York":"NY","North Carolina":"NC","North Dakota":"ND","Ohio":"OH",
    "Oklahoma":"OK","Oregon":"OR","Pennsylvania":"PA","Rhode Island":"RI","South Carolina":"SC","South Dakota":"SD",
    "Tennessee":"TN","Texas":"TX","Utah":"UT","Vermont":"VT","Virginia":"VA","Washington":"WA",
    "West Virginia":"WV","Wisconsin":"WI","Wyoming":"WY",
    # Territories (include only if present in both datasets)
    "Puerto Rico":"PR","Guam":"GU","Virgin Islands":"VI","American Samoa":"AS","Northern Mariana Islands":"MP"
}

# Strip the EPA names and map them to their abbreviated versions (CDC dataset uses state abbreviations)

aqi_m1["state"] = aqi_m1["state"].astype(str).str.strip()
aqi_m1["state"] = aqi_m1["state"].map(US_STATE_ABBR)

In [23]:
import pandas as pd

cdc_m1 = pd.read_csv("../data/processed/cdc_m1.csv")
cdc_m1["week"] = pd.to_datetime(cdc_m1["week"])
cdc_m1["state"] = cdc_m1["state"].astype(str).str.strip().str.upper()

aqi_m1 = pd.read_csv("../data/processed/aqi_m1.csv")
aqi_m1["week"] = pd.to_datetime(aqi_m1["week"])

In [24]:
aqi_m1["state"].dropna().astype(str).str.strip().unique()[:15]

<StringArray>
[             'Alabama',               'Alaska',              'Arizona',
             'Arkansas',           'California',             'Colorado',
          'Connecticut',    'Country Of Mexico',             'Delaware',
 'District Of Columbia',              'Florida',              'Georgia',
               'Hawaii',                'Idaho',             'Illinois']
Length: 15, dtype: str

In [21]:
aqi_m1["state"].isna().sum(), aqi_m1["state"].unique()[:20]

(np.int64(13329),
 <StringArray>
 [nan]
 Length: 1, dtype: str)

In [26]:
US_STATE_ABBR = {
    "Alabama":"AL","Alaska":"AK","Arizona":"AZ","Arkansas":"AR","California":"CA","Colorado":"CO",
    "Connecticut":"CT","Delaware":"DE","District of Columbia":"DC","Florida":"FL","Georgia":"GA",
    "Hawaii":"HI","Idaho":"ID","Illinois":"IL","Indiana":"IN","Iowa":"IA","Kansas":"KS","Kentucky":"KY",
    "Louisiana":"LA","Maine":"ME","Maryland":"MD","Massachusetts":"MA","Michigan":"MI","Minnesota":"MN",
    "Mississippi":"MS","Missouri":"MO","Montana":"MT","Nebraska":"NE","Nevada":"NV","New Hampshire":"NH",
    "New Jersey":"NJ","New Mexico":"NM","New York":"NY","North Carolina":"NC","North Dakota":"ND","Ohio":"OH",
    "Oklahoma":"OK","Oregon":"OR","Pennsylvania":"PA","Rhode Island":"RI","South Carolina":"SC","South Dakota":"SD",
    "Tennessee":"TN","Texas":"TX","Utah":"UT","Vermont":"VT","Virginia":"VA","Washington":"WA",
    "West Virginia":"WV","Wisconsin":"WI","Wyoming":"WY",
    # Territories (include only if present in both datasets)
    "Puerto Rico":"PR","Guam":"GU","Virgin Islands":"VI","American Samoa":"AS","Northern Mariana Islands":"MP"
}

# Strip the EPA names and map them to their abbreviated versions (CDC dataset uses state abbreviations)

aqi_m1["state_name"] = aqi_m1["state"].astype(str).str.strip()
aqi_m1["state_abbr"] = aqi_m1["state_name"].map(US_STATE_ABBR)

In [28]:
print("AQI state_abbr NaNs:", aqi_m1["state_abbr"].isna().sum())
print("Example AQI states:", aqi_m1["state_name"].dropna().unique()[:10])

print("Example CDC states:", cdc_m1["state"].dropna().unique()[:10])

AQI state_abbr NaNs: 13329
Example AQI states: <StringArray>
['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA']
Length: 10, dtype: str
Example CDC states: <StringArray>
['AK', 'AL', 'AR', 'AS', 'AZ', 'CO', 'FL', 'HI', 'ID', 'MA']
Length: 10, dtype: str


In [30]:
aqi_m1.loc[aqi_m1["state_name"].isna(), "state"].dropna().unique()[:30]


<StringArray>
[]
Length: 0, dtype: str

In [31]:
aqi_m1["state_abbr"][:10]

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
Name: state_abbr, dtype: str

In [32]:
aqi_m1["state_abbr"] = (
    aqi_m1["state_name"]
    .astype(str)
    .str.strip()
    .str.upper()
)

aqi_m1["state_abbr"].isna().sum(), aqi_m1["state_abbr"].unique()[:10]

(np.int64(485),
 <StringArray>
 ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', nan, 'DE', 'FL']
 Length: 10, dtype: str)

In [34]:
aqi_m1.head()

Unnamed: 0,state,week,aqi_mean,aqi_p90,aqi_max,days_reported,state_name,state_abbr
0,AL,2021-01-02 23:59:59.999999,33.285714,53.0,55,21,AL,AL
1,AL,2021-01-09 23:59:59.999999,42.246154,56.0,63,65,AL,AL
2,AL,2021-01-16 23:59:59.999999,45.522388,61.0,72,67,AL,AL
3,AL,2021-01-23 23:59:59.999999,39.5,59.2,67,60,AL,AL
4,AL,2021-01-30 23:59:59.999999,36.457627,56.6,67,59,AL,AL


In [36]:
import pandas as pd

# CDC keys
cdc_m1["state_key"] = cdc_m1["state"].astype(str).str.strip().str.upper()
cdc_m1["week_key"] = pd.to_datetime(cdc_m1["week"]).dt.normalize()  # sets time to 00:00:00

# AQI keys
aqi_m1["state_key"] = aqi_m1["state"].astype(str).str.strip().str.upper()

# This is the critical line:
aqi_m1["week_key"] = pd.to_datetime(aqi_m1["week"]).dt.normalize()

In [37]:
states_overlap = set(cdc_m1["state_key"]).intersection(set(aqi_m1["state_key"]))
len(states_overlap), list(sorted(states_overlap))[:10]

(52, ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA'])

In [38]:
weeks_overlap = set(cdc_m1["week_key"]).intersection(set(aqi_m1["week_key"]))
len(weeks_overlap), sorted(list(weeks_overlap))[:5]

(255,
 [Timestamp('2021-01-02 00:00:00'),
  Timestamp('2021-01-09 00:00:00'),
  Timestamp('2021-01-16 00:00:00'),
  Timestamp('2021-01-23 00:00:00'),
  Timestamp('2021-01-30 00:00:00')])

In [40]:
merged_m1 = pd.merge(
    cdc_m1,
    aqi_m1,
    on=["state_key", "week_key"],
    how="inner"
)

merged_m1.shape, merged_m1.head()

((12844, 16),
       week_x state_x  covid_admissions  influenza_admissions  rsv_admissions  \
 0 2025-10-04      AK               5.0                   2.0             1.0   
 1 2021-01-02      AR               NaN                  20.0             NaN   
 2 2021-01-09      AR               NaN                  20.0             NaN   
 3 2021-01-16      AR               NaN                  17.0             NaN   
 4 2021-01-23      AR               NaN                  13.0             NaN   
 
    total_admissions state_key   week_key state_y                     week_y  \
 0               8.0        AK 2025-10-04      AK 2025-10-04 23:59:59.999999   
 1              20.0        AR 2021-01-02      AR 2021-01-02 23:59:59.999999   
 2              20.0        AR 2021-01-09      AR 2021-01-09 23:59:59.999999   
 3              17.0        AR 2021-01-16      AR 2021-01-16 23:59:59.999999   
 4              13.0        AR 2021-01-23      AR 2021-01-23 23:59:59.999999   
 
     aqi_mean  a

In [41]:
merged_m1["state"] = merged_m1["state_key"].astype(str).str.strip().str.upper()
merged_m1["week"] = pd.to_datetime(merged_m1["week_key"]).dt.normalize()

In [42]:
merged_m1 = merged_m1.rename(columns={"total_admissions": "total_respiratory_admissions"})

In [43]:
merged_m1 = merged_m1[[
    "state", "week",
    "covid_admissions", "influenza_admissions", "rsv_admissions",
    "total_respiratory_admissions",
    "aqi_mean", "aqi_p90", "aqi_max", "days_reported"
]].copy()

merged_m1.head()

Unnamed: 0,state,week,covid_admissions,influenza_admissions,rsv_admissions,total_respiratory_admissions,aqi_mean,aqi_p90,aqi_max,days_reported
0,AK,2025-10-04,5.0,2.0,1.0,8.0,20.0,24.4,26,3
1,AR,2021-01-02,,20.0,,20.0,21.705882,31.2,35,17
2,AR,2021-01-09,,20.0,,20.0,34.078431,42.0,50,51
3,AR,2021-01-16,,17.0,,17.0,42.035088,57.0,73,57
4,AR,2021-01-23,,13.0,,13.0,35.115385,46.9,55,52


In [44]:
cols_to_drop = [
    "week_x", "state_x", "state_key", "week_key",
    "state_y", "week_y", "state_name", "state_abbr"
]
merged_m1_clean = merged_m1.drop(columns=[c for c in cols_to_drop if c in merged_m1.columns])


In [46]:
cdc_m1 = cdc_m1.rename(columns={"total_admissions": "total_respiratory_admissions"})
cdc_m1.to_csv("../data/processed/cdc_m1.csv", index=False)

In [48]:
aqi_m1["week"] = pd.to_datetime(aqi_m1["week"]).dt.normalize()
aqi_m1["state"] = aqi_m1["state"].astype(str).str.strip().str.upper()
aqi_m1.to_csv("../data/processed/aqi_m1.csv", index=False)

In [50]:
merged_m1.to_csv("../data/processed/merged_m1.csv", index=False)

In [51]:
merged_m1.duplicated(subset=["state", "week"]).sum()

np.int64(0)

In [52]:
merged_m1["week"].head()

0   2025-10-04
1   2021-01-02
2   2021-01-09
3   2021-01-16
4   2021-01-23
Name: week, dtype: datetime64[us]

In [53]:
aqi_m1.head()

Unnamed: 0,state,week,aqi_mean,aqi_p90,aqi_max,days_reported,state_name,state_abbr,state_key,week_key
0,AL,2021-01-02,33.285714,53.0,55,21,AL,AL,AL,2021-01-02
1,AL,2021-01-09,42.246154,56.0,63,65,AL,AL,AL,2021-01-09
2,AL,2021-01-16,45.522388,61.0,72,67,AL,AL,AL,2021-01-16
3,AL,2021-01-23,39.5,59.2,67,60,AL,AL,AL,2021-01-23
4,AL,2021-01-30,36.457627,56.6,67,59,AL,AL,AL,2021-01-30


In [54]:
merged_m1.head(), merged_m1.shape

(  state       week  covid_admissions  influenza_admissions  rsv_admissions  \
 0    AK 2025-10-04               5.0                   2.0             1.0   
 1    AR 2021-01-02               NaN                  20.0             NaN   
 2    AR 2021-01-09               NaN                  20.0             NaN   
 3    AR 2021-01-16               NaN                  17.0             NaN   
 4    AR 2021-01-23               NaN                  13.0             NaN   
 
    total_respiratory_admissions   aqi_mean  aqi_p90  aqi_max  days_reported  
 0                           8.0  20.000000     24.4       26              3  
 1                          20.0  21.705882     31.2       35             17  
 2                          20.0  34.078431     42.0       50             51  
 3                          17.0  42.035088     57.0       73             57  
 4                          13.0  35.115385     46.9       55             52  ,
 (12844, 10))

In [None]:
import sqlite3

conn = sqlite3.connect("data/processed/milestone1.db")

cdc_m1.to_sql("cdc_m1", conn, if_exists="replace", index=False)
aqi_m1.to_sql("aqi_m1", conn, if_exists="replace", index=False)
merged_m1.to_sql("merged_m1", conn, if_exists="replace", index=False)

conn.close()

In [0]:
# Data Acquisition
# Dataset #1: Respiratory-related hospitalizations data - CDC
# Dataset #2: Air Quality Index Data - EPA
# Both datasets are publicly available and downloadable as CSV files

# Since cdc dataset is so large, turn off low memory to ensure it loads correctly
df_cdc = pd.DataFrame(pd.read_csv('CDC-Hospitalizations.csv', low_memory=False))
df_epa = pd.DataFrame(pd.read_csv('EPA-AQI-County-2025.csv'))

print("Respiratory-related Hospitalizations Dataset - CDC")
display(df_cdc.head(5))
print("\nAir Quality Index Dataset - EPA\n")
display(df_epa.head(5))

In [None]:
# Data Acquisition II
conn = sqlite3.connect('combined_datasets.db')
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS cdc_data_raw")
cursor.execute("DROP TABLE IF EXISTS cdc_data")
cursor.execute("DROP TABLE IF EXISTS aqi_data")

# Adding CDC dataset to the database
# Since dataset is so large, manual schema will be d
df_cdc.to_sql('cdc_raw_data', conn, if_exists='replace', index=False)
# print(pd.read_sql("SELECT * FROM cdc_raw_data LIMIT 5", conn))

# Just a test to consolidate data from main cdc table to small, cleaner format
cursor.execute('''
CREATE TABLE IF NOT EXISTS cdc_data (
    week TEXT,
    state_name TEXT
)
''')
conn.commit()

cursor.execute('''
INSERT INTO cdc_data (week, state_name)
SELECT
    "Week Ending Date" as week,
    "Geographic aggregation" as state_name
FROM cdc_raw_data
''')
conn.commit()
print(pd.read_sql("SELECT * FROM cdc_data LIMIT 5", conn))

# Adding EPA dataset to the database w/ manual schema
cursor.execute('''
CREATE TABLE IF NOT EXISTS epa_data (
    state_name TEXT NOT NULL,
    county_name TEXT NOT NULL,
    date TEXT,
    aqi INTEGER,
    category TEXT,
    defining_parameter TEXT,
    defining_site TEXT,
    sites_reporting INTEGER
)
''')
conn.commit()

df_epa_filtered = df_epa[['State Name', 'county Name', 'Date', 'AQI', 'Category', 'Defining Parameter', 'Defining Site', 'Number of Sites Reporting']]
df_epa_filtered.columns = ['state_name', 'county_name', 'date', 'aqi', 'category', 'defining_parameter', 'defining_site', 'sites_reporting']
df_epa_filtered.to_sql('epa_data', conn, if_exists='append', index=False)
print(pd.read_sql("SELECT * FROM epa_data LIMIT 5", conn))

conn.close()

In [None]:
# Check to see if database file is accurate
conn = sqlite3.connect("combined_datasets.db")

tables = pd.read_sql("""
SELECT name
FROM sqlite_master
WHERE type='table';
""", conn)

print(tables, "\n")

cdc_test = pd.read_sql("""
SELECT *
FROM cdc_data
LIMIT 2
""", conn)

print(cdc_test, "\n")

epa_test = pd.read_sql("""
SELECT *
FROM epa_data
LIMIT 2
""", conn)

print(epa_test)

conn.close()

In [None]:
# Data Exploration
# todo