# ETL Missing Dates Investigation

This notebook investigates missing date entries for specific states (e.g., Colorado) in both the original cleaned dataset and the new enriched dataset with coordinates.

In [1]:
import pandas as pd

# Dataset locations, relative to this notebook's directory (edit to suit your checkout).
# NOTE(review): both files share the "cleaned_enriched" stem, and the recorded run shows
# identical shapes and columns for the two frames -- confirm that cleaned_path really
# points at the pre-enrichment dataset.
enriched_path = '../data/cleaned_enriched.parquet'
# load_data() reads this one with pd.read_csv, so pointing it at a .parquet copy
# also means changing the reader there.
cleaned_path = '../data/cleaned_enriched.csv.zip'

# Read both datasets from disk
def load_data():
    """Load the two datasets being compared.

    Reads the CSV at the module-level ``cleaned_path`` and the Parquet file
    at the module-level ``enriched_path``.

    Returns:
        tuple: ``(cleaned_df, enriched_df)``, in that order.
    """
    return pd.read_csv(cleaned_path), pd.read_parquet(enriched_path)

# Load once, then report each frame's shape as a quick sanity check
cleaned_df, enriched_df = load_data()
print(
    f"Cleaned shape: {cleaned_df.shape}",
    f"Enriched shape: {enriched_df.shape}",
    sep="\n",
)

Cleaned shape: (412856, 24)
Enriched shape: (412856, 24)


In [2]:
# 'State' values to check for missing dates (abbreviation and full name)
states_to_check = ['CO', 'Colorado']

# Function to check missing dates for a given DataFrame and label
def check_missing_dates(df, label, states=None, state_name='Colorado'):
    """Report rows with a null 'Date Local' for the selected state(s).

    Prints a header, then the number of rows whose 'State' value is in
    ``states`` and how many of those rows have a null 'Date Local'. If any
    such rows exist, the first few are displayed; otherwise a
    "No missing dates found." line is printed instead of an empty preview.

    Args:
        df: DataFrame to inspect. Must contain 'State' and 'Date Local'
            columns, otherwise 'Required columns not found.' is printed.
        label: Dataset name used in the printed header.
        states: List-like of 'State' values to match. Defaults to the
            module-level ``states_to_check``.
        state_name: Human-readable name used in the summary line.

    Returns:
        None. All results are printed or displayed.
    """
    print(f"\n--- {label} Dataset ---")
    if 'State' not in df.columns or 'Date Local' not in df.columns:
        print('Required columns not found.')
        return

    wanted_states = states_to_check if states is None else states
    state_mask = df['State'].isin(wanted_states)
    # One combined mask serves both the count and the preview, avoiding the
    # chained df[mask]['col'] lookup.
    missing_mask = state_mask & df['Date Local'].isnull()
    total_rows = int(state_mask.sum())
    missing_dates = int(missing_mask.sum())
    print(f"Rows for {state_name}: {total_rows}, Missing Dates: {missing_dates}")

    if missing_dates:
        # `display` is not imported in this notebook; it relies on the name
        # being available in the IPython/Jupyter session.
        display(df[missing_mask].head())
    else:
        # Skip the preview when there is nothing to show.
        print("No missing dates found.")

# Run the Colorado check on each dataset in turn
for dataset, dataset_label in ((cleaned_df, 'Cleaned'), (enriched_df, 'Enriched')):
    check_missing_dates(dataset, dataset_label)


--- Cleaned Dataset ---
Rows for Colorado: 8778, Missing Dates: 0


Unnamed: 0,State,County,City,Date Local,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,...,CO 1st Max Value,CO 1st Max Hour,CO AQI,state_fips_x,lat_city,lon_city,coord_source_city,state_fips_y,population_state,population_city



--- Enriched Dataset ---
Rows for Colorado: 8778, Missing Dates: 0


Unnamed: 0,State,County,City,Date Local,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,...,CO 1st Max Value,CO 1st Max Hour,CO AQI,state_fips_x,lat_city,lon_city,coord_source_city,state_fips_y,population_state,population_city


## Summary of Findings

- This notebook loads both the cleaned and enriched datasets.
- It checks for missing date entries for Colorado, matching either the `CO` abbreviation or the full state name in the `State` column.
- Results show the number of rows and missing dates for each dataset, plus a preview of any affected rows. In the recorded run, both datasets have 8,778 Colorado rows and 0 missing dates.
- Use this to identify if missing dates are present in the original or only in the enriched dataset.

In [3]:
# Check for missing dates in any entry (not just Colorado)
def check_missing_dates_any(df, label):
    """Report rows with a null 'Date Local' anywhere in the dataset.

    Prints a header and the total number of null 'Date Local' values. If
    there are any, the first few affected rows are displayed; otherwise
    "No missing dates found." is printed.

    Args:
        df: DataFrame to inspect. Must contain a 'Date Local' column,
            otherwise 'Required columns not found.' is printed.
        label: Dataset name used in the printed header.

    Returns:
        None. All results are printed or displayed.
    """
    print(f"\n--- {label} Dataset: Missing Dates ---")
    if 'Date Local' not in df.columns:
        # An absent column must not be reported as "No missing dates found.";
        # use the same message check_missing_dates prints for this case.
        print('Required columns not found.')
        return

    missing_mask = df['Date Local'].isnull()
    missing_dates_count = int(missing_mask.sum())
    print(f"Total missing dates: {missing_dates_count}")
    if missing_dates_count:
        # `display` is not imported in this notebook; it relies on the name
        # being available in the IPython/Jupyter session.
        display(df[missing_mask].head())
    else:
        print("No missing dates found.")

# Run the dataset-wide check on each dataset in turn
for dataset, dataset_label in ((cleaned_df, 'Cleaned'), (enriched_df, 'Enriched')):
    check_missing_dates_any(dataset, dataset_label)


--- Cleaned Dataset: Missing Dates ---
Total missing dates: 0
No missing dates found.

--- Enriched Dataset: Missing Dates ---
Total missing dates: 0
No missing dates found.
