In [1]:
from helper_functions import get_hourly_weather_data, get_hourly_aqi_data, hourly_to_daily, get_city_state
import pandas as pd

In [3]:
# Manhattan: for every daily weather / air-quality variable, compare how much it
# varies over time at a single site with how much it varies across sites on the
# same day.

locations = [
    (40.7244, -73.9981, "Soho"),
    (40.7824, -73.9679, "Central Park"),
    (40.8360, -73.9411, "Washington Heights"),
]

weather_dfs, aqi_dfs = [], []

for lat, lon, label in locations:
    print(f"Processing {label} Manhattan...")

    # Fetch both hourly series for this coordinate before converting anything.
    hourly_weather = get_hourly_weather_data(lat, lon)
    hourly_aqi = get_hourly_aqi_data(lat, lon)

    # Convert to daily, then stamp each frame with the site name.
    daily_weather = hourly_to_daily(hourly_weather)
    daily_aqi = hourly_to_daily(hourly_aqi)
    daily_weather['location'] = label
    daily_aqi['location'] = label

    weather_dfs.append(daily_weather)
    aqi_dfs.append(daily_aqi)

weather_combined = pd.concat(weather_dfs, ignore_index=True)
aqi_combined = pd.concat(aqi_dfs, ignore_index=True)

# Columns that identify a row instead of carrying a measurement.
non_metric_cols = ('date', 'location', 'city', 'state')
variance_cols = ['within_location_variance', 'between_location_variance']

# --- Weather: one record per (variable, location) ---
weather_vars = [name for name in weather_combined.columns if name not in non_metric_cols]
weather_summary = []

for var in weather_vars:
    # Variance of the variable over time, computed separately for each site.
    per_site_variance = weather_combined.groupby('location')[var].var()

    # Variance across sites for each date, averaged over all dates.
    by_date = weather_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    weather_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

weather_summary_df = pd.DataFrame(weather_summary)

# --- AQI: same computation on the air-quality frame ---
aqi_vars = [name for name in aqi_combined.columns if name not in non_metric_cols]
aqi_summary = []

for var in aqi_vars:
    per_site_variance = aqi_combined.groupby('location')[var].var()
    by_date = aqi_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    aqi_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

aqi_summary_df = pd.DataFrame(aqi_summary)

# --- Persist the per-location summaries (weather first, then AQI) ---
for summary_frame, csv_name in (
    (weather_summary_df, "weather_variance_summary_Manhattan.csv"),
    (aqi_summary_df, "aqi_variance_summary_Manhattan.csv"),
):
    summary_frame.to_csv(csv_name, index=False, float_format="%.15f")

# A variable whose between/within variance ratio exceeds this is reported as
# needing averaging across sites.
SPATIAL_RATIO_THRESHOLD = 0.5

# Reduce each summary to one row per variable (mean over locations) and add the ratio.
aqi_collapsed = aqi_summary_df.groupby('variable')[variance_cols].mean()
aqi_collapsed['spatial_ratio'] = aqi_collapsed['between_location_variance'].div(
    aqi_collapsed['within_location_variance']
)

weather_collapsed = weather_summary_df.groupby('variable')[variance_cols].mean()
weather_collapsed['spatial_ratio'] = weather_collapsed['between_location_variance'].div(
    weather_collapsed['within_location_variance']
)

# Report the variables whose ratio is above the threshold.
aqi_over_threshold = aqi_collapsed.loc[aqi_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
aqi_over_threshold_count = len(aqi_over_threshold)
print(f"Number of AQI variables needing averaging: {aqi_over_threshold_count} out of {len(aqi_collapsed)}")
print(aqi_over_threshold.index.tolist())

weather_over_threshold = weather_collapsed.loc[weather_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
weather_over_threshold_count = len(weather_over_threshold)
print(f"Number of weather variables needing averaging: {weather_over_threshold_count} out of {len(weather_collapsed)}")
print(weather_over_threshold.index.tolist())


Processing Soho Manhattan...
Processing Central Park Manhattan...
Processing Washington Heights Manhattan...
Number of AQI variables needing averaging: 0 out of 51
[]
Number of weather variables needing averaging: 2 out of 54
['precipitation_min', 'rain_min']


In [4]:
# NYC: for every daily weather / air-quality variable, compare how much it varies
# over time at a single site with how much it varies across sites on the same day.
#
# Fixes relative to the earlier version of this cell:
#   * the progress message said "Long Island" (copy-paste from another cell);
#   * the label "Stanten Island" was misspelled, and that label is written into
#     the `location` column of both CSV files.

locations = [
    (40.6394, -73.9576, "Kings"),
    (40.7615, -73.9821, "NYC"),
    (40.7113, -73.7983, "Queens"),
    (40.5863, -74.1137, "Staten Island"),
    (40.8505, -73.8567, "Bronx"),
]

weather_dfs, aqi_dfs = [], []

for lat, lon, label in locations:
    print(f"Processing {label} NYC...")

    # Fetch both hourly series for this coordinate before converting anything.
    hourly_weather = get_hourly_weather_data(lat, lon)
    hourly_aqi = get_hourly_aqi_data(lat, lon)

    # Convert to daily, then stamp each frame with the site name.
    daily_weather = hourly_to_daily(hourly_weather)
    daily_aqi = hourly_to_daily(hourly_aqi)
    daily_weather['location'] = label
    daily_aqi['location'] = label

    weather_dfs.append(daily_weather)
    aqi_dfs.append(daily_aqi)

weather_combined = pd.concat(weather_dfs, ignore_index=True)
aqi_combined = pd.concat(aqi_dfs, ignore_index=True)

# Columns that identify a row instead of carrying a measurement.
non_metric_cols = ('date', 'location', 'city', 'state')
variance_cols = ['within_location_variance', 'between_location_variance']

# --- Weather: one record per (variable, location) ---
weather_vars = [name for name in weather_combined.columns if name not in non_metric_cols]
weather_summary = []

for var in weather_vars:
    # Variance of the variable over time, computed separately for each site.
    per_site_variance = weather_combined.groupby('location')[var].var()

    # Variance across sites for each date, averaged over all dates.
    # NOTE: pivot() raises if a (date, location) pair appears more than once.
    by_date = weather_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    weather_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

weather_summary_df = pd.DataFrame(weather_summary)

# --- AQI: same computation on the air-quality frame ---
aqi_vars = [name for name in aqi_combined.columns if name not in non_metric_cols]
aqi_summary = []

for var in aqi_vars:
    per_site_variance = aqi_combined.groupby('location')[var].var()
    by_date = aqi_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    aqi_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

aqi_summary_df = pd.DataFrame(aqi_summary)

# --- Persist the per-location summaries (weather first, then AQI) ---
for summary_frame, csv_name in (
    (weather_summary_df, "weather_variance_summary_NYC.csv"),
    (aqi_summary_df, "aqi_variance_summary_NYC.csv"),
):
    summary_frame.to_csv(csv_name, index=False, float_format="%.15f")

# A variable whose between/within variance ratio exceeds this is reported as
# needing averaging across sites.
SPATIAL_RATIO_THRESHOLD = 0.5

# Reduce each summary to one row per variable (mean over locations) and add the ratio.
aqi_collapsed = aqi_summary_df.groupby('variable')[variance_cols].mean()
aqi_collapsed['spatial_ratio'] = aqi_collapsed['between_location_variance'].div(
    aqi_collapsed['within_location_variance']
)

weather_collapsed = weather_summary_df.groupby('variable')[variance_cols].mean()
weather_collapsed['spatial_ratio'] = weather_collapsed['between_location_variance'].div(
    weather_collapsed['within_location_variance']
)

# Report the variables whose ratio is above the threshold.
aqi_over_threshold = aqi_collapsed.loc[aqi_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
aqi_over_threshold_count = len(aqi_over_threshold)
print(f"Number of AQI variables needing averaging: {aqi_over_threshold_count} out of {len(aqi_collapsed)}")
print(aqi_over_threshold.index.tolist())

weather_over_threshold = weather_collapsed.loc[weather_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
weather_over_threshold_count = len(weather_over_threshold)
print(f"Number of weather variables needing averaging: {weather_over_threshold_count} out of {len(weather_collapsed)}")
print(weather_over_threshold.index.tolist())


Processing Kings Long Island...
Processing NYC Long Island...
Processing Queens Long Island...
Processing Stanten Island Long Island...
Processing Bronx Long Island...
Number of AQI variables needing averaging: 0 out of 51
[]
Number of weather variables needing averaging: 4 out of 54
['precipitation_max', 'precipitation_min', 'rain_max', 'rain_min']


In [5]:
# Long Island: for every daily weather / air-quality variable, compare how much it
# varies over time at a single site with how much it varies across sites on the
# same day.

locations = [
    (40.9690, -72.2799, "East Hampton"),
    (40.7919, -73.1397, "Brentwood/Medford"),
    (40.7092, -73.6350, "Hempstead"),
]

weather_dfs, aqi_dfs = [], []

for lat, lon, label in locations:
    print(f"Processing {label} Long Island...")

    # Fetch both hourly series for this coordinate before converting anything.
    hourly_weather = get_hourly_weather_data(lat, lon)
    hourly_aqi = get_hourly_aqi_data(lat, lon)

    # Convert to daily, then stamp each frame with the site name.
    daily_weather = hourly_to_daily(hourly_weather)
    daily_aqi = hourly_to_daily(hourly_aqi)
    daily_weather['location'] = label
    daily_aqi['location'] = label

    weather_dfs.append(daily_weather)
    aqi_dfs.append(daily_aqi)

weather_combined = pd.concat(weather_dfs, ignore_index=True)
aqi_combined = pd.concat(aqi_dfs, ignore_index=True)

# Columns that identify a row instead of carrying a measurement.
non_metric_cols = ('date', 'location', 'city', 'state')
variance_cols = ['within_location_variance', 'between_location_variance']

# --- Weather: one record per (variable, location) ---
weather_vars = [name for name in weather_combined.columns if name not in non_metric_cols]
weather_summary = []

for var in weather_vars:
    # Variance of the variable over time, computed separately for each site.
    per_site_variance = weather_combined.groupby('location')[var].var()

    # Variance across sites for each date, averaged over all dates.
    by_date = weather_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    weather_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

weather_summary_df = pd.DataFrame(weather_summary)

# --- AQI: same computation on the air-quality frame ---
aqi_vars = [name for name in aqi_combined.columns if name not in non_metric_cols]
aqi_summary = []

for var in aqi_vars:
    per_site_variance = aqi_combined.groupby('location')[var].var()
    by_date = aqi_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    aqi_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

aqi_summary_df = pd.DataFrame(aqi_summary)

# --- Persist the per-location summaries (weather first, then AQI) ---
for summary_frame, csv_name in (
    (weather_summary_df, "weather_variance_summary_LongIsland.csv"),
    (aqi_summary_df, "aqi_variance_summary_LongIsland.csv"),
):
    summary_frame.to_csv(csv_name, index=False, float_format="%.15f")

# A variable whose between/within variance ratio exceeds this is reported as
# needing averaging across sites.
SPATIAL_RATIO_THRESHOLD = 0.5

# Reduce each summary to one row per variable (mean over locations) and add the ratio.
aqi_collapsed = aqi_summary_df.groupby('variable')[variance_cols].mean()
aqi_collapsed['spatial_ratio'] = aqi_collapsed['between_location_variance'].div(
    aqi_collapsed['within_location_variance']
)

weather_collapsed = weather_summary_df.groupby('variable')[variance_cols].mean()
weather_collapsed['spatial_ratio'] = weather_collapsed['between_location_variance'].div(
    weather_collapsed['within_location_variance']
)

# Report the variables whose ratio is above the threshold.
aqi_over_threshold = aqi_collapsed.loc[aqi_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
aqi_over_threshold_count = len(aqi_over_threshold)
print(f"Number of AQI variables needing averaging: {aqi_over_threshold_count} out of {len(aqi_collapsed)}")
print(aqi_over_threshold.index.tolist())

weather_over_threshold = weather_collapsed.loc[weather_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
weather_over_threshold_count = len(weather_over_threshold)
print(f"Number of weather variables needing averaging: {weather_over_threshold_count} out of {len(weather_collapsed)}")
print(weather_over_threshold.index.tolist())


Processing East Hampton Long Island...
Processing Brentwood/Medford Long Island...
Processing Hempstead Long Island...
Number of AQI variables needing averaging: 22 out of 51
['carbon_dioxide_max', 'carbon_dioxide_mean', 'carbon_dioxide_min', 'carbon_monoxide_max', 'carbon_monoxide_mean', 'methane_max', 'methane_mean', 'nitrogen_dioxide_max', 'nitrogen_dioxide_mean', 'nitrogen_dioxide_min', 'ozone_min', 'sulphur_dioxide_max', 'sulphur_dioxide_mean', 'sulphur_dioxide_min', 'us_aqi_carbon_monoxide_max', 'us_aqi_carbon_monoxide_mean', 'us_aqi_nitrogen_dioxide_max', 'us_aqi_nitrogen_dioxide_mean', 'us_aqi_nitrogen_dioxide_min', 'us_aqi_sulphur_dioxide_max', 'us_aqi_sulphur_dioxide_mean', 'us_aqi_sulphur_dioxide_min']
Number of weather variables needing averaging: 4 out of 54
['evapotranspiration_max', 'evapotranspiration_mean', 'precipitation_max', 'rain_max']


In [6]:
# Chicago: for every daily weather / air-quality variable, compare how much it
# varies over time at a single site with how much it varies across sites on the
# same day.

locations = [
    (41.8728, -87.6249, "Downtown"),
    (41.8744, -87.7340, "West"),
    (42.0028, -87.6874, "North"),
    (41.7585, -87.6339, "South"),
]

weather_dfs, aqi_dfs = [], []

for lat, lon, label in locations:
    print(f"Processing {label} Chicago...")

    # Fetch both hourly series for this coordinate before converting anything.
    hourly_weather = get_hourly_weather_data(lat, lon)
    hourly_aqi = get_hourly_aqi_data(lat, lon)

    # Convert to daily, then stamp each frame with the site name.
    daily_weather = hourly_to_daily(hourly_weather)
    daily_aqi = hourly_to_daily(hourly_aqi)
    daily_weather['location'] = label
    daily_aqi['location'] = label

    weather_dfs.append(daily_weather)
    aqi_dfs.append(daily_aqi)

weather_combined = pd.concat(weather_dfs, ignore_index=True)
aqi_combined = pd.concat(aqi_dfs, ignore_index=True)

# Columns that identify a row instead of carrying a measurement.
non_metric_cols = ('date', 'location', 'city', 'state')
variance_cols = ['within_location_variance', 'between_location_variance']

# --- Weather: one record per (variable, location) ---
weather_vars = [name for name in weather_combined.columns if name not in non_metric_cols]
weather_summary = []

for var in weather_vars:
    # Variance of the variable over time, computed separately for each site.
    per_site_variance = weather_combined.groupby('location')[var].var()

    # Variance across sites for each date, averaged over all dates.
    by_date = weather_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    weather_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

weather_summary_df = pd.DataFrame(weather_summary)

# --- AQI: same computation on the air-quality frame ---
aqi_vars = [name for name in aqi_combined.columns if name not in non_metric_cols]
aqi_summary = []

for var in aqi_vars:
    per_site_variance = aqi_combined.groupby('location')[var].var()
    by_date = aqi_combined.pivot(index='date', columns='location', values=var)
    between_var = by_date.var(axis=1).mean()

    aqi_summary.extend(
        {
            'variable': var,
            'location': site,
            'within_location_variance': site_variance,
            'between_location_variance': between_var,
        }
        for site, site_variance in per_site_variance.items()
    )

aqi_summary_df = pd.DataFrame(aqi_summary)

# --- Persist the per-location summaries (weather first, then AQI) ---
for summary_frame, csv_name in (
    (weather_summary_df, "weather_variance_summary_Chicago.csv"),
    (aqi_summary_df, "aqi_variance_summary_Chicago.csv"),
):
    summary_frame.to_csv(csv_name, index=False, float_format="%.15f")

# A variable whose between/within variance ratio exceeds this is reported as
# needing averaging across sites.
SPATIAL_RATIO_THRESHOLD = 0.5

# Reduce each summary to one row per variable (mean over locations) and add the ratio.
aqi_collapsed = aqi_summary_df.groupby('variable')[variance_cols].mean()
aqi_collapsed['spatial_ratio'] = aqi_collapsed['between_location_variance'].div(
    aqi_collapsed['within_location_variance']
)

weather_collapsed = weather_summary_df.groupby('variable')[variance_cols].mean()
weather_collapsed['spatial_ratio'] = weather_collapsed['between_location_variance'].div(
    weather_collapsed['within_location_variance']
)

# Report the variables whose ratio is above the threshold.
aqi_over_threshold = aqi_collapsed.loc[aqi_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
aqi_over_threshold_count = len(aqi_over_threshold)
print(f"Number of AQI variables needing averaging: {aqi_over_threshold_count} out of {len(aqi_collapsed)}")
print(aqi_over_threshold.index.tolist())

weather_over_threshold = weather_collapsed.loc[weather_collapsed['spatial_ratio'] > SPATIAL_RATIO_THRESHOLD]
weather_over_threshold_count = len(weather_over_threshold)
print(f"Number of weather variables needing averaging: {weather_over_threshold_count} out of {len(weather_collapsed)}")
print(weather_over_threshold.index.tolist())


Processing Downtown Chicago...
Processing West Chicago...
Processing North Chicago...
Processing South Chicago...
Number of AQI variables needing averaging: 0 out of 51
[]
Number of weather variables needing averaging: 2 out of 54
['precipitation_max', 'rain_max']
