### Imports

In [160]:
%load_ext autoreload
%autoreload 2

from openaq_anomaly_prediction.config import Configuration as config
from openaq_anomaly_prediction.utils.logging import logger
from openaq_anomaly_prediction.load.openaq import OpenAQ_Client

openaq = OpenAQ_Client()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[38;2;102;102;102m14:52:01[0m [32m[1m SUCCESS [1m|[0m[32m[1m  [0m[37m[4m[1mOPENAQ: APIKEY=default[0m[37m[4m[0m[37m[0m


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from openaq import OpenAQ

In [3]:
from pandas import json_normalize

In [4]:
config.getenv("OPENAQ_API_KEY")

'0942d3c2c4c9503a675043c169119ff1a9c0e8160da51bbe18caa6f470ffafc8'

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

### Functions (OpenAQ API)

In [100]:
import requests
from pandas import json_normalize

def request_openaq(url: str, params: dict) -> dict[str, pd.DataFrame]:

    # TODO: Add a safe-guard for rate limiting from the request headers.
    # Test the remaining requests within the limit, and if close to the limit, wait until reset.
    headers = {
        "X-API-Key": config.getenv("OPENAQ_API_KEY"),
        "accept": "application/json"
    }

    # 3. Make the Request
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status() # Raises error for 4xx or 5xx
        
        # 4. Process Data
        data = response.json()

        # pprint(data)  # Print metadata for debugging
        # print(data['meta'])  # Print metadata for debugging

        # meta = json_normalize(data['meta'])
        results = json_normalize(data['results'])
        return { "meta": data['meta'], "results": results }

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        print(response.text) # Print the error message from OpenAQ
        raise err

### Get locations

In [7]:
# GET LOCATIONS WITHIN BBOX

# IMPORTANT: API requires bbox as a comma-separated string,
# not a python list/tuple when using 'requests'
bbox_list = [76.772461,28.161110,77.768372,28.943516]
bbox_str = ",".join(map(str, bbox_list))

locations = request_openaq(
    "https://api.openaq.org/v3/locations", {"bbox": bbox_str, "limit": 1000,},
)
locations.shape

(107, 26)

In [8]:
# 1 location record example
display(locations.head(1).to_dict(orient='records'))

[{'id': 13,
  'name': 'Delhi Technological University, Delhi - CPCB',
  'locality': None,
  'timezone': 'Asia/Kolkata',
  'isMobile': False,
  'isMonitor': True,
  'instruments': [{'id': 2, 'name': 'Government Monitor'}],
  'sensors': [{'id': 13866,
    'name': 'no2 µg/m³',
    'parameter': {'id': 5,
     'name': 'no2',
     'units': 'µg/m³',
     'displayName': 'NO₂ mass'}},
   {'id': 24,
    'name': 'o3 µg/m³',
    'parameter': {'id': 3,
     'name': 'o3',
     'units': 'µg/m³',
     'displayName': 'O₃ mass'}},
   {'id': 13864,
    'name': 'pm25 µg/m³',
    'parameter': {'id': 2,
     'name': 'pm25',
     'units': 'µg/m³',
     'displayName': 'PM2.5'}}],
  'licenses': None,
  'bounds': [77.12, 28.744, 77.12, 28.744],
  'distance': None,
  'country.id': 9,
  'country.code': 'IN',
  'country.name': 'India',
  'owner.id': 4,
  'owner.name': 'Unknown Governmental Organization',
  'provider.id': 168,
  'provider.name': 'CPCB',
  'coordinates.latitude': 28.744,
  'coordinates.longitude': 7

In [9]:
locations.iloc[0]["sensors"]

[{'id': 13866,
  'name': 'no2 µg/m³',
  'parameter': {'id': 5,
   'name': 'no2',
   'units': 'µg/m³',
   'displayName': 'NO₂ mass'}},
 {'id': 24,
  'name': 'o3 µg/m³',
  'parameter': {'id': 3,
   'name': 'o3',
   'units': 'µg/m³',
   'displayName': 'O₃ mass'}},
 {'id': 13864,
  'name': 'pm25 µg/m³',
  'parameter': {'id': 2,
   'name': 'pm25',
   'units': 'µg/m³',
   'displayName': 'PM2.5'}}]

### Sensors Overview

In [10]:
# FLATTEN THE SENSORS AND THE PARAMETERS WITHIN EACH LOCATION
def flatten_sensors(row):
    flattened = [
        {
            'location_id': row.get('id'),
            'id': sensor.get('id'),
            'name': sensor.get('name'),
            'parameter_id': sensor.get('parameter', {}).get('id'),
            'parameter_name': sensor.get('parameter', {}).get('name'),
            'parameter_units': sensor.get('parameter', {}).get('units'),
            'parameter_displayName': sensor.get('parameter', {}).get('displayName'),
        }
        for sensor in row.get('sensors', [])
    ]
    return flattened

# 1. Flatten the sensors for each location
df_sensors_flat = locations.copy()
df_sensors_flat['sensors_flat'] = df_sensors_flat.apply(flatten_sensors, axis=1)

# 2. Create a new DataFrame holding every single "sensor"
df_sensors_exploded = df_sensors_flat["sensors_flat"].explode()
df_sensors = pd.DataFrame(df_sensors_exploded.tolist())
display(df_sensors.head())

# # 2.ALTERNATIVE (GEMINI): Using pd.json_normalize to directly explode and flatten nested lists all at once
# df_sensors = pd.json_normalize(
#     data=locations.to_dict(orient='records'), 
#     record_path='sensors',       # The key containing the list to explode
#     meta=['id'],                 # The parent columns to keep
#     meta_prefix='location_',     # Prefix for parent columns (id -> location_id)
#     sep='_'                      # Separator for nested keys (parameter.id -> parameter_id)
# )
# display(df_sensors.head())

Unnamed: 0,location_id,id,name,parameter_id,parameter_name,parameter_units,parameter_displayName
0,13,13866,no2 µg/m³,5,no2,µg/m³,NO₂ mass
1,13,24,o3 µg/m³,3,o3,µg/m³,O₃ mass
2,13,13864,pm25 µg/m³,2,pm25,µg/m³,PM2.5
3,15,27,co µg/m³,4,co,µg/m³,CO mass
4,15,28,no2 µg/m³,5,no2,µg/m³,NO₂ mass


In [11]:
# ANALYSE THE SENSORS DATAFRAME
parameter_column = "name"  # parameter_id | name (NOT parameter_name)

print(f"\n{'='*44}\n{'SENSORS OVERVIEW'}\n")
print(f"Unique {parameter_column}: {df_sensors.nunique().loc[parameter_column]}")
print(f"Unique location_id: {df_sensors.nunique().loc['location_id']}\n")
# display(df_sensors.nunique())

groupby = df_sensors.groupby([parameter_column, "location_id"])["location_id"].count()#.sort_values(ascending=False)
# display(groupby)
groupby = groupby.groupby([parameter_column])
# display(groupby.count())  # or groupby.value_counts()

# ---------------------------------------------------------------------
# Total number of locations per parameter_name (at least 1 sensor of that parameter)
sensors_count_total = pd.DataFrame(groupby.count().sort_values(ascending=False).reset_index()).rename(columns={"location_id": "nb_locations"})
sensors_count_total["coverage%"] = round(sensors_count_total["nb_locations"] / locations.shape[0] * 100, 2)
print(f"\n{'='*44}\n{'COVERAGE OF THE SENSORS (%TOTAL LOCATIONS)':^44}\n{'='*44}")
display(sensors_count_total)


# ---------------------------------------------------------------------
# Detailed sensor counts per location for each metric
sensors_count_per_location = pd.DataFrame(groupby.value_counts().reset_index()).rename(columns={"location_id": "count_per_location", "count": "nb_locations"})
# print(f"\n{'='*50}\n{'DISTRIBUTION OF THE SENSOR COUNT PER LOCATION':^50}\n{'='*50}")
# display(sensors_count_per_location)


# ---------------------------------------------------------------------
# Only keep sensors with more than 1 sensor per location
names_with_multiple_sensors = sensors_count_per_location[sensors_count_per_location["count_per_location"] > 1]["name"].unique()
sensors_count_with_multiple_sensors = sensors_count_per_location[sensors_count_per_location["name"].isin(names_with_multiple_sensors)].sort_values(by=["name", "count_per_location"], ascending=[True, False])
print(f"\n{'='*44}\n{'MULTIPLE SENSORS AT THE SAME LOCATION':^44}\n{'='*44}")
display(sensors_count_with_multiple_sensors)


SENSORS OVERVIEW

Unique name: 18
Unique location_id: 107


 COVERAGE OF THE SENSORS (%TOTAL LOCATIONS) 


Unnamed: 0,name,nb_locations,coverage%
0,pm25 µg/m³,105,98.13
1,o3 µg/m³,95,88.79
2,pm10 µg/m³,94,87.85
3,no2 µg/m³,87,81.31
4,co µg/m³,82,76.64
5,so2 µg/m³,73,68.22
6,no2 ppb,64,59.81
7,co ppb,63,58.88
8,nox ppb,63,58.88
9,no ppb,63,58.88



   MULTIPLE SENSORS AT THE SAME LOCATION    


Unnamed: 0,name,count_per_location,nb_locations
7,o3 µg/m³,2,50
8,o3 µg/m³,1,45
10,pm10 µg/m³,2,53
11,pm10 µg/m³,1,41
12,pm25 µg/m³,2,53
13,pm25 µg/m³,1,52


#### Gemini alternative

In [12]:
# ---------------------------------------------------------------------
# FULL GEMINI VERSION

import pandas as pd

# Assume df_sensors is your fully flattened sensor DataFrame
# Assume locations is your original locations DataFrame
parameter_column = "name" # You can switch this to 'parameter_id' or 'parameter_name'

print(f"\n{'='*40}\n{'SENSORS OVERVIEW'}\n")
print(f"Unique {parameter_column}: {df_sensors[parameter_column].nunique()}")
print(f"Unique location_id: {df_sensors['location_id'].nunique()}\n")

# --- CORE AGGREGATION STEP ---
# Calculate the number of sensors of a certain type (parameter_column)
# that exist at each location_id. This is the foundation for both analyses.
sensors_per_location_count = df_sensors.groupby(
    [parameter_column, "location_id"]
).size().reset_index(name='sensor_count_at_location')


# =====================================================================
# ANALYSIS 1: COVERAGE OF THE SENSORS (%TOTAL LOCATIONS)
# =====================================================================

# 1. Count how many locations have *at least one* sensor of each type.
# We just need to count the unique location_ids for each parameter_column.
sensors_count_total = sensors_per_location_count.groupby(parameter_column)['location_id'].count().reset_index(name='nb_locations')

# 2. Calculate Coverage %
total_locations = locations.shape[0]
sensors_count_total["coverage%"] = round(sensors_count_total["nb_locations"] / total_locations * 100, 2)
sensors_count_total = sensors_count_total.sort_values(by="nb_locations", ascending=False)

print(f"\n{'='*44}\n{'COVERAGE OF THE SENSORS (%TOTAL LOCATIONS)':^44}\n{'='*44}")
display(sensors_count_total)


# =====================================================================
# ANALYSIS 2: MULTIPLE SENSORS AT THE SAME LOCATION
# =====================================================================

# 1. Filter the core count table to only include rows where sensor_count_at_location > 1
multiple_sensors_df = sensors_per_location_count[
    sensors_per_location_count['sensor_count_at_location'] > 1
]

# 2. Now group this filtered table by the parameter to count how many locations
# have multiple sensors for that parameter.
# We use value_counts() on the sensor_count_at_location column to get the distribution.
sensors_count_with_multiple_sensors = multiple_sensors_df.groupby(parameter_column)['sensor_count_at_location'].value_counts().reset_index(name='nb_locations')
sensors_count_with_multiple_sensors.rename(columns={'sensor_count_at_location': 'count_per_location'}, inplace=True)
sensors_count_with_multiple_sensors = sensors_count_with_multiple_sensors.sort_values(by=[parameter_column, "nb_locations"], ascending=[True, False])


print(f"\n{'='*44}\n{'MULTIPLE SENSORS AT THE SAME LOCATION (Count Distribution)':^44}\n{'='*44}")
display(sensors_count_with_multiple_sensors)


SENSORS OVERVIEW

Unique name: 18
Unique location_id: 107


 COVERAGE OF THE SENSORS (%TOTAL LOCATIONS) 


Unnamed: 0,name,nb_locations,coverage%
10,pm25 µg/m³,105,98.13
7,o3 µg/m³,95,88.79
9,pm10 µg/m³,94,87.85
4,no2 µg/m³,87,81.31
1,co µg/m³,82,76.64
13,so2 µg/m³,73,68.22
3,no2 ppb,64,59.81
0,co ppb,63,58.88
5,nox ppb,63,58.88
2,no ppb,63,58.88



MULTIPLE SENSORS AT THE SAME LOCATION (Count Distribution)


Unnamed: 0,name,count_per_location,nb_locations
0,o3 µg/m³,2,50
1,pm10 µg/m³,2,53
2,pm25 µg/m³,2,53


### Measurements

In [13]:
location_id = 13
sensor_id = 13866

measurements = request_openaq(
    f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements", {"limit": 1000,},
)

measurements["location_id"] = location_id
measurements["sensor_id"] = sensor_id

In [14]:
# Add sensor_id manually
print(measurements.shape)
measurements.head(1).to_dict(orient='records')

(1000, 26)


[{'value': 11.3,
  'coordinates': None,
  'summary': None,
  'flagInfo.hasFlags': False,
  'parameter.id': 5,
  'parameter.name': 'no2',
  'parameter.units': 'µg/m³',
  'parameter.displayName': None,
  'period.label': 'raw',
  'period.interval': '01:00:00',
  'period.datetimeFrom.utc': '2016-11-02T17:30:00Z',
  'period.datetimeFrom.local': '2016-11-02T23:00:00+05:30',
  'period.datetimeTo.utc': '2016-11-02T18:30:00Z',
  'period.datetimeTo.local': '2016-11-03T00:00:00+05:30',
  'coverage.expectedCount': 1,
  'coverage.expectedInterval': '01:00:00',
  'coverage.observedCount': 1,
  'coverage.observedInterval': '00:15:00',
  'coverage.percentComplete': 100.0,
  'coverage.percentCoverage': 0.0,
  'coverage.datetimeFrom.utc': '2016-11-02T18:15:00Z',
  'coverage.datetimeFrom.local': '2016-11-02T23:45:00+05:30',
  'coverage.datetimeTo.utc': '2016-11-02T18:30:00Z',
  'coverage.datetimeTo.local': '2016-11-03T00:00:00+05:30',
  'location_id': 13,
  'sensor_id': 13866}]

In [15]:
from datetime import datetime as dt

# Create new columns with proper datetime objects
df_measurements = measurements.copy()
df_measurements["datetimeFrom_utc"] = pd.to_datetime(df_measurements["period.datetimeFrom.utc"])
df_measurements["datetimeFrom_local"] = pd.to_datetime(df_measurements["period.datetimeFrom.local"])
df_measurements["datetimeTo_utc"] = pd.to_datetime(df_measurements["period.datetimeTo.utc"])
df_measurements["datetimeTo_local"] = pd.to_datetime(df_measurements["period.datetimeTo.local"])

# Filter measurements for a specific date (e.g., 2016-11-03)
date_mask = df_measurements["datetimeFrom_local"].dt.date == pd.to_datetime("2016-11-03").date()
ddf_measurements_filtered= df_measurements[date_mask][["location_id", "sensor_id", "parameter.id", "datetimeFrom_local", "datetimeTo_local", "period.interval", "value"]]
ddf_measurements_filtered

# df_measurements[date_mask]
# df_measurements["datetimeFrom_utc_str"] = df_measurements.dt.strftime('%Y-%m-%d %H:%M:%S')


Unnamed: 0,location_id,sensor_id,parameter.id,datetimeFrom_local,datetimeTo_local,period.interval,value
1,13,13866,5,2016-11-03 16:15:00+05:30,2016-11-03 17:15:00+05:30,01:00:00,8.6
2,13,13866,5,2016-11-03 16:30:00+05:30,2016-11-03 17:30:00+05:30,01:00:00,8.6
3,13,13866,5,2016-11-03 16:45:00+05:30,2016-11-03 17:45:00+05:30,01:00:00,9.3
4,13,13866,5,2016-11-03 17:00:00+05:30,2016-11-03 18:00:00+05:30,01:00:00,10.6
5,13,13866,5,2016-11-03 17:15:00+05:30,2016-11-03 18:15:00+05:30,01:00:00,8.6
6,13,13866,5,2016-11-03 17:30:00+05:30,2016-11-03 18:30:00+05:30,01:00:00,8.0
7,13,13866,5,2016-11-03 17:45:00+05:30,2016-11-03 18:45:00+05:30,01:00:00,8.5
8,13,13866,5,2016-11-03 18:00:00+05:30,2016-11-03 19:00:00+05:30,01:00:00,8.7
9,13,13866,5,2016-11-03 18:15:00+05:30,2016-11-03 19:15:00+05:30,01:00:00,7.5
10,13,13866,5,2016-11-03 18:30:00+05:30,2016-11-03 19:30:00+05:30,01:00:00,5.7


### Hourly Measurements

In [177]:
location_id = 235
sensor_id = 12235609

res = request_openaq(
    f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements/hourly",
    {
        "datetime_from": "2025-06-01T00:00:00+00:00",
        "datetime_to": "2025-06-02T00:00:00+00:00",
        "limit": 100,
        "pages": 1,
    },
)
res["meta"]

# hourly_measurements = res["results"]
# hourly_measurements["sensor_id"] = sensor_id
# hourly_measurements["location_id"] = location_id

{'name': 'openaq-api', 'website': '/', 'page': 1, 'limit': 100, 'found': 24}

In [180]:
res["results"]["parameter.name"].value_counts()

parameter.name
pm10    24
Name: count, dtype: int64

In [111]:
res["meta"]

{'name': 'openaq-api', 'website': '/', 'page': 1, 'limit': 1000, 'found': 0}

In [166]:
res["results"]

Unnamed: 0,sensor_id,location_id


In [107]:
print(hourly_measurements.shape)
hourly_measurements.head(1)#.to_dict(orient='records')

(1000, 34)


Unnamed: 0,value,coordinates,flagInfo.hasFlags,parameter.id,parameter.name,parameter.units,parameter.displayName,period.label,period.interval,period.datetimeFrom.utc,period.datetimeFrom.local,period.datetimeTo.utc,period.datetimeTo.local,summary.min,summary.q02,summary.q25,summary.median,summary.q75,summary.q98,summary.max,summary.avg,summary.sd,coverage.expectedCount,coverage.expectedInterval,coverage.observedCount,coverage.observedInterval,coverage.percentComplete,coverage.percentCoverage,coverage.datetimeFrom.utc,coverage.datetimeFrom.local,coverage.datetimeTo.utc,coverage.datetimeTo.local,sensor_id,location_id
0,266.0,,False,1,pm10,µg/m³,,1 hour,01:00:00,2025-02-18T21:30:00Z,2025-02-19T03:00:00+05:30,2025-02-18T22:30:00Z,2025-02-19T04:00:00+05:30,260.0,260.24,263.0,266.0,269.0,271.76,272.0,266.0,8.485281,4,01:00:00,2,00:30:00,50.0,50.0,2025-02-18T22:00:00Z,2025-02-19T03:30:00+05:30,2025-02-18T22:30:00Z,2025-02-19T04:00:00+05:30,12235609,235


In [18]:
hourly_measurements.iloc[-1]

value                                               23.1
coordinates                                         None
flagInfo.hasFlags                                  False
parameter.id                                           5
parameter.name                                       no2
parameter.units                                    µg/m³
parameter.displayName                               None
period.label                                      1 hour
period.interval                                 01:00:00
period.datetimeFrom.utc             2017-02-21T20:30:00Z
period.datetimeFrom.local      2017-02-22T02:00:00+05:30
period.datetimeTo.utc               2017-02-21T21:30:00Z
period.datetimeTo.local        2017-02-22T03:00:00+05:30
summary.min                                         22.2
summary.q02                                       22.234
summary.q25                                       22.625
summary.median                                     23.05
summary.q75                    

In [19]:
from datetime import datetime as dt
# DOCUMENTATION
# For a given timestamp (local), the 'hourly' measurements represent the average value
# over the PREVIOUS hour interval (datetimeFrom minus period.interval=1h) excluding the previous hour itself.
# E.g., for datetimeFrom.local = '2016-11-03 01:00:00' and period.interval = '1h',
# the measurement value corresponds to the average of the values between '2016-11-03 00:00:01' and '2016-11-03 01:00:00'.

# Create new columns with proper datetime objects
df_hourly_measurements = hourly_measurements.copy()
df_hourly_measurements["datetimeFrom_utc"] = pd.to_datetime(df_hourly_measurements["period.datetimeFrom.utc"])
df_hourly_measurements["datetimeFrom_local"] = pd.to_datetime(df_hourly_measurements["period.datetimeFrom.local"])
df_hourly_measurements["datetimeTo_utc"] = pd.to_datetime(df_hourly_measurements["period.datetimeTo.utc"])
df_hourly_measurements["datetimeTo_local"] = pd.to_datetime(df_hourly_measurements["period.datetimeTo.local"])

# Filter measurements for a specific date (e.g., 2016-11-03)
date_mask = df_hourly_measurements["datetimeFrom_local"].dt.date == pd.to_datetime("2016-11-04").date()
ddf_hourly_measurements_filtered= df_hourly_measurements[date_mask][["location_id", "sensor_id", "parameter.id", "datetimeFrom_local", "datetimeTo_local", "period.interval", "value"]]
ddf_hourly_measurements_filtered

# df_measurements[date_mask]
# df_measurements["datetimeFrom_utc_str"] = df_measurements.dt.strftime('%Y-%m-%d %H:%M:%S')


Unnamed: 0,location_id,sensor_id,parameter.id,datetimeFrom_local,datetimeTo_local,period.interval,value
8,13,13866,5,2016-11-04 00:00:00+05:30,2016-11-04 01:00:00+05:30,01:00:00,7.15
9,13,13866,5,2016-11-04 01:00:00+05:30,2016-11-04 02:00:00+05:30,01:00:00,2.5
10,13,13866,5,2016-11-04 02:00:00+05:30,2016-11-04 03:00:00+05:30,01:00:00,8.4
11,13,13866,5,2016-11-04 03:00:00+05:30,2016-11-04 04:00:00+05:30,01:00:00,10.9
12,13,13866,5,2016-11-04 04:00:00+05:30,2016-11-04 05:00:00+05:30,01:00:00,11.3
13,13,13866,5,2016-11-04 05:00:00+05:30,2016-11-04 06:00:00+05:30,01:00:00,10.5
14,13,13866,5,2016-11-04 06:00:00+05:30,2016-11-04 07:00:00+05:30,01:00:00,10.1
15,13,13866,5,2016-11-04 07:00:00+05:30,2016-11-04 08:00:00+05:30,01:00:00,8.4
16,13,13866,5,2016-11-04 08:00:00+05:30,2016-11-04 09:00:00+05:30,01:00:00,9.38
17,13,13866,5,2016-11-04 09:00:00+05:30,2016-11-04 10:00:00+05:30,01:00:00,8.55


### Instruments

In [20]:
id = 2

instruments = request_openaq(
    "https://api.openaq.org/v3/instruments/2", {"limit": 1000,},
)

In [21]:
instruments

Unnamed: 0,id,name,isMonitor,manufacturer.id,manufacturer.name
0,2,Government Monitor,True,4,Unknown Governmental Organization


### Download archives from AWS

In [38]:
import boto3
import os
import gzip

from botocore import UNSIGNED
from botocore.config import Config


def get_location_archives_aws(location_id: int,year: int, month: int | None = None, **kwargs) -> list[str]:
    """Download all archives CSV files from OpenAQ S3 bucket and unzip them locally."""

    debug = kwargs.get("debug", False)
    trace_prefix = kwargs.get("trace_prefix", "")

    # Configuration (OpenAQ)
    s3_prefix_month = f"/month={month:02d}" if month is not None else ""
    BUCKET_NAME = "openaq-data-archive"
    S3_PREFIX = f"records/csv.gz/locationid={location_id}/year={year}{s3_prefix_month}"
    LOCAL_GZ_DIR = os.path.join(config.DATA_PATH, "tmp")
    LOCAL_CSV_DIR = os.path.join(config.DATA_PATH, "aws")

    s3 = boto3.client(
        "s3", region_name="us-east-1", config=Config(signature_version=UNSIGNED)
    )

    # ---------------------------------------------------------------------
    # LIST ALL FILES UNDER THE S3 PREFIX

    logger.debug(f"Scanning s3://{BUCKET_NAME}/{S3_PREFIX}")

    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=BUCKET_NAME, Prefix=S3_PREFIX)

    download_list = []

    # We loop through pages just to collect the keys first
    for page in pages:
        if "Contents" in page:
            for obj in page["Contents"]:
                key = obj["Key"]
                if not key.endswith("/"):  # ignore folders (ending with /)
                    download_list.append(key)

    total_files = len(download_list)
    if total_files > 0:
        logger.info(f"Found {total_files} files. Starting downloads...")
    else:
        logger.warning(f"No files found for s3://{BUCKET_NAME}/{S3_PREFIX}. Exiting.")
        return []

    # pprint(download_list)

    # ---------------------------------------------------------------------
    # DOWNLOAD ALL FILES AND UNZIP

    # Create directories if they don't exist
    os.makedirs(LOCAL_GZ_DIR, exist_ok=True)
    os.makedirs(LOCAL_CSV_DIR, exist_ok=True)

    # for s3_key in tqdm(download_list, unit='file', desc="Downloading files"):
    errors_s3_keys = []
    csv_file_paths = []
    for index, s3_key in enumerate(download_list):
        if debug:
            logger.trace(f"{trace_prefix}Downloading {index + 1}/{total_files}: {s3_key}")

        # Create paths: local_file_path and temp_gz_path
        relative_path = s3_key[len(S3_PREFIX) :].lstrip(os.path.sep)
        relative_path = relative_path.split(
            os.path.sep
        )[
            -1
        ]  # cleanup to filename only (will erase files with same name in different subdirs)
        gz_file_path = os.path.join(LOCAL_GZ_DIR, relative_path)
        csv_file_path = os.path.join(LOCAL_CSV_DIR, relative_path)[
            :-3
        ]  # remove .gz extension
        # logger.trace(f"gz_file_path: {gz_file_path}")
        # logger.trace(f"csv_file_path: {csv_file_path}")

        # 3. Download
        try:
            # Download to tmp/*.gz
            s3.download_file(BUCKET_NAME, s3_key, gz_file_path)

            # Unzip to csv/*.csv
            with gzip.open(gz_file_path, "rb") as f_in:
                with open(csv_file_path, "wb") as f_out:
                    f_out.write(f_in.read())
            os.remove(gz_file_path)

            # Add the final path to the list
            csv_file_paths.append(csv_file_path)

        except Exception as e:  # don't break loop on single file error
            errors_s3_keys.append(s3_key)
            logger.error(f"Error downloading {s3_key}: {e}")

    os.rmdir(LOCAL_GZ_DIR)

    logger.success(f"Finished downloading {total_files} files.")

    if len(errors_s3_keys) > 0:
        logger.warning(f"{len(errors_s3_keys)} errors during download:")
        for error_key in errors_s3_keys:
            logger.warning(f" - {error_key}")

    return csv_file_paths

In [23]:
csv_files = get_location_archives_aws(location_id=13, year=2016, month=12)

[38;2;102;102;102m09:40:38[0m [34m[1m   DEBUG [1m|[0m[34m[1m  [0m[37mScanning s3://openaq-data-archive/records/csv.gz/locationid=13/year=2016/month=12[0m
[38;2;102;102;102m09:40:39[0m [1m    INFO [1m|[0m[1m  [0m[37mFound 23 files. Starting downloads...[0m
[38;2;102;102;102m09:40:45[0m [32m[1m SUCCESS [1m|[0m[32m[1m  [0m[37m[4m[1mFinished downloading 23 files.[0m[37m[4m[0m[37m[0m


In [26]:
df[["parameter", "units"]].value_counts()

parameter  units
pm25       µg/m³    675
no2        µg/m³    553
Name: count, dtype: int64

In [27]:
# display(locations.head(1).to_dict(orient='records'))


#### Test

# RETRY FROM location_id= 5627

In [None]:
MANUAL_LOCATION_IDS = [
    # 13,
    # 15,
    # 16,
    # 17,
    # 50,
    # 103,
    # 235,
    # 236,
    # 301,
    # 431,
    # 2503,
    # 2587,
    # 2597,
    # 5404,
    # 5509,
    # 5540,
    # 5541,
    # 5570,
    # 5581,
    # 5586,
    # 5588,
    # 5598,
    # 5610,
    # 5613,
    # 5616,
    # 5617,
    # 5622,
    # 5626,
    # 5627,
    # 5630,
    5634,
    5639,
    5641,
    5650,
    5665,
    5742,
    5743,
    5744,
    5753,
    5754,
    5755,
    5759,
    5760,
    5761,
    5762,
    5763,
    5765,
    5766,
    6314,
    6356,
    6357,
    6358,
    6359,
    6924,
    6929,
    6931,
    6932,
    6934,
    6936,
    6938,
    6949,
    6953,
    6957,
    6960,
    6972,
    6978,
    6980,
    6986,
    6988,
    7005,
    7044,
    7434,
    8092,
    8118,
    8235,
    8239,
    8319,
    8365,
    8472,
    8475,
    8915,
    8917,
    10484,
    10485,
    10486,
    10487,
    10488,
    10489,
    10820,
    10825,
    10831,
    10900,
    10908,
    10919,
    10920,
    10921,
    11603,
    11607,
    2860223,
    3024966,
    3409519,
    3410004,
    4663956,
    6105800,
    6119272,
    6125489,
    6145551,
]

In [40]:
# newdelhi_location_ids = locations["id"].to_list()
# newdelhi_location_ids
# # newdelhi_location_ids = [13, 15]

location_processed = 0
number_of_locations = len(MANUAL_LOCATION_IDS)
for location_id in MANUAL_LOCATION_IDS:
    csv_files = get_location_archives_aws(location_id=location_id, year=2025, debug=True, trace_prefix=f"[{location_processed + 1}/{number_of_locations}] ")
    location_processed += 1

[38;2;102;102;102m10:05:15[0m [34m[1m   DEBUG [1m|[0m[34m[1m  [0m[37mScanning s3://openaq-data-archive/records/csv.gz/locationid=5627/year=2025[0m
[38;2;102;102;102m10:05:16[0m [1m    INFO [1m|[0m[1m  [0m[37mFound 287 files. Starting downloads...[0m
[38;2;102;102;102m10:05:16[0m [38;2;102;102;102m   TRACE [1m|[0m[38;2;102;102;102m  [0m[38;2;102;102;102m 󰘍 [1/79] Downloading 1/287: records/csv.gz/locationid=5627/year=2025/month=02/location-5627-20250219.csv.gz[0m
[38;2;102;102;102m10:05:16[0m [38;2;102;102;102m   TRACE [1m|[0m[38;2;102;102;102m  [0m[38;2;102;102;102m 󰘍 [1/79] Downloading 2/287: records/csv.gz/locationid=5627/year=2025/month=02/location-5627-20250220.csv.gz[0m
[38;2;102;102;102m10:05:16[0m [38;2;102;102;102m   TRACE [1m|[0m[38;2;102;102;102m  [0m[38;2;102;102;102m 󰘍 [1/79] Downloading 3/287: records/csv.gz/locationid=5627/year=2025/month=02/location-5627-20250221.csv.gz[0m
[38;2;102;102;102m10:05:16[0m [38;2;102;102;102m 

In [149]:
# BASELINE: Retrieve the same data using the API (hourly mean)

location_id = 235
sensor_id = 12235605

res = request_openaq(
    f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements/hourly",
    {
        "datetime_from": "2025-08-21 00:45:00+05:30",
        "datetime_to": "2025-08-21 04:00:00+05:30",
        "limit": 1000,
        "pages": 1,
    },
)

baseline_measurements = res["results"]
baseline_measurements["sensor_id"] = sensor_id
baseline_measurements["location_id"] = location_id

In [152]:
# 1. Create "chunks" of CSVs so avoid memory overhead.
# 2. Add a project name
import glob


def create_dataframe_from_csv_files(
    filename: str,
    csv_filenames: list[str] | None = None,
    parent_path: str = "",
    **kwargs,
) -> None:
    """Create a single DataFrame by concatenating multiple CSV files."""

    OUTPUT_CSV_DIR = os.path.join(config.DATA_PATH, "csv")
    os.makedirs(OUTPUT_CSV_DIR, exist_ok=True)

    print(f"filename: {filename}")
    print(f"parent_path: {parent_path}")

    input_path = parent_path
    output_file = filename + ".csv"
    output_path = os.path.join(OUTPUT_CSV_DIR, output_file)

    # print(csv_filenames)
    # return
    if csv_filenames is not None:
        print(f"Using provided list of {len(csv_filenames)} CSV files.")
        all_files = [os.path.join(input_path, f) for f in csv_filenames]
    else:

        all_files = glob.glob(os.path.join(input_path, "*.csv"))

    # print(f"glob path: {os.path.join(input_path, '*.csv')}")
    print(f"Found {len(all_files)} files.")

    # 2. Iterate through files and write to the output file
    for i, csv_path in enumerate(all_files):
        # Read the file
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error reading {csv_path}: {e}. Skipping.")
            continue  # Skip to the next file

        if i == 0:
            # First file: Write the header row
            df.to_csv(output_path, mode="w", index=False, header=True)
        else:
            # Subsequent files: Append without the header row
            df.to_csv(output_path, mode="a", index=False, header=False)

    return None

create_dataframe_from_csv_files("new_delhi_235_20250821", csv_filenames=["location-235-20250821.csv"], parent_path=config.DATA_PATH / "aws")

df_test = pd.read_csv(config.DATA_PATH / "csv" / "new_delhi_235_20250821.csv")
# df_test.sort_values(by="datetime")

df_test["datetime"] = pd.to_datetime(df_test["datetime"])
df_test = df_test.sort_values(by=["sensors_id", "datetime"])


# Filter measurements for a specific date (e.g., 2016-11-03)
# date_mask = df_test["sensor_id"] == 12235605
# df_test_filtered= df_test[date_mask][["location_id", "sensor_id", "parameter.id", "datetimeFrom_local", "datetimeTo_local", "period.interval", "value"]]
# display(df_test_filtered)




# display(df_test[["location_id", "sensors_id", "datetime", "parameter", "units", "value"]].head(10))
display(baseline_measurements[["location_id", "sensor_id", "period.datetimeFrom.local", "parameter.name", "parameter.units", "value"]].head(10))
        
# GEMINI

# 2. Set 'datetime' as the index. This is necessary for resampling.
df_test = df_test.set_index('datetime')

# Columns to aggregate by (these are the non-time-series columns for a single sensor's hourly mean)
groupby_cols = ['location_id', 'sensors_id', 'location', 'parameter', 'units']
# Columns to keep which are not aggregated (e.g. lat, lon). We'll take the first non-null value for these in each group.
other_cols = ['lat', 'lon']

# 3. Define the aggregation dictionary
agg_dict = {'value': 'mean'} # Calculate the mean of 'value'
for col in other_cols:
    agg_dict[col] = 'first' # Take the first value for static columns (lat, lon)

hourly_aggregated_df = df_test.groupby(groupby_cols).resample('h').agg(agg_dict)
hourly_aggregated_df = hourly_aggregated_df.reset_index().sort_values(by=["sensors_id", "datetime"])
display(hourly_aggregated_df.head())
# END


# display(df_test["sensors_id"].value_counts())


# df_test[["location_id", "sensors_id", "datetime", "parameter", "units", "value"]].head(10)

# df_test.info()
# df = create_dataframe_from_csv_files(
#     "new_delhi", parent_path=config.DATA_PATH / "aws"
# )


filename: new_delhi_235_20250821
parent_path: /home/deniscck/code/denis-cck/openaq_anomaly_prediction/data/aws
Using provided list of 1 CSV files.
Found 1 files.


Unnamed: 0,location_id,sensor_id,period.datetimeFrom.local,parameter.name,parameter.units,value
0,235,12235605,2025-08-21T00:00:00+05:30,co,ppb,1.45
1,235,12235605,2025-08-21T01:00:00+05:30,co,ppb,1.43
2,235,12235605,2025-08-21T02:00:00+05:30,co,ppb,1.36
3,235,12235605,2025-08-21T03:00:00+05:30,co,ppb,1.46


Unnamed: 0,location_id,sensors_id,location,parameter,units,datetime,value,lat,lon
0,235,12235605,"Anand Vihar, New Delhi - DPCC-3379575",co,ppb,2025-08-21 00:00:00+05:30,1.45,28.646835,77.316032
1,235,12235605,"Anand Vihar, New Delhi - DPCC-3379575",co,ppb,2025-08-21 01:00:00+05:30,1.435,28.646835,77.316032
2,235,12235605,"Anand Vihar, New Delhi - DPCC-3379575",co,ppb,2025-08-21 02:00:00+05:30,1.41,28.646835,77.316032
3,235,12235605,"Anand Vihar, New Delhi - DPCC-3379575",co,ppb,2025-08-21 03:00:00+05:30,1.4,28.646835,77.316032
4,235,12235605,"Anand Vihar, New Delhi - DPCC-3379575",co,ppb,2025-08-21 04:00:00+05:30,1.625,28.646835,77.316032


In [53]:
df = create_dataframe_from_csv_files(csv_files, path=config.DATA_PATH / "aws")
# df.to_csv(config.DATA_PATH / "all_newdelhi_2025.csv", index=True)

*.csv


filename: 
parent_path: /home/deniscck/code/denis-cck/openaq_anomaly_prediction/data/aws
glob path: /home/deniscck/code/denis-cck/openaq_anomaly_prediction/data/aws/*.csv
Found 17495 files.


KeyboardInterrupt: 