## Initial AQ Data Collection
These scripts query the OpenAQ database through its API to collect the necessary data and save it to CSV files.

### Library imports


In [1]:
import datetime
import json
import os
import time

import pandas as pd
import requests

# Read the OpenAQ API key from the environment so the secret is never stored in
# the notebook source or its saved outputs. Set it before starting the kernel,
# e.g. `export OPENAQ_API_KEY=<your key>`.
api_key = os.environ.get('OPENAQ_API_KEY', '').strip()
if not api_key:
    # Fail here rather than partway through the long-running download cell.
    raise RuntimeError(
        "OPENAQ_API_KEY is not set. Export it in the environment before running this notebook."
    )


In [2]:


# Endpoint that lists OpenAQ monitoring locations.
locations_url = "https://api.openaq.org/v2/locations"

# Get the list of locations in Canada
parameters = {
    "country": "CA",
    "limit": 10000,
}

# The key is sent as a request header instead of a query parameter so that it
# never becomes part of the URL (and therefore of any logged/printed request).
locations_headers = {"X-API-Key": api_key}

response = requests.get(
    locations_url,
    params=parameters,
    headers=locations_headers,
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Surface HTTP errors here instead of failing later on a missing 'results' key.
response.raise_for_status()

data = response.json()

location_records = data.get('results', [])
if not location_records:
    raise RuntimeError("The locations request returned no results; nothing to process.")

# Extract the list of locations (nested fields become dotted column names)
locations_df = pd.json_normalize(location_records)

# Remove values where city (province) is not known
locations_df = locations_df.dropna(subset=['city'])
locations_df = locations_df[locations_df['city'] != 'N/A']

print(len(locations_df))
print(locations_df['city'].unique())


215
['ONTARIO' 'ALBERTA' 'BRITISH COLUMBIA' 'NEW BRUNSWICK' 'NOVA SCOTIA'
 'SASKATCHEWAN' 'QUEBEC' 'NORTHWEST TERRITORIES' 'NEWFOUNDLAND'
 'PRINCE EDWARD ISLAND' 'MANITOBA' 'AROOSTOOK' 'Sherbrooke' 'Halifax'
 'Kelowna' 'Toronto' 'Lethbridge']


In [3]:

# Clean up city and map to province accordingly.
# The 'city' field holds a mix of full province names and individual place
# names; every value seen so far is listed here with its two-letter code.
CITY_TO_PROVINCE = {
  'QUEBEC': 'QC',
  'ALBERTA': 'AB',
  'ONTARIO': 'ON',
  'PRINCE EDWARD ISLAND': 'PE',
  'SASKATCHEWAN': 'SK',
  'BRITISH COLUMBIA': 'BC',
  'NORTHWEST TERRITORIES': 'NT',
  'NEW BRUNSWICK': 'NB',
  'NEWFOUNDLAND': 'NL',
  'NOVA SCOTIA': 'NS',
  'MANITOBA': 'MB',
  'AROOSTOOK': 'NB',
  'Sherbrooke': 'QC',
  'Halifax': 'NS',
  'Kelowna': 'BC',
  'Toronto': 'ON',
  'Lethbridge': 'AB'
}

# The city values come from a live API, so new ones can appear. Report all of
# them at once instead of failing on the first row with a bare KeyError.
unmapped_cities = sorted(set(locations_df['city']) - set(CITY_TO_PROVINCE))
if unmapped_cities:
    raise KeyError(f"No province mapping for city value(s): {unmapped_cities}")

# .assign returns a new frame, which avoids SettingWithCopyWarning on the
# filtered frame produced by the previous cell.
locations_df = (
    locations_df
    .assign(province=locations_df['city'].map(CITY_TO_PROVINCE))
    .drop(columns='city')
)

# TODO: use the pm25 entries in each location's 'parameters' list to derive an average.
locations_df.to_csv('canada_aq_locations.csv', index=False)

locations_df.head()

Unnamed: 0,id,name,entity,country,sources,isMobile,isAnalysis,parameters,sensorType,lastUpdated,firstUpdated,measurements,bounds,manufacturers,coordinates.latitude,coordinates.longitude,province
0,953,Pickle Lake,,CA,,False,,"[{'id': 10, 'unit': 'ppm', 'count': 46565, 'av...",,2024-04-21T11:00:00+00:00,2016-03-06T19:00:00+00:00,46565,"[-90.2175, 54.4494, -90.2175, 54.4494]","[{'modelName': 'Government Monitor', 'manufact...",54.4494,-90.2175,ON
1,285,Wagner2,,CA,,False,,"[{'id': 7, 'unit': 'ppm', 'count': 33591, 'ave...",,2024-04-21T11:00:00+00:00,2016-03-10T07:00:00+00:00,67197,"[-114.449722, 53.493889, -114.449722, 53.493889]","[{'modelName': 'Government Monitor', 'manufact...",53.493889,-114.449722,AB
2,287,St. Lina,,CA,,False,,"[{'id': 9, 'unit': 'ppm', 'count': 17146, 'ave...",,2024-04-21T11:00:00+00:00,2016-03-10T07:00:00+00:00,69668,"[-111.50264, 54.216473, -111.50264, 54.216473]","[{'modelName': 'Government Monitor', 'manufact...",54.216473,-111.50264,AB
3,297,Steeper,,CA,,False,,"[{'id': 7, 'unit': 'ppm', 'count': 31206, 'ave...",,2024-04-21T11:00:00+00:00,2016-03-10T07:00:00+00:00,127847,"[-117.09111, 53.1325, -117.09111, 53.1325]","[{'modelName': 'Government Monitor', 'manufact...",53.1325,-117.09111,AB
5,7975,Vanderhoof Courthous,,CA,,False,,"[{'id': 1, 'unit': 'µg/m³', 'count': 9064, 'av...",,2024-04-21T11:00:00+00:00,2018-10-04T21:00:00+00:00,17852,"[-124.0061, 54.0163, -124.0061, 54.0163]","[{'modelName': 'Government Monitor', 'manufact...",54.0163,-124.0061,BC


### Fetch measurements for each location
Note: this cell takes over an hour to run.

In [None]:

# Parameters for the API query (shared by every request)
base_params = {
    'limit': 10000,  # Get the maximum allowed per request
    'parameter': 'pm25',  # Or any other desired parameter
}

measurements_url = "https://api.openaq.org/v2/measurements"

# The key travels in a header, not in `params`, so printing the request
# parameters below never writes the secret into the notebook output.
request_headers = {'X-API-Key': api_key}

# Per-location CSVs are written here; create it up front so the first
# to_csv call cannot fail on a missing directory.
output_dir = 'aq_data'
os.makedirs(output_dir, exist_ok=True)

DEFAULT_RETRY_WAIT_SECONDS = 30


def fetch_measurements(location_id, year):
    """Fetch one calendar year of measurements for a single location.

    Retries for as long as the API answers HTTP 429, waiting the number of
    seconds given in the Retry-After header (or DEFAULT_RETRY_WAIT_SECONDS
    when the header is missing or not an integer).

    Args:
        location_id: OpenAQ location id to query.
        year: Calendar year; the query spans Jan 1 to Dec 31 of that year.

    Returns:
        The list under the response's 'results' key (possibly empty), or
        None if the request failed with a network error or a non-200,
        non-429 status.
    """
    params = base_params.copy()  # never mutate the shared base parameters
    params['location_id'] = location_id
    params['date_from'] = datetime.date(year, 1, 1).strftime('%Y-%m-%d')
    # NOTE(review): date_to is a bare date; confirm whether the API treats it
    # as inclusive of the whole of Dec 31 or as midnight at the start of it.
    params['date_to'] = datetime.date(year, 12, 31).strftime('%Y-%m-%d')

    while True:
        print(params)
        time.sleep(2)  # fixed pause before every request to stay under the rate limit
        try:
            response = requests.get(
                measurements_url,
                params=params,
                headers=request_headers,
                timeout=60,
            )
        except requests.RequestException as exc:
            print(f"Error for location {location_id}: {exc}")
            return None

        if response.status_code == 200:
            return response.json()['results']

        if response.status_code == 429:  # Too many requests
            retry_after = response.headers.get('Retry-After')
            try:
                wait_seconds = max(int(retry_after), 1)
            except (TypeError, ValueError):
                # Header absent, or given as an HTTP date rather than seconds.
                wait_seconds = DEFAULT_RETRY_WAIT_SECONDS
            print(
                f"Rate limit exceeded for location {location_id}. "
                f"Waiting {wait_seconds} seconds before retrying"
            )
            time.sleep(wait_seconds)
            continue

        print(f"Error for location {location_id}: {response.status_code}")
        return None  # give up on this location/year for any other status


# Collect one frame per (location, year) and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows every time.
yearly_frames = []

location_ids = locations_df['id'].unique()

for location_id in location_ids:
    for year in range(2023, 2025):  # 2023 and 2024
        results = fetch_measurements(location_id, year)
        if results is None:
            continue

        print(len(results))
        if len(results) >= base_params['limit']:
            # Only a single page is requested, so a full page may be incomplete.
            print(
                f"Warning: location {location_id} returned {len(results)} rows for {year}; "
                "results may be truncated at the request limit."
            )

        if results:
            temp_df = pd.DataFrame(results)
            yearly_frames.append(temp_df)
            temp_df.to_csv(os.path.join(output_dir, f"{location_id}-{year}.csv"), index=False)

all_data_df = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

# Save the data (if any data was collected)
if not all_data_df.empty:
    all_data_df.to_csv("openaq_historical_data.csv", index=False)
    print("Historical OpenAQ data saved to openaq_historical_data.csv")
else:
    print("No data found for the specified locations and time period.")