In [1]:
# Import statements for packages used...
import os, glob, shutil, sys, requests, json
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

from botocore import UNSIGNED
from botocore.config import Config
from io import StringIO
from datetime import datetime
from types import SimpleNamespace
from IPython.display import clear_output

from io import BytesIO
from tqdm import tqdm
import time
import gzip


In [2]:
def getNoaaData(stationids=['72287493134'], start_year=2016, end_year=2024):
    noaagsod_df = pd.DataFrame()
    filenameNOAA = f"noaa_{start_year}_{end_year}.csv"
    print(filenameNOAA)
    print(f"Get Noaa data for station {stationids}, from {start_year} to {end_year}")
    defaultColumns   = ['DATE','NAME','STATION','LATITUDE','LONGITUDE',
                            'DEWP','WDSP','MAX','MIN','PRCP','TEMP','MXSPD'] 

    #save/load downloaded data 
    if os.path.exists(filenameNOAA):
        print('Loading NOAA GSOD data from local file: ', filenameNOAA)
        noaagsod_df = pd.read_csv(filenameNOAA)
    else:
        noaagsod_bucket = 'noaa-gsod-pds'
        print(f'Accessing and preparing data from ASDI-hosted NOAA GSOD dataset in Amazon S3 (bucket: {noaagsod_bucket})...')
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

        for year in tqdm(range(start_year, end_year+1)):
            for stationid in tqdm(stationids):
                key = f'{year}/{stationid}.csv'                                    
                csv_obj = s3.get_object(Bucket=noaagsod_bucket, Key=key)                                    
                csv_string = csv_obj['Body'].read().decode('utf-8')                                          
                noaagsod_df = pd.concat([noaagsod_df, pd.read_csv(StringIO(csv_string))], ignore_index=True)
        noaagsod_df = noaagsod_df[defaultColumns]
        noaagsod_df.to_csv(filenameNOAA, index=False)
        
    return noaagsod_df


In [3]:
noaa_df = getNoaaData(stationids=['72287493134','72389093193','72389693144','72494693232','72287493134','72278023183','70261026411','41640099999'])

noaa_2016_2024.csv
Get Noaa data for station ['72287493134', '72389093193', '72389693144', '72494693232', '72287493134', '72278023183', '70261026411', '41640099999'], from 2016 to 2024
Loading NOAA GSOD data from local file:  noaa_2016_2024.csv


In [4]:
noaa_df

Unnamed: 0,DATE,NAME,STATION,LATITUDE,LONGITUDE,DEWP,WDSP,MAX,MIN,PRCP,TEMP,MXSPD
0,2016-01-01,"LOS ANGELES DOWNTOWN USC, CA US",72287493134,34.0236,-118.291100,19.1,1.6,64.9,41.0,0.00,52.7,5.1
1,2016-01-02,"LOS ANGELES DOWNTOWN USC, CA US",72287493134,34.0236,-118.291100,20.1,2.1,64.9,43.0,0.00,54.8,4.1
2,2016-01-03,"LOS ANGELES DOWNTOWN USC, CA US",72287493134,34.0236,-118.291100,38.2,0.7,64.9,44.1,0.00,52.6,4.1
3,2016-01-04,"LOS ANGELES DOWNTOWN USC, CA US",72287493134,34.0236,-118.291100,44.1,1.1,69.1,44.1,0.00,59.3,5.1
4,2016-01-05,"LOS ANGELES DOWNTOWN USC, CA US",72287493134,34.0236,-118.291100,51.9,2.1,69.1,54.0,0.01,57.4,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
25449,2024-11-13,"LAHORE CITY, PK",41640099999,31.5500,74.333333,64.0,0.5,76.1,62.6,0.00,68.3,1.9
25450,2024-11-14,"LAHORE CITY, PK",41640099999,31.5500,74.333333,62.8,0.2,71.6,61.7,0.00,65.6,1.9
25451,2024-11-15,"LAHORE CITY, PK",41640099999,31.5500,74.333333,62.3,0.3,71.2,61.2,0.00,65.6,1.9
25452,2024-11-16,"LAHORE CITY, PK",41640099999,31.5500,74.333333,61.6,0.3,71.6,59.9,0.00,65.3,1.9


In [17]:
station_lat_lon = noaa_df.groupby(['STATION','LATITUDE','LONGITUDE']).size()
station_list = station_lat_lon.keys()
for (station_id, lat, lon) in station_list:
    print(station_id, lat, lon)

41640099999 31.55 74.3333333
70261026411 64.80309 -147.87606
72278023183 33.4278 -112.00365
72287493134 34.0236 -118.2911
72389093193 36.77999 -119.72016
72389693144 36.31667 -119.4
72494693232 37.33333 -121.81667


In [7]:
def getAQLocationforStation(station_list, parameters_id=[]):
        aq_locations_url = "https://api.openaq.org/v3/locations"
        headers = {
                'Content-Type': 'application/json',
                'X-API-Key': '455c6342a3e1edba80c628d820d4ccd59d3c218b298cf267e0ec80d05f5f16d3'
        } 
        locations = {}

        for station_id, lat, lon in station_list:
            location = []
            aq_reqParams = {
                'limit': 10,
                'page': 1,
                'offset': 0,
                'sort': 'desc',
                'order_by': 'id',
                'parameters_id': parameters_id,
                'coordinates': f'{lat},{lon}',
                'radius': 16100,
                'isMobile': 'false',
                'sensorType': 'reference grade',
                'dumpRaw': 'false'
            }
            aq_resp = requests.get(aq_locations_url, aq_reqParams, headers=headers)
            aq_data = aq_resp.json() 
            if aq_data['results'] and len(aq_data['results']) >= 1:
                for i in range(0, len(aq_data['results'])):
                    locations[aq_data['results'][i]['id']]=station_id
        print(locations)
        return locations


In [8]:
locations = getAQLocationforStation(station_list, parameters_id=[2]) #make sure has pm25

{8664: 41640099999, 1894641: 41640099999, 1880: 70261026411, 7089: 70261026411, 280: 72278023183, 410: 72278023183, 442: 72278023183, 445: 72278023183, 514: 72278023183, 526: 72278023183, 541: 72278023183, 564: 72278023183, 605: 72278023183, 921006: 72278023183, 1575: 72287493134, 1948: 72287493134, 7936: 72287493134, 290486: 72287493134, 290493: 72287493134, 290494: 72287493134, 290497: 72287493134, 290514: 72287493134, 290516: 72287493134, 290539: 72287493134, 790: 72389093193, 895: 72389093193, 234614: 72389093193, 921008: 72389093193, 2832269: 72389093193, 3009053: 72389093193, 877: 72389693144, 8896: 72389693144, 28692: 72389693144, 288307: 72389693144, 3019000: 72389693144, 2007: 72494693232, 2008: 72494693232, 1662618: 72494693232, 3035039: 72494693232, 3055550: 72494693232, 3274019: 72494693232}


In [9]:
def getQADataFrame(locations, year_start=2016, year_end=2024, country='us'):

    aq_df = pd.DataFrame()
    filenameAQ = f"aq_{year_start}_{year_end}.csv"
    aqColumns = ['station', 'location_id','date','parameter','value']

    if os.path.exists(filenameAQ):
        print('Loading AQ data from local file: ', filenameAQ)
        aq_df = pd.read_csv(filenameAQ)
    else:
        openaq_bucket = 'openaq-data-archive'
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

        location_objs = []
        for locationid in locations.keys():
            for year in range(year_start, year_end+1):
                prefix = f'records/csv.gz/country={country}/locationid={locationid}/year={year}/'
                objs = s3.list_objects(Bucket=openaq_bucket, Prefix=prefix)
                if 'Contents' in objs:
                    print(f"{locationid} on {year} has {len(objs['Contents'])} days' data")
                    location_objs.extend(objs['Contents'])

        for obj in tqdm(location_objs):
            gz_obj = s3.get_object(Bucket=openaq_bucket, Key=obj['Key'])  
            gz_obj_content = gz_obj['Body'].read()
            with gzip.GzipFile(fileobj=BytesIO(gz_obj_content), mode='rb') as f:
                data = f.read()
            aq_df = pd.concat([aq_df, pd.read_csv(StringIO(data.decode('utf-8')))], ignore_index=True)

        #aq_df = pd.concat(aq_list, ignore_index=True)
        #set station by location
        aq_df['station'] = aq_df.apply(lambda x: locations[x['location_id']] , axis=1)
        aq_df['date'] = pd.to_datetime(aq_df['datetime'],utc=True).dt.date # tbd: covert to string in case join with noaa date column
        aq_df = aq_df[aqColumns]
        aq_df.to_csv(filenameAQ, index=False)
        aq_df = pd.read_csv(filenameAQ) # comment this if covert date column to string in case join with noaa date column

    #average parameter of date
    aq_average_values = aq_df.groupby(['station', 'date', 'parameter'])['value'].mean().reset_index()
    
    return aq_average_values


In [10]:
locations={8664: 41640099999,2007: 72494693232}

In [11]:
#noaa_df = getNoaaDataFrame()
aq_df = getQADataFrame(locations, year_start=2021, year_end=2024)
aq_df


Loading AQ data from local file:  aq_2021_2024.csv


Unnamed: 0,station,date,parameter,value
0,70261026411,2021-01-01,co,0.785714
1,70261026411,2021-01-01,o3,-0.002000
2,70261026411,2021-01-01,pm10,65.000000
3,70261026411,2021-01-01,pm25,73.178571
4,70261026411,2021-01-01,so2,0.021929
...,...,...,...,...
16293,72494693232,2023-04-16,no,0.001119
16294,72494693232,2023-04-16,no2,0.005769
16295,72494693232,2023-04-16,nox,0.006894
16296,72494693232,2023-04-16,o3,0.035500


In [15]:
def merge_train_data(noaa_df, aq_df):
    noaa_df = noaa_df.rename(columns={'STATION': 'station', 'DATE': 'date'})
    merged_df = pd.merge(noaa_df, aq_df, how="inner", on=['station', 'date'])
    train_cols = ['DEWP','WDSP','MAX','MIN','PRCP','TEMP','MXSPD','value'] 

    parameters = ['o3', 'pm25', 'pm10', 'co', 'so2', 'no2']
    #parameters = ['pm25','o3'] 
    for parameter in parameters:
        train_df = merged_df[merged_df['parameter']== parameter]
        print(train_df)
        train_df = train_df[train_cols]
        train_df.to_csv(f'train_{parameter}.csv', index=False)
    return merged_df
    
merged_df = merge_train_data(noaa_df, aq_df)
merged_df.to_csv(f'merged_train_data.csv', index=False)

             date                                    NAME      station  \
2      2021-01-01         LOS ANGELES DOWNTOWN USC, CA US  72287493134   
8      2021-01-02         LOS ANGELES DOWNTOWN USC, CA US  72287493134   
14     2021-01-03         LOS ANGELES DOWNTOWN USC, CA US  72287493134   
20     2021-01-04         LOS ANGELES DOWNTOWN USC, CA US  72287493134   
26     2021-01-05         LOS ANGELES DOWNTOWN USC, CA US  72287493134   
...           ...                                     ...          ...   
19442  2023-04-12  FAIRBANKS INTERNATIONAL AIRPORT, AK US  70261026411   
19447  2023-04-13  FAIRBANKS INTERNATIONAL AIRPORT, AK US  70261026411   
19452  2023-04-14  FAIRBANKS INTERNATIONAL AIRPORT, AK US  70261026411   
19457  2023-04-15  FAIRBANKS INTERNATIONAL AIRPORT, AK US  70261026411   
19462  2023-04-16  FAIRBANKS INTERNATIONAL AIRPORT, AK US  70261026411   

       LATITUDE  LONGITUDE  DEWP  WDSP   MAX   MIN  PRCP  TEMP  MXSPD  \
2      34.02360 -118.29110  31.3   0.4