In [1]:
import requests
import json
import pandas as pd

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Construct the payload as described in the API documentation
payload = {
    "Parameters": ["PM10","PM2.5","CO","NH3", "NO","NO2","SO2", "OZONE","TSPd" # Air quality parameter
                   ,"SOLAR","TEMP","SD1","WDR","WSP", "Humid", 'NEPH'],                       
    "Sites": [39],   # List of site IDs
    "StartDate": "2015-4-01",                    # Start date for the query
    "EndDate": "2015-5-01",                      # End date for the query
    "Categories": ["Averages"],                   # Data categories
    "SubCategories": ["Hourly"],                  # Subcategories (Hourly data)
    "Frequency": ["Hourly Average"]  # Frequency
}

# Set the headers for the request, if required
headers = {
    'Content-Type': 'application/json'  # API expects a JSON payload
}

# Make the POST request
response = requests.post(url, headers=headers, data=json.dumps(payload))

# Check if the request was successful
if response.status_code == 200:
    # Parse the response JSON
    data = response.json()
    
    # Convert the JSON data to a DataFrame
    df = pd.DataFrame(data)

else:
    # Print the error if the request failed
    print(f"Error: {response.status_code} - {response.text}")


In [2]:
df.head()

Unnamed: 0,Site_Id,Parameter,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant
0,39,"{'ParameterCode': 'CO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,0.754289,,
1,39,"{'ParameterCode': 'HUMID', 'ParameterDescripti...",2015-04-01,1,12 am - 1 am,94.199,,
2,39,"{'ParameterCode': 'NEPH', 'ParameterDescriptio...",2015-04-01,1,12 am - 1 am,0.232,,
3,39,"{'ParameterCode': 'NO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,6.026604,,
4,39,"{'ParameterCode': 'NO2', 'ParameterDescription...",2015-04-01,1,12 am - 1 am,1.959503,GOOD,


In [3]:
df.tail()

Unnamed: 0,Site_Id,Parameter,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant
10075,39,"{'ParameterCode': 'SO2', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,0.034686,GOOD,
10076,39,"{'ParameterCode': 'SOLAR', 'ParameterDescripti...",2015-04-30,24,11 pm - 12 am,-14.142,,
10077,39,"{'ParameterCode': 'TEMP', 'ParameterDescriptio...",2015-04-30,24,11 pm - 12 am,16.094,,
10078,39,"{'ParameterCode': 'WDR', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,231.69,,
10079,39,"{'ParameterCode': 'WSP', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,0.369,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Site_Id               10080 non-null  int64  
 1   Parameter             10080 non-null  object 
 2   Date                  10080 non-null  object 
 3   Hour                  10080 non-null  int64  
 4   HourDescription       10080 non-null  object 
 5   Value                 9873 non-null   float64
 6   AirQualityCategory    3489 non-null   object 
 7   DeterminingPollutant  0 non-null      object 
dtypes: float64(1), int64(2), object(5)
memory usage: 630.1+ KB


In [8]:
import requests
import json
import pandas as pd
from datetime import datetime, timedelta
import time  # Import the time module for adding sleep

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Define the initial start and end dates
start_date = datetime.strptime("2015-04-01", "%Y-%m-%d")
end_date = start_date + timedelta(days=28)

# Set the number of iterations you want to loop through
num_iterations = 124  # Approximate number of iterations to get to today

# Loop to go through each 28-day period and collect data
for i in range(num_iterations):
    
    # Format the dates as strings for the API request
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    
    # Construct the payload for each 28-day period
    payload = {
        "Parameters": ["PM10", "PM2.5", "CO", "NH3", "NO", "NO2", "SO2", "OZONE", "TSPd",
                      "RAIN", "SOLAR", "TEMP", "SD1", "WDR", "WSP", "Humid", "NEPH"],                       
        "Sites": [39],   # List of site IDs
        "StartDate": start_date_str,  # Update the start date
        "EndDate": end_date_str,      # Update the end date
        "Categories": ["Averages"],   
        "SubCategories": ["Hourly"], 
        "Frequency": ["Hourly Average"]
    }

    # Set the headers for the request, if required
    headers = {
        'Content-Type': 'application/json'
    }

    # Make the POST request
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the response JSON and convert it to a DataFrame
        data = response.json()
        df = pd.DataFrame(data)
        
        # Append the DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    else:
        print(f"Error: {response.status_code} - {response.text}")
    
    # Move forward by 28 days
    start_date = end_date
    end_date = start_date + timedelta(days=28)

    # Sleep for 2 seconds between requests to avoid hitting rate limits
    print(f"Iteration {i + 1}/{num_iterations} completed. Sleeping for 2 seconds.")
    time.sleep(5)  # Pause for 5 seconds between iterations

# After the loop, combined_df contains all the data
print(combined_df.head())  # Display the first few rows of the combined DataFrame


Iteration 1/124 completed. Sleeping for 2 seconds.
Iteration 2/124 completed. Sleeping for 2 seconds.
Iteration 3/124 completed. Sleeping for 2 seconds.
Iteration 4/124 completed. Sleeping for 2 seconds.
Iteration 5/124 completed. Sleeping for 2 seconds.
Iteration 6/124 completed. Sleeping for 2 seconds.
Iteration 7/124 completed. Sleeping for 2 seconds.
Iteration 8/124 completed. Sleeping for 2 seconds.
Iteration 9/124 completed. Sleeping for 2 seconds.
Iteration 10/124 completed. Sleeping for 2 seconds.
Iteration 11/124 completed. Sleeping for 2 seconds.
Iteration 12/124 completed. Sleeping for 2 seconds.
Iteration 13/124 completed. Sleeping for 2 seconds.
Iteration 14/124 completed. Sleeping for 2 seconds.
Iteration 15/124 completed. Sleeping for 2 seconds.
Iteration 16/124 completed. Sleeping for 2 seconds.
Iteration 17/124 completed. Sleeping for 2 seconds.
Iteration 18/124 completed. Sleeping for 2 seconds.
Iteration 19/124 completed. Sleeping for 2 seconds.
Iteration 20/124 comp

In [9]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239840 entries, 0 to 1239839
Data columns (total 8 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   Site_Id               1239840 non-null  int64 
 1   Parameter             1239840 non-null  object
 2   Date                  1239840 non-null  object
 3   Hour                  1239840 non-null  int64 
 4   HourDescription       1239840 non-null  object
 5   Value                 1134607 non-null  object
 6   AirQualityCategory    379592 non-null   object
 7   DeterminingPollutant  0 non-null        object
dtypes: int64(2), object(6)
memory usage: 75.7+ MB


In [10]:
df=combined_df

In [11]:
import pandas as pd

# Check the structure of the 'Parameter' column to ensure it's consistent
print(df['Parameter'].head())  # Check if this is indeed a dictionary or JSON-like

# Step 1: Extract 'ParameterCode' and 'ParameterDescription' from the 'Parameter' column
# We handle cases where 'Parameter' might not be a dictionary
df['ParameterCode'] = df['Parameter'].apply(lambda x: x.get('ParameterCode') if isinstance(x, dict) else None)
df['ParameterDescription'] = df['Parameter'].apply(lambda x: x.get('ParameterDescription') if isinstance(x, dict) else None)

# Step 2: Now pivot the DataFrame to have one row per time observation, with multiple columns
df_wide = df.pivot_table(index=['Site_Id', 'Date', 'Hour', 'HourDescription'],
                         columns='ParameterCode', 
                         values='Value', 
                         aggfunc='first').reset_index()

# Optional: Flatten the MultiIndex columns if necessary
df_wide.columns = [col if not isinstance(col, tuple) else col[1] for col in df_wide.columns]

# Step 3: Display the wide-format DataFrame

0    {'ParameterCode': 'CO', 'ParameterDescription'...
1    {'ParameterCode': 'HUMID', 'ParameterDescripti...
2    {'ParameterCode': 'NEPH', 'ParameterDescriptio...
3    {'ParameterCode': 'NO', 'ParameterDescription'...
4    {'ParameterCode': 'NO2', 'ParameterDescription...
Name: Parameter, dtype: object


In [12]:
df_wide.head()

Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,RAIN,SD1,SO2,SOLAR,TEMP,WDR,WSP
0,39,2015-04-01,1,12 am - 1 am,0.754289,94.199,0.232,6.026604,1.959503,0.0493,13.925,7.381,,108.103,0.096339,-13.13,17.708,10.815,0.13
1,39,2015-04-01,2,1 am - 2 am,,94.513,0.196,,,,18.096,5.359,,59.602,,-12.835,18.285,325.715,0.605
2,39,2015-04-01,3,2 am - 3 am,0.430155,95.974,0.177,1.928971,1.746824,0.129325,3.914,2.542,,77.922,0.041979,-11.332,17.876,270.408,0.26
3,39,2015-04-01,4,3 am - 4 am,0.407481,96.58,0.218,2.555419,1.651218,0.056325,3.088,5.263,,77.073,0.064485,-11.606,17.231,261.068,0.109
4,39,2015-04-01,5,4 am - 5 am,0.369815,97.003,0.198,2.426823,1.576204,0.047325,16.753,9.678,,95.759,0.057291,-11.636,17.277,23.419,0.084


In [13]:
df_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79365 entries, 0 to 79364
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Site_Id          79365 non-null  int64 
 1   Date             79365 non-null  object
 2   Hour             79365 non-null  int64 
 3   HourDescription  79365 non-null  object
 4   CO               74155 non-null  object
 5   HUMID            79266 non-null  object
 6   NEPH             78699 non-null  object
 7   NO               74478 non-null  object
 8   NO2              74479 non-null  object
 9   OZONE            74962 non-null  object
 10  PM10             78351 non-null  object
 11  PM2.5            77521 non-null  object
 12  RAIN             53467 non-null  object
 13  SD1              78864 non-null  object
 14  SO2              74279 non-null  object
 15  SOLAR            79029 non-null  object
 16  TEMP             79331 non-null  object
 17  WDR              78864 non-null

In [14]:
df_wide.head()  # Check for NaNs at the top


Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,RAIN,SD1,SO2,SOLAR,TEMP,WDR,WSP
0,39,2015-04-01,1,12 am - 1 am,0.754289,94.199,0.232,6.026604,1.959503,0.0493,13.925,7.381,,108.103,0.096339,-13.13,17.708,10.815,0.13
1,39,2015-04-01,2,1 am - 2 am,,94.513,0.196,,,,18.096,5.359,,59.602,,-12.835,18.285,325.715,0.605
2,39,2015-04-01,3,2 am - 3 am,0.430155,95.974,0.177,1.928971,1.746824,0.129325,3.914,2.542,,77.922,0.041979,-11.332,17.876,270.408,0.26
3,39,2015-04-01,4,3 am - 4 am,0.407481,96.58,0.218,2.555419,1.651218,0.056325,3.088,5.263,,77.073,0.064485,-11.606,17.231,261.068,0.109
4,39,2015-04-01,5,4 am - 5 am,0.369815,97.003,0.198,2.426823,1.576204,0.047325,16.753,9.678,,95.759,0.057291,-11.636,17.277,23.419,0.084


In [15]:

# Fill NaNs at the edges with forward and backward filling
df_wide.fillna(method='ffill', inplace=True)
df_wide.fillna(method='bfill', inplace=True)

# Then apply interpolation to fill NaNs between rows
df_wide.interpolate(method='linear', axis=0, inplace=True)

# Check the resulting DataFrame
print(df_wide)


       Site_Id        Date  Hour HourDescription        CO   HUMID   NEPH  \
0           39  2015-04-01     1    12 am - 1 am  0.754289  94.199  0.232   
1           39  2015-04-01     2     1 am - 2 am  0.754289  94.513  0.196   
2           39  2015-04-01     3     2 am - 3 am  0.430155  95.974  0.177   
3           39  2015-04-01     4     3 am - 4 am  0.407481  96.580  0.218   
4           39  2015-04-01     5     4 am - 5 am  0.369815  97.003  0.198   
...        ...         ...   ...             ...       ...     ...    ...   
79360       39  2024-10-01    20     7 pm - 8 pm  0.179125  71.596  0.238   
79361       39  2024-10-01    21     8 pm - 9 pm  0.189102  71.044  0.253   
79362       39  2024-10-01    22    9 pm - 10 pm  0.153507  66.722  0.203   
79363       39  2024-10-01    23   10 pm - 11 pm  0.138774  65.168  0.183   
79364       39  2024-10-01    24   11 pm - 12 am  0.144522  65.136  0.175   

             NO       NO2     OZONE    PM10   PM2.5  RAIN      SD1       SO

In [16]:
df_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79365 entries, 0 to 79364
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Site_Id          79365 non-null  int64  
 1   Date             79365 non-null  object 
 2   Hour             79365 non-null  int64  
 3   HourDescription  79365 non-null  object 
 4   CO               79365 non-null  float64
 5   HUMID            79365 non-null  float64
 6   NEPH             79365 non-null  float64
 7   NO               79365 non-null  float64
 8   NO2              79365 non-null  float64
 9   OZONE            79365 non-null  float64
 10  PM10             79365 non-null  float64
 11  PM2.5            79365 non-null  float64
 12  RAIN             79365 non-null  float64
 13  SD1              79365 non-null  float64
 14  SO2              79365 non-null  float64
 15  SOLAR            79365 non-null  float64
 16  TEMP             79365 non-null  float64
 17  WDR         

In [17]:
df_wide.head()

Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,RAIN,SD1,SO2,SOLAR,TEMP,WDR,WSP
0,39,2015-04-01,1,12 am - 1 am,0.754289,94.199,0.232,6.026604,1.959503,0.0493,13.925,7.381,0.4,108.103,0.096339,-13.13,17.708,10.815,0.13
1,39,2015-04-01,2,1 am - 2 am,0.754289,94.513,0.196,6.026604,1.959503,0.0493,18.096,5.359,0.4,59.602,0.096339,-12.835,18.285,325.715,0.605
2,39,2015-04-01,3,2 am - 3 am,0.430155,95.974,0.177,1.928971,1.746824,0.129325,3.914,2.542,0.4,77.922,0.041979,-11.332,17.876,270.408,0.26
3,39,2015-04-01,4,3 am - 4 am,0.407481,96.58,0.218,2.555419,1.651218,0.056325,3.088,5.263,0.4,77.073,0.064485,-11.606,17.231,261.068,0.109
4,39,2015-04-01,5,4 am - 5 am,0.369815,97.003,0.198,2.426823,1.576204,0.047325,16.753,9.678,0.4,95.759,0.057291,-11.636,17.277,23.419,0.084


In [18]:
df_wide['datetime'] = pd.to_datetime(df_wide['Date']) + pd.to_timedelta(df_wide['Hour'], unit='h')

# Set 'datetime' as the index
df_wide.set_index('datetime', inplace=True)
df_wide.head()

Unnamed: 0_level_0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,RAIN,SD1,SO2,SOLAR,TEMP,WDR,WSP
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-04-01 01:00:00,39,2015-04-01,1,12 am - 1 am,0.754289,94.199,0.232,6.026604,1.959503,0.0493,13.925,7.381,0.4,108.103,0.096339,-13.13,17.708,10.815,0.13
2015-04-01 02:00:00,39,2015-04-01,2,1 am - 2 am,0.754289,94.513,0.196,6.026604,1.959503,0.0493,18.096,5.359,0.4,59.602,0.096339,-12.835,18.285,325.715,0.605
2015-04-01 03:00:00,39,2015-04-01,3,2 am - 3 am,0.430155,95.974,0.177,1.928971,1.746824,0.129325,3.914,2.542,0.4,77.922,0.041979,-11.332,17.876,270.408,0.26
2015-04-01 04:00:00,39,2015-04-01,4,3 am - 4 am,0.407481,96.58,0.218,2.555419,1.651218,0.056325,3.088,5.263,0.4,77.073,0.064485,-11.606,17.231,261.068,0.109
2015-04-01 05:00:00,39,2015-04-01,5,4 am - 5 am,0.369815,97.003,0.198,2.426823,1.576204,0.047325,16.753,9.678,0.4,95.759,0.057291,-11.636,17.277,23.419,0.084
