In [48]:
import requests
import json
import pandas as pd

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Construct the payload as described in the API documentation
payload = {
    "Parameters": ["PM10","PM2.5","CO","NH3", "NO","NO2","SO2", "OZONE","TSPd" # Air quality parameter
                   ,"SOLAR","TEMP","SD1","WDR","WSP", "Humid", 'NEPH'],                       
    "Sites": [39],   # List of site IDs
    "StartDate": "2015-4-01",                    # Start date for the query
    "EndDate": "2015-5-01",                      # End date for the query
    "Categories": ["Averages"],                   # Data categories
    "SubCategories": ["Hourly"],                  # Subcategories (Hourly data)
    "Frequency": ["Hourly Average"]  # Frequency
}

# Set the headers for the request, if required
headers = {
    'Content-Type': 'application/json'  # API expects a JSON payload
}

# Make the POST request
response = requests.post(url, headers=headers, data=json.dumps(payload))

# Check if the request was successful
if response.status_code == 200:
    # Parse the response JSON
    data = response.json()
    
    # Convert the JSON data to a DataFrame
    df = pd.DataFrame(data)

else:
    # Print the error if the request failed
    print(f"Error: {response.status_code} - {response.text}")


In [49]:
df.head()

Unnamed: 0,Site_Id,Parameter,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant
0,39,"{'ParameterCode': 'CO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,0.754289,,
1,39,"{'ParameterCode': 'HUMID', 'ParameterDescripti...",2015-04-01,1,12 am - 1 am,94.199,,
2,39,"{'ParameterCode': 'NEPH', 'ParameterDescriptio...",2015-04-01,1,12 am - 1 am,0.232,,
3,39,"{'ParameterCode': 'NO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,6.026604,,
4,39,"{'ParameterCode': 'NO2', 'ParameterDescription...",2015-04-01,1,12 am - 1 am,1.959503,GOOD,


In [50]:
df.tail()

Unnamed: 0,Site_Id,Parameter,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant
10075,39,"{'ParameterCode': 'SO2', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,0.034686,GOOD,
10076,39,"{'ParameterCode': 'SOLAR', 'ParameterDescripti...",2015-04-30,24,11 pm - 12 am,-14.142,,
10077,39,"{'ParameterCode': 'TEMP', 'ParameterDescriptio...",2015-04-30,24,11 pm - 12 am,16.094,,
10078,39,"{'ParameterCode': 'WDR', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,231.69,,
10079,39,"{'ParameterCode': 'WSP', 'ParameterDescription...",2015-04-30,24,11 pm - 12 am,0.369,,


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Site_Id               10080 non-null  int64  
 1   Parameter             10080 non-null  object 
 2   Date                  10080 non-null  object 
 3   Hour                  10080 non-null  int64  
 4   HourDescription       10080 non-null  object 
 5   Value                 9873 non-null   float64
 6   AirQualityCategory    3489 non-null   object 
 7   DeterminingPollutant  0 non-null      object 
dtypes: float64(1), int64(2), object(5)
memory usage: 630.1+ KB


0    {'ParameterCode': 'CO', 'ParameterDescription'...
1    {'ParameterCode': 'HUMID', 'ParameterDescripti...
2    {'ParameterCode': 'NEPH', 'ParameterDescriptio...
3    {'ParameterCode': 'NO', 'ParameterDescription'...
4    {'ParameterCode': 'NO2', 'ParameterDescription...
Name: Parameter, dtype: object


Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,SD1,SO2,SOLAR,TEMP,WDR,WSP
0,39,2015-10-03,1,12 am - 1 am,0.232861,87.608,0.217,-0.073341,0.695523,2.071075,14.482,7.103,32.864,0.034228,-10.441,15.913,298.11,0.571
1,39,2015-10-03,2,1 am - 2 am,,90.634,0.203,,,,12.61,6.077,54.57,,-10.192,15.247,288.102,0.434
2,39,2015-10-03,3,2 am - 3 am,0.268038,92.911,0.224,-0.035312,1.216225,0.96485,13.69,5.43,54.848,0.033287,-10.892,14.857,277.566,0.367
3,39,2015-10-03,4,3 am - 4 am,0.353759,93.283,0.275,0.314788,2.326032,0.12965,15.225,5.871,31.098,0.051825,-12.08,14.43,289.752,0.567
4,39,2015-10-03,5,4 am - 5 am,0.41682,93.831,0.287,1.175953,2.332475,0.08135,16.771,6.264,37.532,0.055412,-12.841,13.961,282.367,0.545


In [44]:
df_wide.tail()


Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,SD1,SO2,SOLAR,TEMP,WDR,WSP
19,39,2015-10-03,20,7 pm - 8 pm,0.435917,67.097,0.556,-0.022233,3.045342,1.80875,39.33,18.153,42.619,0.2363,-12.487,22.583,60.898,1.235
20,39,2015-10-03,21,8 pm - 9 pm,0.435524,65.687,0.594,-0.035815,3.182338,1.66195,38.719,25.714,43.861,0.246665,-12.295,22.417,6.334,0.662
21,39,2015-10-03,22,9 pm - 10 pm,0.436738,65.281,0.538,-0.02173,3.478723,1.20245,33.41,28.064,47.88,0.218261,-11.685,21.499,330.908,0.49
22,39,2015-10-03,23,10 pm - 11 pm,0.531161,69.644,0.581,0.021328,3.846828,0.55065,32.374,23.957,32.095,0.183578,-11.571,20.298,348.641,0.525
23,39,2015-10-03,24,11 pm - 12 am,0.497638,59.766,0.45,0.014286,3.373414,0.62995,21.835,22.615,24.429,0.124977,-11.449,20.027,332.511,0.543


In [45]:
df_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Site_Id          24 non-null     int64  
 1   Date             24 non-null     object 
 2   Hour             24 non-null     int64  
 3   HourDescription  24 non-null     object 
 4   CO               23 non-null     float64
 5   HUMID            24 non-null     float64
 6   NEPH             24 non-null     float64
 7   NO               23 non-null     float64
 8   NO2              23 non-null     float64
 9   OZONE            23 non-null     float64
 10  PM10             24 non-null     float64
 11  PM2.5            24 non-null     float64
 12  SD1              24 non-null     float64
 13  SO2              23 non-null     float64
 14  SOLAR            24 non-null     float64
 15  TEMP             24 non-null     float64
 16  WDR              24 non-null     float64
 17  WSP              2

In [52]:
import requests
import json
import pandas as pd
from datetime import datetime, timedelta
import time  # Import the time module for adding sleep

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Define the initial start and end dates
start_date = datetime.strptime("2015-04-01", "%Y-%m-%d")
end_date = start_date + timedelta(days=28)

# Set the number of iterations you want to loop through
num_iterations = 124  # Approximate number of iterations to get to today

# Loop to go through each 28-day period and collect data
for i in range(num_iterations):
    
    # Format the dates as strings for the API request
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    
    # Construct the payload for each 28-day period
    payload = {
        "Parameters": ["PM10", "PM2.5", "CO", "NH3", "NO", "NO2", "SO2", "OZONE", "TSPd",
                       "SOLAR", "TEMP", "SD1", "WDR", "WSP", "Humid", "NEPH"],                       
        "Sites": [39],   # List of site IDs
        "StartDate": start_date_str,  # Update the start date
        "EndDate": end_date_str,      # Update the end date
        "Categories": ["Averages"],   
        "SubCategories": ["Hourly"], 
        "Frequency": ["Hourly Average"]
    }

    # Set the headers for the request, if required
    headers = {
        'Content-Type': 'application/json'
    }

    # Make the POST request
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the response JSON and convert it to a DataFrame
        data = response.json()
        df = pd.DataFrame(data)
        
        # Append the DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    else:
        print(f"Error: {response.status_code} - {response.text}")
    
    # Move forward by 28 days
    start_date = end_date
    end_date = start_date + timedelta(days=28)

    # Sleep for 2 seconds between requests to avoid hitting rate limits
    print(f"Iteration {i + 1}/{num_iterations} completed. Sleeping for 2 seconds.")
    time.sleep(5)  # Pause for 5 seconds between iterations

# After the loop, combined_df contains all the data
print(combined_df.head())  # Display the first few rows of the combined DataFrame


Iteration 1/124 completed. Sleeping for 2 seconds.
Iteration 2/124 completed. Sleeping for 2 seconds.
Iteration 3/124 completed. Sleeping for 2 seconds.
Iteration 4/124 completed. Sleeping for 2 seconds.
Iteration 5/124 completed. Sleeping for 2 seconds.
Iteration 6/124 completed. Sleeping for 2 seconds.
Iteration 7/124 completed. Sleeping for 2 seconds.
Iteration 8/124 completed. Sleeping for 2 seconds.
Iteration 9/124 completed. Sleeping for 2 seconds.
Iteration 10/124 completed. Sleeping for 2 seconds.
Error: 502 - The specified CGI application encountered an error and the server terminated the process.
Iteration 11/124 completed. Sleeping for 2 seconds.
Iteration 12/124 completed. Sleeping for 2 seconds.
Iteration 13/124 completed. Sleeping for 2 seconds.
Iteration 14/124 completed. Sleeping for 2 seconds.
Iteration 15/124 completed. Sleeping for 2 seconds.
Iteration 16/124 completed. Sleeping for 2 seconds.
Iteration 17/124 completed. Sleeping for 2 seconds.
Iteration 18/124 compl

In [59]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157184 entries, 0 to 1157183
Data columns (total 8 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   Site_Id               1157184 non-null  int64 
 1   Parameter             1157184 non-null  object
 2   Date                  1157184 non-null  object
 3   Hour                  1157184 non-null  int64 
 4   HourDescription       1157184 non-null  object
 5   Value                 1081165 non-null  object
 6   AirQualityCategory    379620 non-null   object
 7   DeterminingPollutant  0 non-null        object
dtypes: int64(2), object(6)
memory usage: 70.6+ MB


In [60]:
df=combined_df

In [61]:
import pandas as pd

# Check the structure of the 'Parameter' column to ensure it's consistent
print(df['Parameter'].head())  # Check if this is indeed a dictionary or JSON-like

# Step 1: Extract 'ParameterCode' and 'ParameterDescription' from the 'Parameter' column
# We handle cases where 'Parameter' might not be a dictionary
df['ParameterCode'] = df['Parameter'].apply(lambda x: x.get('ParameterCode') if isinstance(x, dict) else None)
df['ParameterDescription'] = df['Parameter'].apply(lambda x: x.get('ParameterDescription') if isinstance(x, dict) else None)

# Step 2: Now pivot the DataFrame to have one row per time observation, with multiple columns
df_wide = df.pivot_table(index=['Site_Id', 'Date', 'Hour', 'HourDescription'],
                         columns='ParameterCode', 
                         values='Value', 
                         aggfunc='first').reset_index()

# Optional: Flatten the MultiIndex columns if necessary
df_wide.columns = [col if not isinstance(col, tuple) else col[1] for col in df_wide.columns]

# Step 3: Display the wide-format DataFrame

0    {'ParameterCode': 'CO', 'ParameterDescription'...
1    {'ParameterCode': 'HUMID', 'ParameterDescripti...
2    {'ParameterCode': 'NEPH', 'ParameterDescriptio...
3    {'ParameterCode': 'NO', 'ParameterDescription'...
4    {'ParameterCode': 'NO2', 'ParameterDescription...
Name: Parameter, dtype: object


In [62]:
df_wide.head()

Unnamed: 0,Site_Id,Date,Hour,HourDescription,CO,HUMID,NEPH,NO,NO2,OZONE,PM10,PM2.5,SD1,SO2,SOLAR,TEMP,WDR,WSP
0,39,2015-04-01,1,12 am - 1 am,0.754289,94.199,0.232,6.026604,1.959503,0.0493,13.925,7.381,108.103,0.096339,-13.13,17.708,10.815,0.13
1,39,2015-04-01,2,1 am - 2 am,,94.513,0.196,,,,18.096,5.359,59.602,,-12.835,18.285,325.715,0.605
2,39,2015-04-01,3,2 am - 3 am,0.430155,95.974,0.177,1.928971,1.746824,0.129325,3.914,2.542,77.922,0.041979,-11.332,17.876,270.408,0.26
3,39,2015-04-01,4,3 am - 4 am,0.407481,96.58,0.218,2.555419,1.651218,0.056325,3.088,5.263,77.073,0.064485,-11.606,17.231,261.068,0.109
4,39,2015-04-01,5,4 am - 5 am,0.369815,97.003,0.198,2.426823,1.576204,0.047325,16.753,9.678,95.759,0.057291,-11.636,17.277,23.419,0.084


In [63]:
df_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79365 entries, 0 to 79364
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Site_Id          79365 non-null  int64 
 1   Date             79365 non-null  object
 2   Hour             79365 non-null  int64 
 3   HourDescription  79365 non-null  object
 4   CO               74154 non-null  object
 5   HUMID            79266 non-null  object
 6   NEPH             78697 non-null  object
 7   NO               74478 non-null  object
 8   NO2              74479 non-null  object
 9   OZONE            74961 non-null  object
 10  PM10             78352 non-null  object
 11  PM2.5            77550 non-null  object
 12  SD1              78864 non-null  object
 13  SO2              74278 non-null  object
 14  SOLAR            79029 non-null  object
 15  TEMP             79331 non-null  object
 16  WDR              78864 non-null  object
 17  WSP              78862 non-null