In [1]:
from io import StringIO
import boto3
import pandas as pd
import numpy as np
from datetime import datetime

# Module-level S3 client; the helper functions defined below reach it through this global name.
s3 = boto3.client("s3")

def list_csv_files(bucket_name, key_path):
    """Return the keys of all ``.csv`` objects found under a key prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        key_path: Key prefix handed to the ``list_objects_v2`` paginator.

    Returns:
        list[str]: Matching object keys, in the order the paginator yields them.
    """
    pages = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=key_path
    )
    # Pages without a 'Contents' entry simply contribute nothing.
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])
        if entry['Key'].endswith('.csv')
    ]

def read_csv_files_to_dataframes(bucket_name, csv_files):
    """Fetch each listed CSV object from S3 and parse it into a DataFrame.

    Args:
        bucket_name: Name of the S3 bucket holding the objects.
        csv_files: Iterable of object keys to download.

    Returns:
        list[pandas.DataFrame]: One frame per key, in the same order as ``csv_files``.
    """
    def _load(object_key):
        # The full object body is read into memory and decoded as UTF-8 before parsing.
        response = s3.get_object(Bucket=bucket_name, Key=object_key)
        text = response['Body'].read().decode('utf-8')
        return pd.read_csv(StringIO(text))

    return [_load(object_key) for object_key in csv_files]

def write_csv_to_s3(df, bucket_name, output_key):
    """Serialise a DataFrame to CSV text and upload it as a single S3 object.

    ``to_csv`` is called with its defaults, so the frame's index is written too.

    Args:
        df: DataFrame to upload.
        bucket_name: Destination S3 bucket.
        output_key: Object key to write the CSV under.
    """
    # With no target given, to_csv returns the CSV text rather than writing it anywhere.
    csv_text = df.to_csv()
    s3.put_object(Bucket=bucket_name, Key=output_key, Body=csv_text)
    destination = f"s3://{bucket_name}/{output_key}"
    print(f"Uploaded to {destination}")

In [2]:
# Both listings below are taken from this one bucket.
bucket_name = 'niwa-water-demand-modelling'
key_path = 'TransformedOutputs/InferenceData/'
# The prediction keys (under key_path) are listed first, then the input keys.
target_files, input_files = (
    list_csv_files(bucket_name, listing_prefix)
    for listing_prefix in (key_path, "InferenceData/")
)

In [3]:
# Site folder name -> job identifier (presumably the Canvas AutoML job for that site).
# Only the keys are used by the coverage check below.
auto_ml_job_dict = {
    'NorthWellingtonMoa': 'Canvas1734649444174',
    'WellingtonLowLevel': 'Canvas1734648978161',
    'Petone': 'Canvas1733434154045',
    'WellingtonHighWestern': 'Canvas1733085655509',
    'WellingtonHighMoa': 'Canvas1733372214860',
    'NorthWellingtonPorirua': 'Canvas1733369877242',
    'Porirua': 'Canvas1733437572452',
    'Wainuiomata': 'Canvas1734649248674',
    'UpperHutt': 'Canvas1734649294393',
    'LowerHutt': 'Canvas1734649384856'
}

# For every site folder, report how many prediction files exist and which input
# files have no prediction file at the same path (first path component ignored).
for site in auto_ml_job_dict:
    # The surrounding slashes restrict the match to a whole folder name.
    key = f"/{site}/"
    key_inputs = [path for path in input_files if key in path]
    # Drop the first path component so prediction keys line up with input keys.
    key_files = [path.split("/", 1)[1] for path in target_files if key in path]
    unfinished = [path for path in key_inputs if path not in key_files]
    print(f"{key}: {len(key_files)}")
    # find out which input file is not covered
    print(f"{key}: {len(unfinished)} files not processed: {unfinished}")

/NorthWellingtonMoa/: 1
/NorthWellingtonMoa/: 0 files not processed: []
/WellingtonLowLevel/: 1
/WellingtonLowLevel/: 0 files not processed: []
/Petone/: 1
/Petone/: 0 files not processed: []
/WellingtonHighWestern/: 1
/WellingtonHighWestern/: 0 files not processed: []
/WellingtonHighMoa/: 1
/WellingtonHighMoa/: 0 files not processed: []
/NorthWellingtonPorirua/: 1
/NorthWellingtonPorirua/: 0 files not processed: []
/Porirua/: 1
/Porirua/: 0 files not processed: []
/Wainuiomata/: 1
/Wainuiomata/: 0 files not processed: []
/UpperHutt/: 1
/UpperHutt/: 0 files not processed: []
/LowerHutt/: 1
/LowerHutt/: 0 files not processed: []


In [4]:
# Show the prediction CSV keys that were listed under key_path.
target_files

['TransformedOutputs/InferenceData/LowerHutt/Lower Hutt.csv',
 'TransformedOutputs/InferenceData/NorthWellingtonMoa/North Wellington Moa.csv',
 'TransformedOutputs/InferenceData/NorthWellingtonPorirua/North Wellington Porirua.csv',
 'TransformedOutputs/InferenceData/Petone/Petone.csv',
 'TransformedOutputs/InferenceData/Porirua/Porirua.csv',
 'TransformedOutputs/InferenceData/UpperHutt/Upper Hutt.csv',
 'TransformedOutputs/InferenceData/Wainuiomata/Wainuiomata.csv',
 'TransformedOutputs/InferenceData/WellingtonHighMoa/Wellington High Moa.csv',
 'TransformedOutputs/InferenceData/WellingtonHighWestern/Wellington High Western.csv',
 'TransformedOutputs/InferenceData/WellingtonLowLevel/Wellington Low Level.csv']

In [5]:
# Show the input CSV keys that were listed under "InferenceData/".
input_files

['InferenceData/LowerHutt/Lower Hutt.csv',
 'InferenceData/NorthWellingtonMoa/North Wellington Moa.csv',
 'InferenceData/NorthWellingtonPorirua/North Wellington Porirua.csv',
 'InferenceData/Petone/Petone.csv',
 'InferenceData/Porirua/Porirua.csv',
 'InferenceData/UpperHutt/Upper Hutt.csv',
 'InferenceData/Wainuiomata/Wainuiomata.csv',
 'InferenceData/WellingtonHighMoa/Wellington High Moa.csv',
 'InferenceData/WellingtonHighWestern/Wellington High Western.csv',
 'InferenceData/WellingtonLowLevel/Wellington Low Level.csv']

In [8]:
# Site names; each one is both the CSV file name and the prediction column name.
y_cols = ['Lower Hutt', 'Petone',
       'Wainuiomata', 'Upper Hutt', 'Porirua', 'Wellington High Moa',
       'Wellington High Western', 'Wellington Low Level',
       'North Wellington Moa', 'North Wellington Porirua']

# find out unique experiments
experiments = ["InferenceData"]


def _build_site_frame(df_input, df_target, y_col):
    """Attach one site's prediction column to the Date (and replicate) of its input.

    Both frames are expected to carry the default positional index that
    ``pd.read_csv`` produces, so the column-wise concat pairs rows by position.

    Args:
        df_input: Input frame; must contain "Date" and may contain "replicate".
        df_target: Prediction frame; must contain the ``y_col`` column.
        y_col: Name of the prediction column to keep.

    Returns:
        pandas.DataFrame: Single-column frame indexed by ("replicate", "Date")
        when the input holds more than one replicate, otherwise by "Date".
    """
    # Index by replicate only when the input actually holds more than one.
    multi_replicate = (
        "replicate" in df_input.columns
        and len(df_input["replicate"].unique()) > 1
    )
    id_cols = ["replicate", "Date"] if multi_replicate else ["Date"]
    df = pd.concat([df_input[id_cols], df_target[[y_col]]], axis=1)
    return df.set_index(id_cols)


for exp in experiments:
    exp_files = [e for e in target_files if exp in e]
    exp_input_files = [e for e in input_files if exp in e]
    df_list = []
    # process by site name
    for y_col in y_cols:
        file_name = f"/{y_col}.csv"
        # find input file
        input_file = [e for e in exp_input_files if file_name in e]
        # find prediction file from this experiment
        target_file = [e for e in exp_files if file_name in e]
        if len(input_file) != 1 or len(target_file) != 1:
            # Report the skip so a missing or ambiguous site cannot go unnoticed.
            print(
                f"{exp}: skipped {y_col} (found {len(input_file)} input and "
                f"{len(target_file)} prediction files, expected exactly 1 of each)"
            )
            continue
        df_input = read_csv_files_to_dataframes(bucket_name, input_file)[0]
        df_target = read_csv_files_to_dataframes(bucket_name, target_file)[0]
        if len(df_input) != len(df_target):
            # Rows are paired by position, so a length mismatch leaves NaN rows.
            print(
                f"{exp}: WARNING {y_col} input has {len(df_input)} rows but "
                f"predictions have {len(df_target)} rows"
            )
        df = _build_site_frame(df_input, df_target, y_col)
        df_list.append(df)
        print(f"{exp}: retrieved {y_col} results")
    if not df_list:
        # pd.concat raises an opaque ValueError on an empty list; fail with context.
        raise ValueError(f"{exp}: no site results were retrieved, nothing to write")
    df_ps = pd.concat(df_list, axis=1)
    # NOTE: the key does not depend on exp, so every experiment writes to the same object.
    output_key = "InferenceResults/full_results.csv"
    write_csv_to_s3(df_ps, bucket_name, output_key)

InferenceData: retrieved Lower Hutt results
InferenceData: retrieved Petone results
InferenceData: retrieved Wainuiomata results
InferenceData: retrieved Upper Hutt results
InferenceData: retrieved Porirua results
InferenceData: retrieved Wellington High Moa results
InferenceData: retrieved Wellington High Western results
InferenceData: retrieved Wellington Low Level results
InferenceData: retrieved North Wellington Moa results
InferenceData: retrieved North Wellington Porirua results
Uploaded to s3://niwa-water-demand-modelling/InferenceResults/full_results.csv


In [7]:
# Display the combined per-site results frame that was written to S3.
df_ps

Unnamed: 0_level_0,Lower Hutt,Petone,Wainuiomata,Upper Hutt,Porirua,Wellington High Moa,Wellington High Western,Wellington Low Level,North Wellington Moa,North Wellington Porirua
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
01/09/2024,223.154755,230.896225,186.004868,168.090225,185.531143,214.083084,220.229645,244.450546,203.763916,184.828232
02/09/2024,212.598816,240.208755,167.502060,176.108932,171.483170,243.984985,247.212601,245.661499,167.810196,169.211243
03/09/2024,212.135498,225.363312,155.973129,170.592972,169.509460,258.360657,253.196136,242.971024,171.700806,163.283264
04/09/2024,210.835983,239.748505,161.773651,162.360367,165.709839,234.812683,243.767883,224.805420,168.479919,163.106964
05/09/2024,226.159409,259.387268,187.275940,177.860153,182.298996,264.547028,254.227249,252.171722,190.366013,182.467621
...,...,...,...,...,...,...,...,...,...,...
21/01/2025,263.511108,311.682831,233.051254,245.018433,231.719528,336.512543,330.396545,296.239227,221.370605,232.707108
22/01/2025,267.169983,282.588226,237.012543,248.110031,247.137253,360.697296,357.754181,300.779572,219.127457,247.080536
23/01/2025,256.430023,289.305115,204.114380,245.063766,196.977875,302.770599,325.132568,273.458557,172.353546,198.517197
24/01/2025,232.982010,283.266846,189.443085,214.084869,196.132446,280.145111,277.262085,262.586609,159.820160,198.681244


## clean-up transformed outputs

In [22]:
# Rebinds the global S3 client; an identically configured client is already created in the first cell.
s3 = boto3.client('s3')

def delete_files_with_pattern(bucket_name, prefix, suffix, dry_run=False):
    """Delete every object under a prefix whose key ends with ``suffix``.

    One ``delete_object`` request is issued per matching key, and each key is
    printed as it is processed.

    Args:
        bucket_name: S3 bucket to clean up.
        prefix: Key prefix that limits the listing.
        suffix: Only keys ending with this string are affected. An empty
            string matches every key under ``prefix``.
        dry_run: When True, only print the keys that would be deleted and send
            no delete requests. Defaults to False, which deletes for real.

    Returns:
        None
    """
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without a 'Contents' entry hold nothing to delete.
        for content in page.get('Contents', []):
            key = content['Key']
            if not key.endswith(suffix):
                continue
            if dry_run:
                print(f"Would delete {key}")
                continue
            print(f"Deleting {key}")
            s3.delete_object(Bucket=bucket_name, Key=key)

# Remove the '.csv.out' objects stored under the Simulation output prefix.
# Running this cell sends real delete requests to S3.
bucket_name = 'niwa-water-demand-modelling'
prefix, suffix = 'TransformedOutputs/Simulation/', '.csv.out'
delete_files_with_pattern(bucket_name=bucket_name, prefix=prefix, suffix=suffix)

Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_0.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_1.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_2.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_3.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_4.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_5.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_6.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_7.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_8.