In [17]:
from io import StringIO
import boto3
import pandas as pd
import numpy as np
from datetime import datetime

# Shared S3 client; the helper functions below read it as a module-level global.
s3 = boto3.client("s3")

def list_csv_files(bucket_name, key_path):
    """Return every object key under a prefix whose name ends in ``.csv``.

    Args:
        bucket_name: Name of the S3 bucket to list.
        key_path: Key prefix, passed to ``list_objects_v2`` as ``Prefix``.

    Returns:
        list[str]: Matching keys, in the order the paginator yields them.
    """
    pages = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=key_path
    )
    # A page without a 'Contents' entry simply contributes no keys.
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])
        if entry['Key'].endswith('.csv')
    ]

def read_csv_files_to_dataframes(bucket_name, csv_files):
    """Download each listed S3 object and parse it as a CSV.

    Args:
        bucket_name: Bucket that holds the objects.
        csv_files: Iterable of object keys to fetch.

    Returns:
        list[pandas.DataFrame]: One frame per key, in the order given.
    """
    def _load(object_key):
        # The full body is read into memory and decoded as UTF-8 before parsing.
        response = s3.get_object(Bucket=bucket_name, Key=object_key)
        text = response['Body'].read().decode('utf-8')
        return pd.read_csv(StringIO(text))

    return [_load(object_key) for object_key in csv_files]

def write_csv_to_s3(df, bucket_name, output_key):
    """Serialise a DataFrame to CSV text and upload it as one S3 object.

    ``to_csv`` is called with its defaults, so the index is written as well.

    Args:
        df: DataFrame to serialise.
        bucket_name: Destination bucket.
        output_key: Destination object key.
    """
    # With no target buffer, to_csv returns the CSV content as a string.
    csv_text = df.to_csv()
    s3.put_object(Bucket=bucket_name, Key=output_key, Body=csv_text)
    print(f"Uploaded to s3://{bucket_name}/{output_key}")

In [18]:
# Bucket and prefix holding the transformed outputs.
bucket_name = 'niwa-water-demand-modelling'
key_path = 'TransformedOutputs/Simulation/'

# Keys of every .csv object under the output prefix and under "Simulation/".
target_files = list_csv_files(bucket_name=bucket_name, key_path=key_path)
input_files = list_csv_files(bucket_name=bucket_name, key_path="Simulation/")

In [20]:
# Site folder name (as it appears between slashes in the S3 keys) -> job id.
# NOTE(review): the values look like SageMaker Canvas AutoML job names — confirm.
# Only the keys are used in this cell.
auto_ml_job_dict = {
    'NorthWellingtonMoa': 'Canvas1734649444174',
    'WellingtonLowLevel': 'Canvas1734648978161',
    'Petone': 'Canvas1733434154045',
    'WellingtonHighWestern': 'Canvas1733085655509',
    'WellingtonHighMoa': 'Canvas1733372214860',
    'NorthWellingtonPorirua': 'Canvas1733369877242',
    'Porirua': 'Canvas1733437572452',
    'Wainuiomata': 'Canvas1734649248674',
    'UpperHutt': 'Canvas1734649294393',
    'LowerHutt': 'Canvas1734649384856'
}

# Per-site coverage check: report how many output files exist and which input
# files have no matching output.
for site in auto_ml_job_dict:
    # Match the site as a whole path segment so "Porirua" does not also match
    # "NorthWellingtonPorirua".
    site_segment = f"/{site}/"
    site_inputs = [e for e in input_files if site_segment in e]
    # Drop the leading path component ("TransformedOutputs") so that output
    # keys are directly comparable with input keys.
    site_outputs = ["/".join(e.split("/")[1:]) for e in target_files if site_segment in e]
    # Set membership keeps the comparison linear in the number of files.
    processed = set(site_outputs)
    unfinished = [e for e in site_inputs if e not in processed]
    print(f"{site_segment}: {len(site_outputs)}")
    # find out which input file is not covered
    print(f"{site_segment}: {len(unfinished)} files not processed: {unfinished}")

/NorthWellingtonMoa/: 16
/NorthWellingtonMoa/: 0 files not processed: []
/WellingtonLowLevel/: 16
/WellingtonLowLevel/: 0 files not processed: []
/Petone/: 16
/Petone/: 0 files not processed: []
/WellingtonHighWestern/: 16
/WellingtonHighWestern/: 0 files not processed: []
/WellingtonHighMoa/: 16
/WellingtonHighMoa/: 0 files not processed: []
/NorthWellingtonPorirua/: 16
/NorthWellingtonPorirua/: 0 files not processed: []
/Porirua/: 16
/Porirua/: 0 files not processed: []
/Wainuiomata/: 16
/Wainuiomata/: 0 files not processed: []
/UpperHutt/: 16
/UpperHutt/: 0 files not processed: []
/LowerHutt/: 16
/LowerHutt/: 0 files not processed: []


In [19]:
# Prediction column names, one per site. Each name is also the file stem used
# to locate the site's "<name>.csv" among the input and prediction keys.
y_cols = ['Lower Hutt', 'Petone',
          'Wainuiomata', 'Upper Hutt', 'Porirua', 'Wellington High Moa',
          'Wellington High Western', 'Wellington Low Level',
          'North Wellington Moa', 'North Wellington Porirua']

# find out unique experiments
# The experiment name is taken from the third path component of each
# prediction key.
experiments = np.unique([e.split("/")[2] for e in target_files])

for exp in experiments:
    # Match the experiment as a whole path segment: a plain substring test
    # would also pick up experiments whose name merely starts with this one,
    # making the "exactly one file" check below fail and drop the site.
    exp_segment = f"/{exp}/"
    exp_files = [e for e in target_files if exp_segment in e]
    exp_input_files = [e for e in input_files if exp_segment in e]
    df_list = []
    # process by site name
    for y_col in y_cols:
        site_file = f"/{y_col}.csv"
        # find input file
        input_file = [e for e in exp_input_files if e.endswith(site_file)]
        # find prediction file from this experiment
        target_file = [e for e in exp_files if e.endswith(site_file)]
        if len(input_file) != 1 or len(target_file) != 1:
            # Report the gap instead of silently omitting the column.
            print(
                f"{exp}: skipped {y_col} "
                f"({len(input_file)} input file(s), {len(target_file)} prediction file(s))"
            )
            continue

        df_input = read_csv_files_to_dataframes(bucket_name, input_file)[0]
        df_target = read_csv_files_to_dataframes(bucket_name, target_file)[0]

        # Include "replicate" in the index only when the input actually holds
        # more than one replicate; otherwise index by Date alone.
        has_replicates = (
            "replicate" in df_input.columns
            and len(df_input["replicate"].unique()) > 1
        )
        index_cols = ["replicate", "Date"] if has_replicates else ["Date"]

        # Input and prediction rows are paired purely by position, so a length
        # mismatch would pad one side with NaN.
        if len(df_input) != len(df_target):
            print(
                f"{exp}: WARNING {y_col} has {len(df_input)} input rows "
                f"but {len(df_target)} prediction rows"
            )

        df = pd.concat([df_input[index_cols], df_target[[y_col]]], axis=1)
        df = df.set_index(index_cols)

        df_list.append(df)
        print(f"{exp}: retrieved {y_col} results")

    if not df_list:
        # pd.concat raises on an empty list; there is nothing to upload anyway.
        print(f"{exp}: no site results found, nothing uploaded")
        continue

    df_ps = pd.concat(df_list, axis=1)
    output_key = f"Simulation/results/{exp}_full_results.csv"
    write_csv_to_s3(df_ps, bucket_name, output_key)

Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Lower Hutt results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Petone results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Wainuiomata results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Upper Hutt results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Porirua results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Wellington High Moa results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Wellington High Western results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved Wellington Low Level results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved North Wellington Moa results
Final_HydroClimaticFile_ACCESS-CM2_ssp370: retrieved North Wellington Porirua results
Uploaded to s3://niwa-water-demand-modelling/Simulation/results/Final_HydroClimaticFile_ACCESS-CM2_ssp370_full_results.csv
Final_HydroClimaticFile_AWI-CM-1-1-MR_ssp126: retrieved Lower Hutt results
Final_HydroClimaticFile_AWI-CM-1

In [21]:
# Display the frame left bound by the aggregation loop, i.e. the combined
# results of the last experiment it processed.
df_ps

Unnamed: 0_level_0,Lower Hutt,Petone,Wainuiomata,Upper Hutt,Porirua,Wellington High Moa,Wellington High Western,Wellington Low Level,North Wellington Moa,North Wellington Porirua
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1965-01-01,223.362259,241.161682,197.892685,187.889969,181.872101,156.607376,158.887939,238.805969,167.017532,180.329132
1965-01-02,262.413177,248.868301,233.087173,235.495316,207.572052,207.596512,182.126297,246.366104,199.616638,205.812958
1965-01-03,264.657898,238.075851,233.181351,257.321655,212.355469,242.150406,229.354614,252.990662,208.518906,212.355469
1965-01-04,260.818512,245.203140,215.514084,232.516953,226.947693,184.492950,188.456665,253.301010,197.677444,225.249756
1965-01-05,259.282410,254.030945,208.626541,228.963730,204.717575,310.345459,307.831177,257.110291,194.305084,204.717575
...,...,...,...,...,...,...,...,...,...,...
2099-12-27,268.909424,242.246277,240.231232,233.126999,241.792938,206.896301,218.896057,241.365784,220.588394,230.507919
2099-12-28,223.784332,182.759491,190.394165,201.342834,198.732468,158.191223,171.105820,206.487442,208.858521,204.129700
2099-12-29,216.919052,219.889145,184.104263,191.779404,193.221512,261.582886,264.011261,230.366409,174.090820,188.385239
2099-12-30,237.343109,239.606522,205.736099,197.506958,197.276031,266.393036,261.483093,231.794983,182.687592,195.724686


## Clean up transformed outputs

In [22]:
# Rebinds the global `s3` client (one is already created next to the imports).
s3 = boto3.client('s3')

def delete_files_with_pattern(bucket_name, prefix, suffix, dry_run=False):
    """Delete every object under ``prefix`` whose key ends with ``suffix``.

    Each matching key is printed before it is removed. Note that an empty
    ``suffix`` matches every key under the prefix.

    Args:
        bucket_name: Bucket to clean up.
        prefix: Key prefix, passed to ``list_objects_v2`` as ``Prefix``.
        suffix: Only keys ending with this string are deleted.
        dry_run: When true, print the keys that would be deleted and issue no
            delete requests. Defaults to False (delete for real).

    Returns:
        None
    """
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page without a 'Contents' entry has nothing to delete.
        for content in page.get('Contents', []):
            object_key = content['Key']
            if not object_key.endswith(suffix):
                continue
            if dry_run:
                print(f"Would delete {object_key}")
                continue
            print(f"Deleting {object_key}")
            s3.delete_object(Bucket=bucket_name, Key=object_key)

# Remove every object ending in ".csv.out" under the transformed-output prefix.
bucket_name = 'niwa-water-demand-modelling'
prefix = 'TransformedOutputs/Simulation/'
suffix = '.csv.out'
delete_files_with_pattern(bucket_name=bucket_name, prefix=prefix, suffix=suffix)

Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_0.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_1.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_2.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_3.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_4.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_5.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_6.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_7.csv.out
Deleting TransformedOutputs/Simulation/Final_HydroClimaticFile_ACCESS-CM2_ssp370/LowerHutt/Lower Hutt_8.