In [None]:
from climakitae.explore.vulnerability import cava_data
from climakitae.explore.vulnerability_table import create_vul_table

import pandas as pd
import os
import glob
import logging
import shutil
import subprocess
from multiprocessing import Pool

# Name of the compute instance, read from the instance metadata endpoint by
# shelling out to curl. The raw response is kept as-is; process_file strips
# it before using it in the log file name.
instance_name = os.popen('curl -H "Metadata-Flavor: Google" http://metadata/computeMetadata/v1/instance/name').read()
print("instance name:",instance_name)

# Location CSVs to process: every file in the current working directory whose
# name matches *lab*.csv. This is computed unconditionally (not only when
# __name__ == '__main__') because the statements below read _loc_files
# regardless of how the module was loaded; guarding only the assignment
# raised NameError whenever the module was imported rather than run.
_dir = os.getcwd()
_loc_files = glob.glob(_dir + '/*lab*.csv')
print(_loc_files)

# One worker process per location file.
num_processes = len(_loc_files)
print('number of parallel processes:',num_processes)

# set up location for output files to be moved
BUCKET_NAME="analyticsengine"
OUTPUT_FOLDER="NetCDF_Output/1_0WL"

# Custom function to check for match based on pattern for file names
def custom_match(search_string, file_name):
    """Return True if file_name contains search_string, treating '*' as a wildcard.

    The pattern is split on '*' and the resulting literal pieces must appear
    in file_name in the same order, without overlapping one another. The
    match is not anchored: text before the first piece and after the last
    piece is allowed, so a bare file-name pattern still matches a full
    ``gs://bucket/folder/name.nc`` path.

    Parameters:
        search_string: pattern made of literal text separated by '*'.
        file_name: string to test (a file name or a full object path).

    Returns:
        bool: True when every literal piece is found in order, else False.
    """
    position = 0
    for part in search_string.split('*'):
        found_at = file_name.find(part, position)
        if found_at == -1:
            return False
        # Continue searching after this piece so later pieces cannot match
        # earlier in the name or reuse the same characters.
        position = found_at + len(part)
    return True

# List the output files already present in the bucket so that locations
# handled by an earlier run can be skipped.
search_string = 'one_in_100_1_day_precipitation_1degreeWL_*.nc'
result = subprocess.run(
    ["gsutil", "ls", f"gs://{BUCKET_NAME}/{OUTPUT_FOLDER}/{search_string}"],
    capture_output=True,
    text=True,
)

if result.returncode == 0:
    # One object URL per line; drop blank lines (stdout ends with a newline).
    files = [line for line in result.stdout.splitlines() if line.strip()]
else:
    # Always bind `files`: a failed listing (gsutil also reports an error when
    # the pattern matches no objects, e.g. on a first run) must not leave the
    # name undefined for the code that reads it later. Treat it as "nothing
    # has been run yet".
    files = []
    print("Error running gsutil ls command:", result.stderr)
print('locations that have already been run:')
print(files)

In [None]:

def process_file(file, csv_file_name):
    """Run the 1-in-100 daily precipitation calculation for each row of a location CSV.

    For every row, the already-uploaded outputs listed in the module-level
    `files` are checked for a name matching the row's lat/lon. Rows with a
    match are skipped; otherwise `cava_data` is called for that single row
    and the NetCDF file it writes to the working directory is moved to
    gs://BUCKET_NAME/OUTPUT_FOLDER with `gsutil mv`.

    Parameters:
        file: path of the CSV to read; the columns 'lat', 'lon' and
            'gridcode' are used.
        csv_file_name: label for this CSV, used in the log file name and in
            log messages.

    Returns:
        None. Progress and failures are written to the log.
    """
    df = pd.read_csv(file)

    # create a log file
    # NOTE(review): logging.basicConfig is a no-op when the root logger already
    # has handlers, so if one process handles several CSVs the later ones log
    # into the first CSV's file -- confirm whether that matters here.
    logging.basicConfig(filename=f'{instance_name.strip()}_{csv_file_name}_1.0GWL_precip_output.log', level=logging.INFO)

    # Write to the log file
    logging.info(f"Processing {csv_file_name}")

    for pt in range(len(df)):
        gridcode = df.loc[pt, 'gridcode']

        # Lat/lon with the decimal point removed identify the output file;
        # the same pattern is used for the bucket check and the local lookup.
        lat_str = str(df.loc[pt, 'lat']).replace('.', '')
        lon_str = str(df.loc[pt, 'lon']).replace('.', '')
        search_string = f'one_in_100_1_day_precipitation_1degreeWL_{lat_str}*_{lon_str}*.nc'

        # first check if this location and scenario has been run before
        logging.info(f"checking if we need to run for SCE location {gridcode}")

        # Check if file exists in the pre-fetched list
        found_files = [existing for existing in files if custom_match(search_string, existing)]
        if found_files:
            logging.info(f"Already processes SCE location  {gridcode}")
            logging.info(f"Found files for '{search_string}':")
            logging.info("%s", found_files)
            continue

        # if the file does not exist run the cava_data retrieval method
        logging.info(f"Processing SCE location {gridcode}")

        # Called for its side effect: the output file is looked up on disk
        # below, so the return value is not needed.
        cava_data(
            ## Set-up
            df[pt:pt+1],
            downscaling_method="Statistical",  # LOCA2 data
            approach="Warming Level",
            warming_level=1.0,

            ## 1-in-X event specific arguments
            variable="Precipitation (total)",
            metric_calc="max", # daily maximum precipitation
            one_in_x=100, # One-in-X
            distr="gev", # change distribution
            units="inches", # change units

            ## Export
            export_method="calculate",
            file_format="NetCDF")

        produced = glob.glob(search_string)
        if not produced:
            # Nothing to upload; record it and keep going with the next row
            # instead of failing with IndexError.
            logging.error(f"No output file matching '{search_string}' was created for SCE location {gridcode}")
            continue
        filename = produced[0]
        destination = f"gs://{BUCKET_NAME}/{OUTPUT_FOLDER}/{filename}"

        # Move the created .nc file to a different location
        moved = subprocess.run(["gsutil", "mv", filename, destination])

        # Write to the log file; only report success if gsutil succeeded
        if moved.returncode == 0:
            logging.info(f"Moved {filename} to {destination}")
        else:
            logging.error(f"Failed to move {filename} to {destination} (gsutil exit code {moved.returncode})")

# Create a pool of processes to parallelize the processing.
# Guarded by __main__ so that worker processes which re-import this module
# (spawn start method) do not start pools of their own.
if __name__ == '__main__':
    if _loc_files:
        with Pool(processes=num_processes) as pool:
            # Map the process_file function with arguments file and filename
            pool.starmap(process_file, [(file, os.path.basename(file)) for file in _loc_files])
    else:
        # Pool(processes=0) raises ValueError, so skip cleanly when there is
        # nothing to process.
        print('no location files found; nothing to process')