## StateMod net reservoir evaporation annual data file (.eva)

In [1]:
import os

import numpy as np
import pandas as pd
from SALib.sample import latin
from joblib import Parallel, delayed


In [2]:
def populate_dict(line, d, column_widths, column_list, data_types):
    """Slice one fixed-width line into fields and append them to ``d``.

    Args:
        line: Text of a single record.
        d: Mapping of column name to a list; each list receives one new value.
        column_widths: Mapping of column name to the number of characters
            the column occupies in ``line``.
        column_list: Column names in the order they appear in ``line``.
        data_types: Mapping of column name to a callable that converts the
            extracted text to the stored value.

    Returns:
        The same ``d`` object, after appending one value per column.
    """

    # running offset of the current field within the line
    cursor = 0

    for name in column_list:

        # a field spans from the current offset to offset + its width
        stop = cursor + column_widths[name]

        # convert the raw text before touching the output container
        value = data_types[name](line[cursor:stop])
        d[name].append(value)

        # the next field begins where this one ended
        cursor = stop

    return d


def prep_data(field_dict, 
              template_file, 
              column_list, 
              value_columns, 
              column_widths,
              data_types,
              comment="#", 
              skip_rows=0):
    """Ingest a StateMod template file and format it into a data frame.

    Leading comment lines, plus the first ``skip_rows`` non-comment lines,
    are collected verbatim as header text.  Every line after that is parsed
    as a fixed-width record with ``populate_dict``.

    Args:
        field_dict: Mapping of column name to a list of values.  It is used
            as the starting content and is NOT modified; parsing happens on
            a copy so the function can be called repeatedly with the same
            dictionary without accumulating duplicate rows.
        template_file: Path of the file to read.
        column_list: Column names in the order they appear in each record.
        value_columns: Not used by this function; kept so existing callers
            keep working.
        column_widths: Mapping of column name to field width in characters.
        data_types: Mapping of column name to a converter callable.
        comment: Prefix that marks a header line as a comment.
        skip_rows: Number of non-comment lines to keep as header text before
            record parsing starts.

    Returns:
        Tuple ``(df, header)``: a pandas data frame with one row per parsed
        record, and the header text as a single string.
    """

    # copy the containers so the caller's dictionary is left untouched
    records = {name: list(values) for name, values in field_dict.items()}

    # header lines are gathered in a list and joined once at the end
    header_lines = []

    capture = False
    with open(template_file) as template:

        for line in template:

            if capture:

                # blank lines (e.g. at the end of the file) hold no record
                if line.strip():
                    records = populate_dict(line, records, column_widths, column_list, data_types)

                continue

            # comment lines in the header are stored for restoration
            if comment and line.startswith(comment):
                header_lines.append(line)

            elif skip_rows == 0:

                # first record: parse it and switch to capture mode
                records = populate_dict(line, records, column_widths, column_list, data_types)
                capture = True

            else:

                # count down the number of rows to skip
                skip_rows -= 1

                # store preliminary lines to use in restoration
                header_lines.append(line)

    # convert dictionary to a pandas data frame
    df = pd.DataFrame(records)

    return df, "".join(header_lines)


def set_alignment(value, n_spaces=0, align="left"):
    """Attach ``n_spaces`` blanks after (left) or before (right) ``value``.

    Args:
        value: Content to align; it is rendered with f-string formatting.
        n_spaces: Number of space characters to add.
        align: ``"left"`` or ``"right"``, compared case-insensitively.

    Returns:
        The padded string.

    Raises:
        ValueError: If ``align`` is neither ``"left"`` nor ``"right"``.
    """

    # compare case-insensitively
    side = align.casefold()

    if side not in ("left", "right"):
        raise ValueError(f"Choice for alignment '{align}' not supported.  Must be 'left' or 'right'.")

    padding = " " * n_spaces

    # left alignment puts the blanks after the value, right puts them before
    return f"{value}{padding}" if side == "left" else f"{padding}{value}"


def pad_with_spaces(value, expected_width, align="left"):
    """Strip ``value`` and pad it with spaces to ``expected_width`` characters.

    Args:
        value: String to fit into the field.
        expected_width: Width of the field in characters.
        align: ``"left"`` or ``"right"``; forwarded to ``set_alignment``.

    Returns:
        The stripped value padded with spaces up to ``expected_width``.

    Raises:
        AssertionError: If the stripped value is longer than ``expected_width``.
    """

    # surrounding whitespace does not count towards the field content
    trimmed = value.strip()
    n_chars = len(trimmed)

    # content that does not fit cannot be padded
    if n_chars > expected_width:
        raise AssertionError(f"Column width '{n_chars}' exceeds the expected width '{expected_width}'")

    # fill whatever room is left in the field
    return set_alignment(value=trimmed,
                         n_spaces=expected_width - n_chars,
                         align=align)

        
def add_zero_padding(x, precision=2):
    """Restore trailing zeros that were dropped from a decimal string.

    Some fields expect zero padding that gets rounded off by pandas.
    This function adds it back so that the fractional part has at least
    ``precision`` digits.

    Only plain decimal notation is padded.  A string without a decimal
    point gets ``"."`` followed by ``precision`` zeros when it is a plain
    (optionally signed) integer; appending zeros directly would multiply
    the value by a power of ten.  Strings whose fractional part is not made
    of digits only (for example ``"1.5e-05"``) and non-numeric strings such
    as ``"nan"`` are returned unchanged, because appending zeros would
    alter or corrupt them.

    Args:
        x: String representation of a number.
        precision: Minimum number of digits after the decimal point.

    Returns:
        ``x`` with trailing zeros appended where needed, otherwise ``x``.
    """

    # split on the last decimal point; 'point' is empty when there is none
    _, point, fraction = x.rpartition(".")

    if not point:

        # plain integers gain a full fractional part; anything else is left alone
        if precision > 0 and x.lstrip("+-").isdigit():
            return f"{x}.{'0' * precision}"

        return x

    # exponent notation or other non-digit tails must not be extended
    if fraction and not fraction.isdigit():
        return x

    # determine the number of zeros needed
    n_zeros = precision - len(fraction)

    return f"{x}{'0' * n_zeros}" if n_zeros > 0 else x
    

def construct_outfile_name(template_file, output_dir, scenario, sample_id):
    """Build the output path for one sample from the template file name.

    The result keeps the template's base name and extension and inserts
    ``_scenario-<scenario>_sample-<sample_id>`` between them.

    Args:
        template_file: Path of the template file.
        output_dir: Directory the output path is placed in.
        scenario: Scenario label embedded in the file name.
        sample_id: Sample identifier embedded in the file name.

    Returns:
        The joined output path as a string.
    """

    # drop the directory part, then separate the name from its extension
    stem, extension = os.path.splitext(os.path.basename(template_file))

    file_name = f"{stem}_scenario-{scenario}_sample-{sample_id}{extension}"

    return os.path.join(output_dir, file_name)


def apply_adjustment(data_df, 
                     value_columns, 
                     query_field,
                     target_ids, 
                     factor):
    """Scale the value columns by ``factor`` for rows whose id is targeted.

    Args:
        data_df: Data frame holding the query field and the value columns.
        value_columns: Names of the columns to scale.
        query_field: Name of the column that is matched against ``target_ids``.
        target_ids: Identifiers of the rows to scale.
        factor: Multiplier applied to the value columns of matching rows.

    Returns:
        A data frame of the value columns in which matching rows are
        multiplied by ``factor`` and all other rows keep their values.
    """

    current_values = data_df[value_columns]

    # rows whose query field matches one of the target ids
    is_target = data_df[query_field].isin(target_ids)

    # keep the scaled values where the row is targeted, the current ones elsewhere
    return (current_values * factor).where(is_target, current_values)


def construct_data_string(df, column_names, column_widths, column_alignment):
    """Render every row of ``df`` as one fixed-width text line.

    Args:
        df: Data frame whose cells are strings.
        column_names: Columns to write, in output order.
        column_widths: Mapping of column name to field width in characters.
        column_alignment: Mapping of column name to ``"left"`` or ``"right"``.

    Returns:
        A single string with one newline-terminated line per row.
    """

    rendered_rows = []

    for row_label in df.index:

        # pad each cell to the width and alignment of its column
        fields = [
            pad_with_spaces(df[name][row_label], column_widths[name], align=column_alignment[name])
            for name in column_names
        ]

        rendered_rows.append("".join(fields) + "\n")

    return "".join(rendered_rows)


def workhorse(municipal_ids, 
              standard_ids, 
              sample, 
              sample_id, 
              output_dir, 
              column_widths,
              column_names,
              column_alignment,
              value_columns,
              problem,
              data_df,
              query_field,
              header):
    """Write one modified copy of the template for a single sample.

    The value columns of rows matching ``municipal_ids`` and ``standard_ids``
    are multiplied by the sample entries named ``"municipal"`` and
    ``"standard"``, rounded to four decimals, formatted as fixed-width text
    and written after ``header`` to the output file.

    NOTE: the output path is built from the module-level names
    ``template_file`` and ``scenario``, which are not parameters of this
    function and therefore must exist wherever it runs.

    Args:
        municipal_ids: Ids scaled by the "municipal" factor.
        standard_ids: Ids scaled by the "standard" factor.
        sample: Indexable sequence of factors, ordered like ``problem["names"]``.
        sample_id: Identifier of the sample, used in the output file name.
        output_dir: Directory the output file is written to.
        column_widths: Mapping of column name to field width in characters.
        column_names: Columns to write, in output order.
        column_alignment: Mapping of column name to ``"left"`` or ``"right"``.
        value_columns: Names of the columns that are scaled.
        problem: Mapping with a ``"names"`` list that contains
            ``"municipal"`` and ``"standard"``.
        data_df: Template data frame; it is copied, not modified.
        query_field: Name of the id column used to select rows.
        header: Text written to the file before the data lines.
    """

    # pair each id group with its factor from the sample
    factor_names = problem["names"]
    adjustments = (
        (municipal_ids, sample[factor_names.index("municipal")]),
        (standard_ids, sample[factor_names.index("standard")]),
    )

    # work on a copy so the template frame stays intact
    adjusted = data_df.copy()

    # ids are matched exactly, so remove padding whitespace first
    adjusted[query_field] = adjusted[query_field].str.strip()

    # scale the value columns of each group in turn
    for ids, factor in adjustments:
        adjusted[value_columns] = apply_adjustment(adjusted, value_columns, query_field, ids, factor)

    # limit the values to four decimals
    adjusted[value_columns] = adjusted[value_columns].round(4)

    # every field is formatted as text
    adjusted = adjusted.astype(str)
    body = construct_data_string(adjusted, column_names, column_widths, column_alignment)

    destination = construct_outfile_name(template_file, output_dir, scenario, sample_id)

    # header first, then the formatted records
    with open(destination, "w") as out:
        out.write(header)
        out.write(body)



## Setup

In [4]:
# seed that makes the sampling reproducible
seed_value = 123

# directory holding the StateMod input data
data_dir = "/Users/d3y010/projects/statemod/data/inputs/cm2015_StateMod/StateMod"

# template file that the modified files are derived from
template_file = os.path.join(data_dir, "cm2015.eva")

# directory the modified files are written to
output_dir = "/Users/d3y010/Desktop/statemod"

# scenario name used in the output file names
scenario = "test"

# character indicating that a row is a comment
comment = "#"

# value columns that may be modified
value_columns = ["oct", "nov", "dec", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep"]

# all columns to process, in file order
column_list = ["prefix", "id", *value_columns]

# one independent, empty list per field to collect the parsed values
data_dict = {name: [] for name in column_list}

# width of every column in the output file
column_widths = {"prefix": 5,
                 "id": 12,
                 **{name: 8 for name in value_columns}}

# text columns are left-aligned, value columns right-aligned
column_alignment = {"prefix": "left",
                    "id": "left",
                    **{name: "right" for name in value_columns}}

# converter applied to the text of each column
data_types = {"prefix": str,
              "id": str,
              **{name: np.float64 for name in value_columns}}

# field that the id queries are run against
query_field = "id"

# ids belonging to each category
municipal_ids = ["10001", "10004"]
standard_ids = ["10005", "10006"]


## Ingest and process template file

In [5]:
%%time

# parse the template into a data frame and keep the leading text for write-back;
# skip_rows=1 keeps the first non-comment line with the header text instead of
# parsing it as a record, and the comment marker comes from the setup cell
df, header = prep_data(field_dict=data_dict, 
                       template_file=template_file, 
                       column_list=column_list,
                       value_columns=value_columns,
                       column_widths=column_widths,
                       data_types=data_types,
                       comment=comment, 
                       skip_rows=1)

df


CPU times: user 1.39 ms, sys: 569 µs, total: 1.95 ms
Wall time: 1.47 ms


Unnamed: 0,prefix,id,oct,nov,dec,jan,feb,mar,apr,may,jun,jul,aug,sep
0,,10001,0.13,-0.01,-0.11,-0.12,-0.03,0.06,0.15,0.28,0.4,0.39,0.29,0.25
1,,10002,0.13,0.04,-0.05,-0.06,0.02,0.09,0.16,0.3,0.41,0.41,0.27,0.24
2,,10003,0.13,0.04,-0.02,-0.03,0.05,0.11,0.22,0.33,0.48,0.43,0.32,0.28
3,,10004,0.1361,0.0666,0.048,0.0477,0.0593,0.1228,0.1932,0.2765,0.3351,0.3584,0.209,0.2253
4,,10005,0.13,0.06,0.02,0.03,0.06,0.11,0.22,0.32,0.42,0.41,0.35,0.24
5,,10006,0.13,0.02,-0.09,-0.1,-0.02,0.07,0.17,0.32,0.41,0.42,0.29,0.24
6,,10007,0.03,-0.15,-0.16,-0.08,-0.07,-0.01,0.15,0.29,0.41,0.29,0.07,0.08
7,,10008,0.14,0.03,-0.05,-0.05,0.01,0.06,0.16,0.25,0.35,0.32,0.26,0.22
8,,10009,0.03,0.01,-0.06,-0.06,0.01,0.05,0.07,0.29,0.38,0.32,0.22,0.08
9,,10010,0.08,0.01,-0.02,-0.01,0.01,0.04,0.12,0.18,0.24,0.21,0.18,0.16


## Build LHS

In [6]:
%%time

# problem definition: one sampled factor per category of ids
# NOTE(review): the sampled values are used as multipliers on the value
# columns, so bounds of [-1, 1] allow zero and negative factors - confirm
# that this range is intended
problem = {
    "num_vars": 2,
    "names": ["municipal", "standard"],
    "bounds": [[-1.0, 1.0], [-1.0, 1.0]],
}

# one sample per output file; 4 files here
n_samples = 4

# draw the Latin hypercube sample with a fixed seed so we can test
lhs = latin.sample(problem, n_samples, seed=seed_value)

lhs


CPU times: user 155 µs, sys: 24 µs, total: 179 µs
Wall time: 174 µs


array([[ 0.9903821 ,  0.84241487],
       [-0.65176541, -0.22434262],
       [-0.38657427, -0.85693033],
       [ 0.35973448,  0.21155323]])

## Run all LHS in parallel

In [7]:
%%time

# write one modified file per sample row, using all available cores;
# workhorse returns nothing, so results holds one None per sample
results = Parallel(n_jobs=-1, backend="loky")(
    delayed(workhorse)(
        municipal_ids=municipal_ids,
        standard_ids=standard_ids,
        sample=sample,
        sample_id=sample_id,
        output_dir=output_dir,
        column_widths=column_widths,
        column_names=column_list,
        column_alignment=column_alignment,
        value_columns=value_columns,
        problem=problem,
        data_df=df,
        query_field=query_field,
        header=header,
    )
    for sample_id, sample in enumerate(lhs)
)


CPU times: user 29 ms, sys: 42.8 ms, total: 71.8 ms
Wall time: 682 ms
