In [1]:
!pip install scipy
!pip install pyDOE



In [2]:
import scipy
from scipy.stats import pearsonr
import pandas as pd
import numpy as np
import subprocess
import os
import time
from pyDOE import lhs

In [3]:
## Function that executes a system call to replace the relevant line with the parameter value
# Params:
# - model: model name (filename without extension)
# - param_name: name of the parameter
# - declaration: type of declaration used in the code (local or as property of an object)
# - value: value to be set

def fn_param_update(model, param_name, declaration, value):
    """Rewrite one parameter line of a model file by calling the update shell script.

    Params:
    - model: model name (filename without extension)
    - param_name: name of the parameter
    - declaration: declaration prefix placed in front of the parameter name
    - value: value to be set

    Returns:
    - 0 when mocking mode is on or the shell command exits with status 0,
      -1 when the shell command reports a non-zero status.
    """
    ## Get the global setting object
    setting_obj = shared.env.settings

    ## Mocking mode is useful for local testing, not to run actual simulation!
    if setting_obj.MOCKING_MODE:
        return 0

    param_file = f"{model}.{setting_obj.PARAM_FILE_EXTENSION}"
    sys_call_param = [
        "sh",
        setting_obj.SH_UPDATE_PARAM,
        os.path.join(setting_obj.PARAM_FILE_PATH, param_file),
        # Single quotes keep the whole expression together as one shell argument
        f"'{param_name} = {declaration}{param_name}={value}'"
    ]

    if setting_obj.CONTAINERIZED:
        sys_call_param += [
            param_file,
            os.path.join(setting_obj.CONTAINER_HOME_PATH, "runtime"),
            setting_obj.FINGERPRINT
        ]

    # os.system returns the integer exit status of the command; an int has no
    # __dict__, so success has to be derived from the status value itself.
    exit_status = os.system(" ".join(sys_call_param))

    return 0 if exit_status == 0 else -1

In [4]:
def fn_param_push():
    """Push the updated parameter files by calling the push shell script.

    Returns:
    - 0 when mocking mode is on or the shell command exits with status 0,
      -1 when the shell command reports a non-zero status.
    """
    setting_obj = shared.env.settings

    if setting_obj.MOCKING_MODE:
        return 0

    sys_call_param = [
        "sh",
        setting_obj.SH_PUSH_PARAM,
        os.path.join(setting_obj.CONTAINER_HOME_PATH, "runtime"),
        setting_obj.FINGERPRINT
    ]

    # os.system returns the integer exit status; inspecting output.__dict__ on
    # an int raises AttributeError, so compare the status directly.
    exit_status = os.system(" ".join(sys_call_param))

    return 0 if exit_status == 0 else -1

In [5]:
## Function that tries to update all parameters, based on a flag that signals a change (flag "changed" T/F)
## NOTE: During the first run, if the model files do not exist yet, the "changed" flag needs to be set to T for all parameters; otherwise some of them won't be set at all
# Params:
# - params: dataframe (tidy) containing necessary columns

def fn_simulation_config(params):
    """Apply every parameter row to its model file and, if configured, push the files.

    Params:
    - params: tidy DataFrame (or an iterable of mappings) providing at least the
      keys 'model', 'param_name', 'declaration' and 'value' for each parameter

    Returns:
    - list with one status per parameter as returned by fn_param_update; when
      the push step reports a non-zero status, every entry is replaced by it.
    """
    # Get global settings object
    setting_obj = shared.env.settings

    # Iterating a DataFrame directly yields column names, not rows, so convert
    # it to a list of per-row dicts first.
    if isinstance(params, pd.DataFrame):
        param_rows = params.to_dict("records")
    else:
        param_rows = list(params)

    # Apply param updates and get status. fn_param_update takes exactly four
    # arguments (model, param_name, declaration, value).
    status = [
        fn_param_update(row['model'], row['param_name'], row['declaration'], row['value'])
        for row in param_rows
    ]

    # Push params if containerized
    if setting_obj.CONTAINERIZED and setting_obj.SH_PUSH_PARAM is not None:
        push_status = fn_param_push()
        if push_status != 0:
            status = [push_status] * len(status)

    # Return status
    return status

In [6]:
def fn_worker_setup(setting_obj):
    """Run the worker setup shell script.

    Params:
    - setting_obj: mapping providing 'SH_SETUP_WORKER', 'CONTAINER_HOME_PATH',
      'FINGERPRINT' and 'IMAGE_NAME'

    Returns:
    - 0 when the script exits with status 0.

    Raises:
    - subprocess.CalledProcessError when the script exits with a non-zero status.
    """
    command = [
        "sh",
        setting_obj["SH_SETUP_WORKER"],
        setting_obj["CONTAINER_HOME_PATH"],
        os.path.sep.join([setting_obj["CONTAINER_HOME_PATH"], "runtime"]),
        setting_obj["FINGERPRINT"],
        setting_obj["IMAGE_NAME"]
    ]
    output = subprocess.run(command, capture_output=True, text=True)

    # A completed process always carries an integer return code, so the success
    # value must be returned after the check rather than on "returncode is None".
    output.check_returncode()
    return 0

In [7]:
def fn_worker_remove(setting_obj):
    """Run the worker removal shell script.

    Params:
    - setting_obj: mapping providing 'SH_REMOVE_WORKER', 'CONTAINER_HOME_PATH'
      and 'FINGERPRINT'

    Returns:
    - 0 when the script exits with status 0.

    Raises:
    - subprocess.CalledProcessError when the script exits with a non-zero status.
    """
    sys_call = [
        "sh",
        setting_obj['SH_REMOVE_WORKER'],
        "/".join([setting_obj['CONTAINER_HOME_PATH'], "runtime"]),
        setting_obj['FINGERPRINT']
    ]
    output = subprocess.run(sys_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # returncode is never None once subprocess.run has returned, so report
    # success explicitly after the check.
    output.check_returncode()
    return 0

In [8]:
## Function that execute system call to run the simulation with SimMobility
# Params:
# - /

def fn_simulation_call():
    """Run the simulation shell script and check that it produced the activity file.

    Any existing activity file is removed before the run so that its presence
    afterwards shows that this run created it.

    Returns:
    - 0 in mocking mode, or when the activity file exists after the run.

    Raises:
    - RuntimeError (an Exception subclass) carrying the exit status when the
      activity file is missing after the run.
    """
    # Get global settings object
    setting_obj = shared.env.settings

    if setting_obj.MOCKING_MODE:
        return 0

    if os.path.exists(setting_obj.ACTIVITY_FILE):
        os.remove(setting_obj.ACTIVITY_FILE)

    sys_call_param = [
        "sh",
        setting_obj.SH_EXEC_SIMULATION
    ]

    if setting_obj.CONTAINERIZED:
        sys_call_param.extend([os.path.join(setting_obj.CONTAINER_HOME_PATH, "runtime"), setting_obj.FINGERPRINT])

    # os.system expects a single command string; passing the list raises TypeError.
    output = os.system(" ".join(sys_call_param))

    if os.path.exists(setting_obj.ACTIVITY_FILE):
        return 0

    raise RuntimeError(
        f"Simulation did not produce the activity file '{setting_obj.ACTIVITY_FILE}' (exit status: {output})"
    )

In [9]:
## Function that calculates output statistics
# Params:
# - activity: activity_schedule table (dataframe, tibble)
# - pop_size: total population size [currently not in use, as no ratio of the population is considered]

def fn_output_stat(activity, pop_size=0):
    """Summarise tours and trips overall, per tour purpose and per transport mode.

    Params:
    - activity: activity schedule DataFrame with at least the columns
      'person_id', 'tour_no', 'tourType' and 'stop_mode'
    - pop_size: total population size (currently not used)

    Returns:
    - None for an empty schedule, otherwise a long DataFrame with the columns
      'scope', 'type', 'variable', 'value'. Global rows use the variables
      'tours_total', 'trips_total', 'avg_trips_tour_total'; purpose and mode
      rows use 'tours', 'trips', 'avg_trips_tour'.
    """
    if activity.empty:
        return None

    def _aggregate(dimension=None):
        # Count distinct tours and rows (trips) per person (and per dimension
        # value), then sum over persons. Named aggregation is used because a
        # dict-style agg can only reference columns that already exist.
        keys = ['person_id'] if dimension is None else ['person_id', dimension]
        per_person = activity.groupby(keys).agg(
            tours=('tour_no', 'nunique'),
            trips=('tour_no', 'size')
        )
        if dimension is None:
            summary = per_person.sum().to_frame().T
        else:
            summary = per_person.groupby(level=dimension).sum().reset_index()
        summary['avg_trips_tour'] = (summary['trips'] / summary['tours']).round(3)
        return summary

    stat_total = _aggregate().rename(columns={
        'tours': 'tours_total',
        'trips': 'trips_total',
        'avg_trips_tour': 'avg_trips_tour_total'
    }).assign(type='general', scope='global')

    stat_purpose = _aggregate('tourType').rename(columns={'tourType': 'type'}).assign(scope='purpose')
    stat_mode = _aggregate('stop_mode').rename(columns={'stop_mode': 'type'}).assign(scope='mode')

    melted = [
        part.melt(id_vars=['type', 'scope'], var_name='variable', value_name='value')
        for part in (stat_total, stat_purpose, stat_mode)
    ]
    return pd.concat(melted)[['scope', 'type', 'variable', 'value']]

In [10]:
### could do it same as the above

In [11]:
def fn_output_stat_virtualcity(activity, pop_size=0):
    """Summarise tours, stops and trips overall, per tour purpose and per mode.

    Params:
    - activity: activity schedule DataFrame with at least the columns
      'person_id', 'tour_no', 'tourType', 'stop_mode', 'prev_stop_location'
      and 'stop_location' (the two location columns are concatenated with
      '_', so they must hold strings)
    - pop_size: total population size (currently not used)

    Returns:
    - None for an empty schedule, otherwise a long DataFrame with the columns
      'scope', 'type', 'variable', 'value'. Purpose and mode rows additionally
      report 'travel_ratio': distinct persons in the group divided by the
      number of distinct persons in the whole schedule.
    """
    if activity.empty:
        return None

    pop_travelled = activity['person_id'].nunique()

    # Derive the origin-destination key on a copy so the caller's frame is not
    # modified as a side effect.
    legs = activity.assign(od=activity['prev_stop_location'] + '_' + activity['stop_location'])

    counters = {
        'tours': ('tour_no', 'nunique'),
        'stops': ('od', 'nunique'),
        'trips': ('person_id', 'count'),
    }

    # The per-person aggregate only has the columns tours/stops/trips, so there
    # is no 'person_id' column to drop after summing.
    stat_total = legs.groupby('person_id').agg(**counters).sum().to_frame().T

    def _by_dimension(dimension):
        # Per-person counts summed per dimension value, plus the traveller share.
        per_person = legs.groupby(['person_id', dimension]).agg(**counters)
        summary = per_person.groupby(level=dimension).sum()
        summary['travel_ratio'] = legs.groupby(dimension)['person_id'].nunique() / pop_travelled
        # Both dimensions are exposed under the common column name 'type'.
        return summary.reset_index().rename(columns={dimension: 'type'})

    stat_purpose = _by_dimension('tourType')
    stat_mode = _by_dimension('stop_mode')

    return pd.concat([
        stat_total.melt(var_name='variable', value_name='value')
        .assign(scope='global', type='general'),
        stat_purpose.melt(id_vars=['type'], var_name='variable', value_name='value')
        .assign(scope='purpose'),
        stat_mode.melt(id_vars=['type'], var_name='variable', value_name='value')
        .assign(scope='mode')
    ], axis=0)[['scope', 'type', 'variable', 'value']]

In [12]:
## Function that calculates output statistics
# Params:
# - activity: activity_schedule table (dataframe,tible)
# - setting_obj: object with overall settings

def fn_output_od(activity):
    """Count legs per origin/destination district into an OD matrix.

    Params:
    - activity: activity schedule DataFrame with 'prev_stop_location' and
      'stop_location' columns

    Returns:
    - None for an empty schedule, otherwise a square DataFrame (rows: origin
      code, columns: destination code) holding the number of legs per pair.
      Settings are read from the global shared.env object.
    """
    # Get global settings object
    setting_obj = shared.env["settings"]

    # Square zero matrix labelled with the index of the observed OD matrix
    zone_codes = setting_obj["CITY_STATS"]["value"].index.tolist()
    od_counts = pd.DataFrame(0, index=zone_codes, columns=zone_codes)

    if activity.empty:
        return None

    # Attach the district code of the origin, then of the destination; the
    # second merge suffixes the clashing columns with _origin/_destination.
    with_origin = pd.merge(
        activity,
        setting_obj["CITY_STATS"]["district_map"],
        left_on="prev_stop_location",
        right_on="newTAZ",
    )
    legs = with_origin.merge(
        setting_obj["CITY_STATS"]["district_map"],
        left_on="stop_location",
        right_on="newTAZ",
        suffixes=("_origin", "_destination"),
    )

    for origin_n, destination_n in zip(legs["code_origin"], legs["code_destination"]):
        od_counts.loc[origin_n, destination_n] += 1

    return od_counts

In [13]:
## Function that calculates output statistics (OD and balance between transportation modes)
# Params:
# - activity: activity_schedule table (dataframe,tible)
# - setting_obj: object with overall settings

def fn_output_od_mode_balance(activity):
    """Compute the OD counts, mode balance and workers share for one iteration.

    Params:
    - activity: activity schedule DataFrame; the columns used are
      'prev_stop_location', 'stop_location', 'stop_mode', 'tourType',
      'primary_stop' and 'person_id'

    Returns a dict with:
    - 'total_legs': number of rows in the schedule
    - 'od': copy of the empty OD template with one count added per leg
    - 'balance' (only when CITY_STATS has 'balance'): the reference balance
      table with an 'estimated_share' column placed right after 'mode_category'
    - 'workers_work': 1 minus the share of reference workers that have a
      primary 'Work' stop (1 when no worker list is configured)

    Settings are read from the global shared.env object.
    """
    # Get global settings object
    setting_obj = shared.env.settings
    city_stats = setting_obj['CITY_STATS']

    # Work on a copy: incrementing the shared template in place would carry
    # counts over into every later call.
    OD_m = city_stats['emptyOD'].copy()

    def _ordered_balance(frame):
        # 'mode_category' and 'estimated_share' first, then the remaining
        # reference columns, without selecting 'mode_category' twice.
        lead = ['mode_category', 'estimated_share']
        rest = [col for col in city_stats['balance'].columns if col not in lead]
        return frame.loc[:, lead + rest]

    output = {
        'total_legs': len(activity),
        'od': OD_m,
        'workers_work': 1
    }
    if 'balance' in city_stats:
        output['balance'] = _ordered_balance(city_stats['balance'].assign(estimated_share=0))

    if len(activity) == 0:
        return output

    # Map origin and destination zones to their station codes
    district_codes = city_stats['district_map'].loc[:, ['code', 'newTAZ']]
    legs = activity.merge(
        district_codes.rename(columns={'code': 'origin_station_code', 'newTAZ': 'prev_stop_location'}),
        how='left', on='prev_stop_location'
    ).merge(
        district_codes.rename(columns={'code': 'destination_station_code', 'newTAZ': 'stop_location'}),
        how='left', on='stop_location'
    ).loc[:, ['origin_station_code', 'destination_station_code']]

    # A DataFrame template has to be addressed by label through .loc; an array
    # template is addressed positionally.
    # NOTE(review): the type of CITY_STATS['emptyOD'] is not visible here - confirm.
    label_indexed = hasattr(OD_m, 'loc')
    for origin_n, destination_n in zip(legs['origin_station_code'], legs['destination_station_code']):
        if label_indexed:
            OD_m.loc[origin_n, destination_n] += 1
        else:
            OD_m[origin_n, destination_n] += 1

    output['od'] = OD_m  # OD_m/sum(OD_m) ## IF OPERATES IN RELATIVE MODE

    # Transportation mode balance
    if 'balance' in city_stats:
        total_legs = len(activity)
        mode_to_category = {
            'BusTravel': 'public', 'SMS': 'public',
            'Car': 'car', 'Car Sharing 2': 'car', 'Car Sharing 3': 'car',
            'PrivateBus': 'other', 'Motorcycle': 'other', 'Taxi': 'other',
        }
        # Modes without an explicit category fall back to their lower-cased name
        categories = activity['stop_mode'].map(mode_to_category).fillna(activity['stop_mode'].str.lower())
        estimated = (
            categories.groupby(categories).size().div(total_legs)
            .rename('estimated_share').rename_axis('mode_category').reset_index()
        )
        mode_stats = estimated.merge(city_stats['balance'], on='mode_category', how='right')
        # Categories present only in the reference table have no simulated legs
        mode_stats['estimated_share'] = mode_stats['estimated_share'].fillna(0)

        output['balance'] = _ordered_balance(mode_stats)

    # Workers share scheduled to take a trip
    if 'workers' in city_stats:
        workers = city_stats['workers']
        work_legs = activity[(activity['tourType'] == 'Work') & (activity['primary_stop'] == True)]
        # person_id has the form "<id>-<suffix>"; keep the numeric id part
        traveller_ids = work_legs['person_id'].astype(str).str.split('-').str[0].astype(int)
        assigned_workers = traveller_ids[traveller_ids.isin(workers['id'])].unique()

        if len(workers) > 0:
            output['workers_work'] = 1 - (len(assigned_workers) / len(workers))

    return output

In [14]:
## Comparisons

## Function that compare two vectors in terms of distance
def fn_quantify_vector(vector_1, vector_2):
    """Return the Euclidean distance between two vectors."""
    residual = vector_1 - vector_2
    squared_sum = np.sum(residual ** 2)
    return np.sqrt(squared_sum)

def fn_quantify_vector_correlation(vector_1, vector_2):
    """Return the Pearson correlation coefficient of two vectors."""
    correlation_result = pearsonr(vector_1, vector_2)
    coefficient = correlation_result[0]
    return coefficient


## Function that compare two matrix
def fn_quantify_matrix(matrix_1, matrix_2, mat_weights=None, squared=False, absolute=False):
    """Compare two matrices (or vectors) and return the RMSE of their difference.

    Params:
    - matrix_1, matrix_2: arrays/frames of equal shape (1-D input is accepted)
    - mat_weights: optional element-wise weights applied to the difference;
      None leaves the difference unweighted
    - squared: square the (weighted) difference before the RMSE step
    - absolute: take the absolute value of the difference before the RMSE step

    Returns:
    - the value produced by fn_matrix_rmse for the processed difference.
    """
    mat_difference = (matrix_1 - matrix_2)
    # Apply the weights only when given: an all-ones default would be a no-op,
    # and building it from shape[0] x shape[1] fails for 1-D input.
    if mat_weights is not None:
        mat_difference = mat_difference * mat_weights
    if squared:
        mat_difference = mat_difference**2
    if absolute:
        mat_difference = np.abs(mat_difference)
    return fn_matrix_rmse(mat_difference)

def fn_matrix_norm(mat):
    """Return numpy's default norm of `mat` (the Frobenius norm for a 2-D matrix)."""
    norm_value = np.linalg.norm(mat)
    return norm_value

def fn_matrix_max_cell_loss(mat, observed_mat):
    """Return the largest per-cell relative value plus the share of cells above 0.1.

    The share is computed against mat.shape[0] squared and the sum is rounded
    to 4 decimals.
    """
    # Relative loss/error per cell
    relative = mat / observed_mat
    # Row indices of the cells whose relative error exceeds the tolerance
    exceeding_rows = np.where(relative > 0.1)[0]
    n_cells = mat.shape[0] ** 2
    exceeding_share = len(exceeding_rows) / n_cells
    return round(np.max(relative) + exceeding_share, 4)

def fn_matrix_rmse(mat):
    """Return the root of the mean squared cell value, rounded to 4 decimals.

    The input is converted to a float ndarray first so that a pandas DataFrame
    is reduced over all cells to one scalar instead of a per-column result.
    """
    values = np.asarray(mat, dtype=float)
    return round(float(np.sqrt(np.mean(values ** 2))), 4)

def fn_quantify(vals_1, vals_2, vals_weights=None):
    """Generic comparison entry point; currently delegates to fn_quantify_matrix."""
    discrepancy = fn_quantify_matrix(vals_1, vals_2, vals_weights)
    return discrepancy

In [15]:
## Function that quantify inadequacy or discrepancy
# Params:
# - iter_stats: output stats from iteration
# - data_stats: output stats calculated from data

def fn_quantify_inadequacy_stats(iter_stats, city_stats):
    """Quantify the discrepancy between simulated and observed statistics.

    Params:
    - iter_stats: long DataFrame with the columns 'scope', 'type', 'variable',
      'value' produced for one iteration, or None when the simulation
      generated an empty activity schedule
    - city_stats: DataFrame with the same key columns and the observed 'value'

    Returns:
    - the value returned by fn_quantify for the simulated vs. observed values;
      with iter_stats None, the distance between an all-zero vector and the
      observed values.
    """
    def _as_column(values):
        # fn_quantify's matrix helper reads shape[1], so hand over an (n, 1)
        # column; the RMSE of a column equals the RMSE of the flat vector.
        return np.asarray(values, dtype=float).reshape(-1, 1)

    # If empty activity schedule is generated, return max distance. Only the
    # observed 'value' column is compared, not the key columns of city_stats.
    if iter_stats is None:
        observed = _as_column(city_stats['value'])
        return fn_quantify(np.zeros(observed.shape), observed)

    # Merge iter_stats with city_stats on the statistic keys
    joined_stats = pd.merge(iter_stats[['scope', 'type', 'variable', 'value']],
                            city_stats.rename(columns={'value': 'observed_value'}),
                            on=['scope', 'type', 'variable'], how='left')

    # Remove rows with missing observed values
    joined_stats = joined_stats.dropna(subset=['observed_value'])

    return fn_quantify(_as_column(joined_stats['value']), _as_column(joined_stats['observed_value']))


In [16]:
## Function that quantify inadequacy or discrepancy
# Params:
# - iter_stats: output stats from iteration
# - data_stats: output stats calculated from data

def fn_quantify_inadequacy_od(iter_stats):
    """Quantify the discrepancy between a simulated OD matrix and the observed one.

    Params:
    - iter_stats: simulated OD matrix, or None when no trips were generated
      (an all-zero matrix of the observed shape is compared in that case)

    The observed matrix (CITY_STATS 'value') and the optional 'weights' are
    read from the global shared.env settings.
    """
    # Get global settings object
    setting_obj = shared.env.settings

    # Optional weights from the city statistics
    if "weights" in setting_obj["CITY_STATS"]:
        mat_weights = setting_obj["CITY_STATS"]["weights"]
    else:
        mat_weights = None

    if iter_stats is not None:
        return fn_quantify(iter_stats, setting_obj["CITY_STATS"]["value"], mat_weights)

    # Empty activity schedule (i.e., no trips at all): return max distance
    od_dim = setting_obj["CITY_STATS"]["value"].shape
    empty_od = np.zeros((od_dim[0], od_dim[1]))
    return fn_quantify(empty_od, setting_obj["CITY_STATS"]["value"], mat_weights)

In [17]:
## Function that quantify inadequacy or discrepancy
# Params:
# - iter_stats: output stats from iteration
# - data_stats: output stats calculated from data

def fn_quantify_inadequacy_BCKUP(iter_stats):
    """Backup error term: balance discrepancy scaled by the leg-count gap.

    Params:
    - iter_stats: mapping with 'od', 'total_legs' and optionally 'balance'
      (a table holding the estimated share and the reference share per mode)

    Returns:
    - dict with 'value' (balance discrepancy times the absolute difference
      between simulated legs and CITY_STATS 'total_trips') and 'base'
      ([od discrepancy, total legs, balance discrepancy]).
    """
    # Get global settings object
    setting_obj = shared.env.settings

    def _balance_shares(balance):
        # A DataFrame cannot be sliced with [:, i]; pick the estimated share by
        # name and the first remaining (reference) column next to it. Plain
        # arrays keep the positional columns 1 and 2.
        if hasattr(balance, "columns"):
            reference_cols = [col for col in balance.columns
                              if col not in ("mode_category", "estimated_share")]
            estimated, observed = balance["estimated_share"], balance[reference_cols[0]]
        else:
            balance = np.asarray(balance)
            estimated, observed = balance[:, 1], balance[:, 2]
        return np.asarray(estimated, dtype=float), np.asarray(observed, dtype=float)

    od_discrepancy = fn_quantify_inadequacy_od(iter_stats["od"])
    balance_discrepancy = 1

    if "balance" in iter_stats:
        estimated_share, observed_share = _balance_shares(iter_stats["balance"])
        balance_discrepancy = 1 - fn_quantify_vector_correlation(estimated_share, observed_share)

    # Calculate the value and base of the function and return as a dictionary
    value = balance_discrepancy * abs(iter_stats["total_legs"] - setting_obj["CITY_STATS"]["total_trips"])
    base = [od_discrepancy, iter_stats["total_legs"], balance_discrepancy]
    return {"value": value, "base": base}

In [18]:
def fn_quantify_inadequacy(iter_stats):
    """Combine OD, mode-balance and workers discrepancies into one error value.

    Params:
    - iter_stats: mapping with 'od', 'total_legs' and optionally 'balance'
      (table with the estimated and the reference share per mode) and
      'workers_work'

    Returns:
    - dict with 'value' = od_discrepancy/100 * (1 + balance_discrepancy)
      * (1 + workers_work_error) and 'base' (currently an empty list).
    """
    # Get global settings object
    setting_obj = shared.env['settings']

    def _balance_shares(balance):
        # A DataFrame cannot be sliced with [:, i]; pick the estimated share by
        # name and the first remaining (reference) column next to it. Plain
        # arrays keep the positional columns 1 and 2.
        if hasattr(balance, "columns"):
            reference_cols = [col for col in balance.columns
                              if col not in ("mode_category", "estimated_share")]
            estimated, observed = balance["estimated_share"], balance[reference_cols[0]]
        else:
            balance = np.asarray(balance)
            estimated, observed = balance[:, 1], balance[:, 2]
        return np.asarray(estimated, dtype=float), np.asarray(observed, dtype=float)

    od_discrepancy = fn_quantify_inadequacy_od(iter_stats['od'])
    balance_discrepancy = 0

    if 'balance' in iter_stats:
        estimated_share, observed_share = _balance_shares(iter_stats['balance'])
        balance_discrepancy = fn_quantify_vector(estimated_share, observed_share)

    workers_work_error = 0
    if 'workers_work' in iter_stats:
        workers_work_error = iter_stats['workers_work']

    # An earlier, disabled formulation (kept as commented-out R code before this
    # rewrite) combined the relative leg-count error with the OD discrepancy.
    # Only the expression below is active; the leg counts are used for logging.
    simulated_num_legs = iter_stats['total_legs']
    observed_num_legs = setting_obj['CITY_STATS']['total_trips']
    error_val = od_discrepancy/100 * (1 + balance_discrepancy) * (1 + workers_work_error)
    error_base = []

    if "logger" in shared.env and hasattr(shared.env.logger, "LOGGER") and shared.env.logger.LOGGER is not None:
        shared.env.logger.LOGGER.info(f"ERROR-TERM: TOTAL-LEGS: {simulated_num_legs}/{observed_num_legs}; OD: {od_discrepancy}; BALANCE: {balance_discrepancy}; OVERALL-ERROR: {error_val}")

    return {'value': error_val, 'base': error_base}

In [19]:
## Function that take activity_schedule at input and return calculated output statistics
# Params:
# - /

def fn_process_activities():
    """Load the activity schedule file, store a timestamped copy and compute output stats.

    Returns:
    - the result of fn_output_od_mode_balance for the loaded schedule.
    """
    # Get global settings object
    setting_obj = shared.env.settings

    # Load activities (the file has no header row)
    schedule_columns = ["person_id","tour_no","tourType","stop_no","stop_type","stop_location","stopZone","stop_mode","primary_stop","arrival_time","departure_time","prev_stop_location","prev_stopZone","prev_stop_departure_time","pid"]
    activity_schedule = pd.read_csv(setting_obj.ACTIVITY_FILE, names=schedule_columns, header=None)

    # TEMPORAL: save file
    snapshot_path = setting_obj.OBJECT_PATH + setting_obj.FINGERPRINT + "_activity_schedule_" + str(time.time()) + ".csv"
    activity_schedule.to_csv(snapshot_path)

    # Calculate outcomes: OD matrix, mode balance and workers share
    return fn_output_od_mode_balance(activity_schedule)

In [20]:
## Simulation function updates config file for simulation, runs the simmobility simulation (preday), reads the result and calculate measure of adequacy (similarity) of the result
# Params:
# - parameter dataframe (including name of parameter, model specification and declaration type (local or property))

def fn_simulation(params):
    """Run one full iteration: configure, simulate, process and score.

    Params:
    - params: parameter dataframe (including name of parameter, model
      specification and declaration type)

    Returns:
    - dict with 'config_output' (status of the parameter updates),
      'inadequacy' (error value) and 'base_residuals' (error base).
    """
    # Write the parameter values into the model files
    config_status = fn_simulation_config(params)

    # Run the simulation itself
    fn_simulation_call()

    # Turn the generated activity schedule into output statistics
    iteration_output = fn_process_activities()

    # Score the output statistics against the real world data (stats)
    discrepancy = fn_quantify_inadequacy(iteration_output)
    if shared.env.settings.MOCKING_MODE:
        # Mocking mode: perturb the error value with random noise
        discrepancy["value"] += np.random.normal(0, 3)

    result = {}
    result["config_output"] = config_status
    result["inadequacy"] = discrepancy["value"]
    result["base_residuals"] = discrepancy["base"]
    return result

In [21]:
def fn_perform_simulation(value, param_def):
    """Simulate one parameter vector and return its inadequacy.

    Params:
    - value: parameter values, stored into the 'value' entry of param_def
    - param_def: parameter definition table (modified in place)

    Returns:
    - the 'inadequacy' entry of the fn_simulation result.
    """
    # Store the candidate values next to their parameter definitions
    param_def["value"] = value

    # Run the full simulation iteration
    iteration_result = fn_simulation(param_def)

    # Warnings for particular param not being applied
    # ... TODO

    # Feed the residuals into the value pools used by the sampler.
    # NOTE(review): only `update_value_pools` is defined in the visible part of
    # this notebook - confirm that `fn_update_value_pools` exists.
    residuals = iteration_result["base_residuals"]
    fn_update_value_pools(value, residuals, param_def)

    # Outcome/inadequacy of this iteration
    return iteration_result["inadequacy"]

In [22]:
def fn_perform_simulation_only(value, param_def):
    """Simulate one parameter vector and return the raw activity schedule.

    Params:
    - value: parameter values, stored into the 'value' entry of param_def
    - param_def: parameter definition table (modified in place)

    Returns:
    - the activity schedule DataFrame read from the simulation output; a
      timestamped copy is also written to disk.
    """
    ## Get global settings object
    setting_obj = shared.env.settings

    # Store the candidate values next to their parameter definitions
    param_def["value"] = value

    # Write the parameter values into the model files
    param_config_output = fn_simulation_config(param_def)

    # Run the simulation itself
    fn_simulation_call()

    # Load activities (the file has no header row)
    schedule_columns = ["person_id","tour_no","tourType","stop_no","stop_type","stop_location","stopZone","stop_mode","primary_stop","arrival_time","departure_time","prev_stop_location","prev_stopZone","prev_stop_departure_time","pid"]
    activity_schedule = pd.read_csv(setting_obj.ACTIVITY_FILE, names=schedule_columns, header=None)

    # Save a copy tagged with the current whole-second timestamp
    timestamp = str(int(time.time()))
    snapshot_path = setting_obj.OBJECT_PATH + setting_obj.FINGERPRINT + "_activity_schedule_" + timestamp + ".csv"
    activity_schedule.to_csv(snapshot_path, index=False)

    return activity_schedule

In [23]:
def fn_scaleup_standard(l_sample, p_def):
    """Rescale each column of a unit sample to the bounds given per parameter.

    Params:
    - l_sample: 2-D array, one column per parameter
    - p_def: 2-D array whose row i holds (lower, upper) for column i

    Returns:
    - array of the same shape with column i mapped to
      l_sample[:, i] * (upper_i - lower_i) + lower_i.
    """
    column_ids = np.arange(l_sample.shape[1])
    lower = p_def[column_ids, 0]
    upper = p_def[column_ids, 1]
    # Broadcasting applies each column's range and offset in one step
    return (l_sample[:, column_ids] * (upper - lower)) + lower

In [24]:
# def fn_preday_sampling(vals=None param_space=None, sample_size=1, **kwargs):
#     is_initial = True if (vals is None or param_space['space'].shape[0] == 0) else False
    
#     enabled_idx = np.arange(param_space['definition'].shape[0])
#     if 'enabled' in param_space['definition'].dtype.names:
#         enabled_idx = np.where(param_space['definition']['enabled'] == True)[0]
#     dim_names = param_space['definition']['parameter']
#     sample_dim = len(enabled_idx)
    
#     genereted_sample = np.tile(param_space['definition']['initial'], (sample_size, 1))
    
#     if is_initial:
#         genereted_sample[:, enabled_idx] = (lhs.create_sample(param_space['definition'][enabled_idx], sample_size)).T
#         genereted_sample = np.vstack([param_space['definition']['initial'], genereted_sample])
#         genereted_sample = genereted_sample.astype(param_space['definition'].dtype)
#         genereted_sample.dtype.names = param_space['definition'].dtype.names
        
#         return genereted_sample
    
#     if 'SAMPLING_OPTIMAL_TOLERANCE' in kwargs and kwargs['SAMPLING_OPTIMAL_TOLERANCE'] > 0:
#         min_inadequacy = np.nanmin(param_space['space'][param_space['space'][shared.env['settings']['target_col']].notna()][shared.env['settings']['target_col']])
#         vals = param_space['space'][param_space['space'][shared.env['settings']['target_col']] < (min_inadequacy + kwargs['SAMPLING_OPTIMAL_TOLERANCE'])][dim_names].dropna()
        
#         if vals.shape[0] == 1:
#             vals = np.array([tuple(vals.values.tolist()[0])], dtype=param_space['definition'].dtype)
#         else:
#             vals = vals.to_records(index=False)
    
#     if vals.shape[0] == 1:
#         vals = np.tile(vals[:, enabled_idx], (sample_size, 1))
#     else:
#         vals = vals[:, enabled_idx]
    
#     tours_intensity = [0.6, 0.6]  # high and low intensity
#     mode_intensity = [0.5, 0.5]
#     od_intensity = [0.5, 0.5]
#     none_intensity = [0.2, 0.2]
#     sampling_intensities = {
#         'tours': tours_intensity,
#         'mode': mode_intensity,
#         'od': od_intensity,
#         'none': none_intensity
#     }

In [25]:
def fn_preday_sampling(vals=None, param_space=None, sample_size=1, **kwargs):
    """Generate the next sample of parameter values.

    Params:
    - vals: current parameter values; None triggers the initial sampling branch
    - param_space: mapping; the keys read here are 'definition', 'space',
      'active' and 'names'
    - sample_size: number of rows to generate
    - kwargs: optional 'SAMPLING_OPTIMAL_TOLERANCE'; when > 0, `vals` is
      replaced by the rows of param_space['space'] whose target column lies
      within that tolerance of the minimum

    Returns:
    - initial branch: the array named `genereted_sample` (initial values
      stacked on top of the generated rows)
    - otherwise: `generated_sample`, one column per active parameter, each
      column holding the chosen current value with a share of its entries
      redrawn from a normal distribution around that value; the initial
      parameter values may be stacked on top once.

    NOTE(review): this function mixes `shared.env[...]` and `shared_env[...]`,
    and several locals are computed twice - see the inline notes; confirm
    before relying on it.
    """
    # check if the given vals and param_space arguments are None or empty
    # NOTE(review): debug output, printed on every call
    print("-------------------------------vals------------------------------------------------",vals)
    print("-------------------------------param_space------------------------------------------------",param_space)
    print("-------------------------------sample_size------------------------------------------------",sample_size)
    is_initial = True if (vals is None or param_space['space'].shape[0] == 0) else False
    
    # get the indexes of enabled parameters
    enabled_idx = np.arange(param_space['definition'].shape[0])
    if 'enabled' in param_space['definition'].dtype.names:
        enabled_idx = np.where(param_space['definition']['enabled'] == True)[0]
    
    # get the names of all parameters in the definition
    dim_names = param_space['definition']['parameter']
    
    # get the number of enabled parameters
    sample_dim = len(enabled_idx)
    
    # create the generated sample numpy array (initial values repeated sample_size times)
    genereted_sample = np.tile(param_space['definition']['initial'], (sample_size, 1))
    
    # if the given vals and param_space arguments are None or empty, generate a new sample
    if is_initial:
        # NOTE(review): `lhs` is imported from pyDOE at the top of the notebook;
        # confirm that it actually provides `create_sample`.
        genereted_sample[:, enabled_idx] = (lhs.create_sample(param_space['definition'][enabled_idx], sample_size)).T
        genereted_sample = np.vstack([param_space['definition']['initial'], genereted_sample])
        genereted_sample = genereted_sample.astype(param_space['definition'].dtype)
        genereted_sample.dtype.names = param_space['definition'].dtype.names
        
        return genereted_sample
    
    # if the SAMPLING_OPTIMAL_TOLERANCE keyword argument is given and its value is greater than 0,
    # find the minimum value of target_col column in the param_space array and 
    # filter the rows where the target_col value is less than the sum of the minimum value and SAMPLING_OPTIMAL_TOLERANCE
    if 'SAMPLING_OPTIMAL_TOLERANCE' in kwargs and kwargs['SAMPLING_OPTIMAL_TOLERANCE'] > 0:
        min_inadequacy = np.nanmin(param_space['space'][param_space['space'][shared.env['settings']['target_col']].notna()][shared.env['settings']['target_col']])
        vals = param_space['space'][param_space['space'][shared.env['settings']['target_col']] < (min_inadequacy + kwargs['SAMPLING_OPTIMAL_TOLERANCE'])][dim_names].dropna()
        
        if vals.shape[0] == 1:
            vals = np.array([tuple(vals.values.tolist()[0])], dtype=param_space['definition'].dtype)
        else:
            vals = vals.to_records(index=False)
    
    # if there is only one row in the vals array, tile it to create a new vals array with the same shape as the generated sample
    if vals.shape[0] == 1:
        vals = np.tile(vals[:, enabled_idx], (sample_size, 1))
    else:
        vals = vals[:, enabled_idx]
    
    # set the intensities for each sampling method
    # NOTE(review): this local table is overwritten below by
    # shared_env['settings']['SAMPLING_INTENSITY'], so these values are unused.
    tours_intensity = [0.6, 0.6]  # high and low intensity
    mode_intensity = [0.5, 0.5]
    od_intensity = [0.5, 0.5]
    none_intensity = [0.2, 0.2]
    sampling_intensities = {
        'tours': tours_intensity,
        'mode': mode_intensity,
        'od': od_intensity,
        'none': none_intensity
    }

    # Enable only parameters in the active subspace
    # (replaces the enabled_idx / dim_names / sample_dim computed above)
    enabled_idx = np.where(param_space['active'])[0]
    dim_names = param_space['names'][enabled_idx]
    sample_dim = len(enabled_idx)
    
    # Set up intensity values for each parameter
    # NOTE(review): `shared_env` is not defined in the visible part of the
    # notebook; every other cell uses `shared.env` - confirm.
    sampling_intensities = shared_env['settings']['SAMPLING_INTENSITY']
    sampling_intensity = {}
    for s_intensity in sampling_intensities:
        
        # Fewer than two residuals recorded: use the first intensity entry
        if (shared_env['param_space']['residual_pool'][s_intensity] is None 
            or len(shared_env['param_space']['residual_pool'][s_intensity]) < 2):
            sampling_intensity[s_intensity] = sampling_intensities[s_intensity][0]
            continue
        
        # Absolute changes between the most recent residuals (at most the last three differences)
        tail_diff = np.abs(np.diff(shared_env['param_space']['residual_pool'][s_intensity])[-3:])
        thrs = shared_env['settings']['EPSILON'][s_intensity]
        
        # Pick entry [0] or [1] depending on whether any recent change exceeds the thresholds
        if not np.any(tail_diff > thrs['focused_sampling']):
            sampling_intensity[s_intensity] = sampling_intensities[s_intensity][0] 
        elif not np.any(tail_diff > thrs['spread_sampling']):
            sampling_intensity[s_intensity] = sampling_intensities[s_intensity][1]
        else:
            sampling_intensity[s_intensity] = sampling_intensities[s_intensity][0]
            
    # Generate sample for each parameter
    generated_sample = np.zeros((sample_size, len(dim_names)))
    for i, c_i in enumerate(enabled_idx):
        
        p_def = param_space['definition'][c_i]
        p_intensity = sampling_intensity[p_def['direct_influence']]
        # Number of entries of this column that get redrawn
        p_replaced_size = int(np.ceil(sample_size * p_intensity))
        # Current value of the parameter, taken from a randomly chosen row of vals
        # NOTE(review): vals was already restricted to the earlier enabled_idx
        # columns above, yet is indexed here with the definition index c_i - confirm.
        p_current = vals[np.random.choice(np.arange(vals.shape[0])), c_i]
        
        # NOTE(review): the bounds are only needed by the commented-out uniform
        # sampling below; the active normal sampling uses p_tunnel_bound only.
        p_lower_bound = p_def['lower_limit']
        p_upper_bound = p_def['upper_limit']
        p_tunnel_bound = 2
        if 'tunnel_width' in p_def:
            p_tunnel_bound = p_def['tunnel_width']
            p_lower_bound = p_current - p_tunnel_bound
            p_upper_bound = p_current + p_tunnel_bound
        
        # Start from the current value everywhere, then redraw a random subset of positions
        p_sample = np.repeat(p_current, sample_size)
        p_sample_replace = np.random.choice(np.arange(sample_size), p_replaced_size, replace=False)
        
        # Uniform sampling
        # p_sample[p_sample_replace] = np.random.uniform(p_lower_bound, p_upper_bound, p_replaced_size)
        
        # Normal around the optimal sampling (standard deviation = p_tunnel_bound)
        p_sample[p_sample_replace] = np.random.normal(p_current, p_tunnel_bound, p_replaced_size)
        
        generated_sample[:, i] = p_sample
        
    # Add initial parameter values to the generated sample (done at most once,
    # tracked through the IMPUTED_INITIAL_VALUE_SET setting)
    if not shared_env['settings'].get('IMPUTED_INITIAL_VALUE_SET', False):
        if not np.allclose(param_space['space'][0, :len(enabled_idx)], 
                            np.array([p['initial'] for p in param_space['definition'][enabled_idx]])):
            # NOTE(review): init_param_values covers all parameters of the
            # definition while generated_sample has one column per active
            # parameter - confirm the shapes match for vstack.
            init_param_values = np.array([p['initial'] for p in param_space['definition']])
            generated_sample = np.vstack((init_param_values, generated_sample))
        shared_env['settings']['IMPUTED_INITIAL_VALUE_SET'] = True
    
    return generated_sample

In [26]:
## Function to update the value pool that will be used in sampling

def update_value_pools(params, base_residuals, param_def):
    """Append the residuals of the latest iteration to the shared residual pools.

    Params:
    - params: parameter values of the iteration (not used here)
    - base_residuals: residuals keyed by component name; components that it
      does not provide are left untouched
    - param_def: parameter definition table (not used here)

    Every component of shared.env['param_space']['residual_pool'] except
    'none' is extended in place. In mocking mode, standard normal noise is
    added to the new residuals first.
    """
    # Get global settings object
    setting_obj = shared.env['settings']
    mocking_mode = setting_obj['MOCKING_MODE']

    residual_pool = shared.env['param_space']['residual_pool']
    residual_components = [comp for comp in residual_pool.keys() if comp != 'none']

    for comp in residual_components:
        # base_residuals may be an empty list or lack a component; skip those
        # components instead of failing on the lookup.
        try:
            component_residuals = base_residuals[comp]
        except (KeyError, IndexError, TypeError):
            continue

        # np.concatenate rejects 0-d arrays, so scalars are lifted to 1-D first
        new_values = np.atleast_1d(np.asarray(component_residuals, dtype=np.float64)).ravel()
        if mocking_mode:
            new_values = new_values + np.random.normal(size=new_values.shape)

        # A pool entry that is still None counts as empty
        current_values = residual_pool[comp]
        if current_values is None:
            current_values = np.empty(0, dtype=np.float64)
        else:
            current_values = np.atleast_1d(np.asarray(current_values, dtype=np.float64)).ravel()

        residual_pool[comp] = np.concatenate((current_values, new_values))


# Alias following the fn_ naming used by the other helpers in this notebook;
# fn_perform_simulation calls the function under this name.
fn_update_value_pools = update_value_pools

In [27]:
## Function that loads definition of param data.frame with provided values (if any)
def fn_load_params_space_definition(filepath, omit = [], sys_modules = ["preday"], tunnel_constraint = True, update_initial = False):
    """Load the parameter definition table from a CSV file and build the parameter space.

    The CSV is expected to provide the columns ``module``, ``model``,
    ``param`` (text of the form ``<declaration><name> = <value>``),
    ``include`` (boolean), ``lower_limit`` and ``upper_limit``. An
    ``initial`` column is also required unless ``update_initial`` is True.

    Params:
    - filepath: path or buffer accepted by ``pd.read_csv``
    - omit: either a list of ``parameter`` names whose rows are removed, or
      the name of a boolean column; in the latter case the flagged rows are
      kept but reported with ``changed = False`` and the column is dropped
    - sys_modules: only rows whose ``module`` is in this list are kept
    - tunnel_constraint: when True, limits are rewritten as
      ``initial +/- width`` (disabled parameters are pinned to ``initial``)
    - update_initial: when True, ``initial`` is recomputed from the file
      (parsed value for enabled parameters, ``lower_limit`` otherwise)

    Returns a dict with keys:
    - 'definition': one row per parameter (without the ``value`` column)
    - 'space': single-row DataFrame (index ``'value'``) with one column per
      parameter, in the same order as 'definition'
    - 'value_pool': dict mapping each parameter name to an empty list
    - 'residual_pool': dict with empty lists for 'tours', 'mode' and 'od'
    """
    no_tours_models = ["dpb","dps","dpt","nte","ntw","nto","nts","isg"]
    mode_balance_models = ["tme","tmw","stmd"]

    # Read file content; copy the filtered rows so the column assignments
    # below do not write into a slice of the original frame.
    file_content = pd.read_csv(filepath)
    file_content = file_content[file_content['module'].isin(sys_modules) & file_content['include']].copy()

    # Split "<left hand side> = <value>" on the first '=' only
    file_content[['param_name', 'value']] = file_content['param'].str.split('=', n=1, expand=True)
    # Everything preceding the first "beta"/"cons" is the declaration prefix
    file_content['declaration'] = file_content['param'].str.extract('(.+?(?=beta|cons))', expand=False)
    file_content['param_name_left_hand'] = file_content['param_name']
    # regex=True must be explicit: pandas >= 2.0 treats the pattern literally
    # by default, which would leave the declaration prefix in place.
    file_content['param_name'] = (file_content['param_name']
                                  .str.replace(r'(local |bundled_variables\.)', '', regex=True)
                                  .str.strip())
    file_content['parameter'] = file_content['model'] + '_' + file_content['param_name']
    file_content['raw_value'] = file_content['value'].str.strip()
    file_content['value'] = file_content['value'].str.replace(' ','').astype(float)
    file_content['direct_influence'] = np.where(file_content['model'].isin(no_tours_models), 'tours',
                                                np.where(file_content['model'].isin(mode_balance_models), 'mode', 'none'))
    # A parameter is tunable only when its limits leave some room
    file_content['enabled'] = (file_content['lower_limit'] != file_content['upper_limit'])

    # Enabled parameters start from the value in the file; disabled ones are
    # pinned to their lower limit
    if update_initial:
        file_content['initial'] = np.where(file_content['enabled'], file_content['value'], file_content['lower_limit'])

    # Omit specified parameters
    if isinstance(omit, str):
        # `omit` names a boolean column: remember the flagged parameters
        # (they end up with changed=False) and drop the helper column.
        omit_col_name = omit
        omit = file_content.loc[file_content[omit]].parameter.tolist()
        file_content = file_content.drop(columns=omit_col_name)
    else:
        file_content = file_content[~file_content['parameter'].isin(omit)]

    # Apply tunnel constraint (lower and upper bounds around current value +/- t_width)
    if tunnel_constraint:
        t_width = 2.0
        t_restricted_width = 0.05
        is_disabled = ~file_content['enabled']
        is_logsum = file_content['param_name'].str.contains('logsum', na=False)
        file_content['lower_limit'] = np.where(is_disabled, file_content['initial'],
                                               np.where(is_logsum,
                                                        file_content['initial'] - t_restricted_width,
                                                        file_content['initial'] - t_width))
        file_content['upper_limit'] = np.where(is_disabled, file_content['initial'],
                                               np.where(is_logsum,
                                                        file_content['initial'] + t_restricted_width,
                                                        file_content['initial'] + t_width))
        # NOTE(review): every row gets the wide width, including logsum rows
        # whose limits use t_restricted_width - confirm this is intended.
        file_content['tunnel_width'] = t_width

    # Return output as dictionary
    definition = file_content.drop(columns='value').assign(changed=~file_content['parameter'].isin(omit),
                                                            init_lower_limit=file_content['lower_limit'],
                                                            init_upper_limit=file_content['upper_limit'])
    # One row of current values with one column per parameter. All columns
    # are kept so the space stays aligned with `definition` and `value_pool`.
    space = file_content[['value']].T
    space.columns = file_content['parameter'].tolist()
    value_pool = {col: [] for col in definition['parameter']}
    residual_pool = {'tours': [], 'mode': [], 'od': []}

    return {'definition': definition,
            'space': space,
            'value_pool': value_pool,
            'residual_pool':residual_pool}
