# Test Problems Data Estimation

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from scipy import stats
from datetime import datetime
from datetime import timedelta
from typing import Union, List

In [None]:
# Random seed, kept fixed for reproducibility.
seed_value = 42

# Unit-conversion constants.
MINUTES_IN_HOUR = 60
SECONDS_IN_HOUR = 3600

# Factor used to scale the system.
SCALING_FACTOR = 400

# Folder that contains the input csv files.
data_folder_path = 'MayJulyWeekdays/'

# Width of one time bucket, in minutes.
precision = 5

# Part of the day under study: 7 AM until midnight.
start_hour, end_hour = 7, 24

# The same window expressed as bucket indices counted from midnight.
time_focus_start = (start_hour * MINUTES_IN_HOUR) // precision  # 7 AM
time_focus_end = (end_hour * MINUTES_IN_HOUR) // precision  # midnight

# Number of buckets inside the window.
length_time = int(time_focus_end - time_focus_start)

In [None]:
def get_file_list(folder_path, file_pattern):
    """
    Return the sorted list of file paths in a folder that match a glob pattern.

    Parameters:
    - folder_path: str - Path to the folder where files are located.
    - file_pattern: str - Glob pattern to match files (e.g. "*_agent_records.csv").

    Returns:
    - list: Matching file paths in ascending (lexicographic) order. The list is
      empty, and a message is printed, when nothing matches.

    Raises:
    - FileNotFoundError: If folder_path does not exist.
    """

    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The specified folder path does not exist: {folder_path}")

    # glob returns matches in arbitrary, platform-dependent order. Sorting makes
    # every run process the files in the same order, which matters wherever
    # files from different lists are paired by position.
    file_list = sorted(glob.glob(os.path.join(folder_path, file_pattern)))

    if not file_list:
        print(f"No files found in {folder_path} matching the pattern '{file_pattern}'")

    return file_list

## 1. Main Test Problem 

In [None]:
# Number of customer classes (dimensions) of the main test problem; it is used
# below to name the output folder and files and to size the arrival CDF matrix.
class_no = 17 #main test problem has 17 dimensions (17 different classes)

In [None]:
# Create (if it does not exist yet) the folder in which the estimated system
# parameters are saved: <data_folder_path>/problem_<class_no>dim

main_test_folder_path = os.path.join(data_folder_path,f"problem_{class_no}dim")
os.makedirs(main_test_folder_path, exist_ok=True)

### 1.1 Pre-limit arrival process

In [None]:
# Customer sub-call files found in the data folder (matched by file-name suffix).
file_list_cust_subcalls = get_file_list(data_folder_path, file_pattern="*_cust_subcalls.csv")

In [None]:
def filt_arrivals(file_path,precision,service_lookup,service,node=0):
    """
    Count call arrivals per time interval for a single data file.

    The file is read, cleaned (outliers, abnormal outcomes, repeated records),
    optionally restricted to one service (and node, for service 1), and the
    remaining calls are counted in bins of `precision` minutes.

    Parameters:
    - file_path: str - Path to the data file.
    - precision: int - Bin width in minutes used for resampling.
    - service_lookup: bool - Whether to filter by service.
    - service: int - Service type to keep when service_lookup is True.
    - node: int - Node to keep for service 1. With the default 0, every node
      except 5, 6 and 7 is kept. Ignored for other services.

    Returns:
    - DataFrame: Columns ['index', 'call_id'], where 'index' is the interval
      label 'hour:minute' (no zero padding) and 'call_id' holds the number of
      calls counted in that interval.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading file {file_path}: {e}")
        
    
    """Remove outliers based on queue and service time thresholds."""
    data = data[data["queue_time"] < 900] 
    data = data[data["service_time"] < 1800] 
    
    """Remove calls with specified abnormal outcomes."""
    # Every record of a call is dropped as soon as one of its records has an
    # abnormal outcome.
    abnormal_outcomes = [4,13,14,23,30,40,50]
    calls_with_abnormal_outcome = data[data["outcome"].isin(abnormal_outcomes)]
    call_ids_abnormal_outcome = calls_with_abnormal_outcome["call_id"].unique()
    drop_index = data[data["call_id"].isin(call_ids_abnormal_outcome)].index
    data = data.drop(index = drop_index)
    
    # Keep only the first customer sub-call of each call.
    data = data[data["cust_subcall"] == 1]
    
    # A call can have several records with the same call_id (e.g. it is first
    # put on hold and then transferred to an agent). Keep one row per call:
    # the one with the largest segment_start.
    data = data.sort_values(["segment_start"])
    data = data.drop_duplicates("call_id",keep = "last")
      
    """Filter data based on service and node criteria."""
    if service_lookup == True:
        data = data[data["service"] == service]
        
        # The node filter only applies to service 1.
        if (service == 1) and (node != 0):
            data = data[data["node"].isin([node])]
        elif (service == 1) and (node == 0):
            node_drop_index = data[(data["service"] == 1) & data["node"].isin([5,6,7])].index #keep only nodes 1,2,3
            data = data.drop(index = node_drop_index)
            
    
    # Renumber the rows 0..len-1 so that the positional loop below can index
    # data["segment_start"] by i.
    data = data.reset_index()
    
    # Build a readable start time for every call.
    # NOTE(review): segment_start is treated as epoch seconds, shifted by 5 hours
    # and converted with the machine's local time zone — presumably a time-zone
    # correction; confirm the intended offset.
    start = []
    for i in range(len(data)):
        start.append(datetime.fromtimestamp(data["segment_start"][i]+5*60*60).strftime("%A, %B %d, %Y %H:%M:%S")) 
    
    data["start"] = start
    data['Datetime'] = pd.to_datetime(data['start'])
    data["day"] = pd.to_datetime(start).date
    new_data = data
    # One row per distinct calendar day, in order of first appearance.
    drop_ind = new_data.drop_duplicates(subset = ["day"])
    
    # If the file covers more than one calendar day, discard the rows of the
    # second distinct day.
    if len(drop_ind) > 1:
        drop_day = np.array(drop_ind.day)[1]
        data = data[data["day"] != drop_day]
    
    data = data.set_index('Datetime')
    
    # Resampling rule, e.g. "5T" for 5-minute bins.
    resolution = "{pre}T"
    
    # Number of arrivals in each time interval (count of non-null values per column).
    data = data.resample(resolution.format(pre = precision)).count()
    # NOTE(review): count() does not produce NaN, so this looks like a no-op.
    data = data.dropna(axis = 0)
    
    # Label every interval as "hour:minute" without zero padding.
    new_index = []
    for j in data.index:
        h=j.hour
        m=j.minute
        item=str(h)+':'+str(m)
        new_index.append(item)
        
    data["index"] = new_index
    data = data.reset_index()
    
    return data[["index","call_id"]]

In [None]:
def format_time(hour, minute):
    """
    Formats a time of day as 'hour:minute' without zero padding (e.g. '7:5').

    The unpadded form is deliberate: the labels have to match interval keys
    that are built elsewhere in this file as str(hour) + ':' + str(minute).
    """
    return f"{hour}:{minute:d}"


def create_time_intervals(precision=5):
    """
    Generates the labels of all intervals of one day, starting at '0:0'.

    Parameters:
    - precision: int - Interval width in minutes; must be a positive divisor
      of 60 so that every hour is split into whole intervals. Default is 5,
      which yields '0:0', '0:5', ..., '23:55'.

    Returns:
    - list of str: Interval labels in chronological order.

    Raises:
    - ValueError: If precision is not a positive integer that divides 60.
    """
    step = int(precision)
    if step != precision or step <= 0 or 60 % step != 0:
        raise ValueError(f"precision must be a positive divisor of 60, got {precision!r}")

    return [format_time(hour, minute) for hour in range(24) for minute in range(0, 60, step)]

def merge_daily_data(file_list, precision, service_lookup, service, node):
    """
    Builds one table with the per-interval arrival counts of every day.

    Each file in file_list is reduced to per-interval counts with
    filt_arrivals and left-joined onto the full list of intervals of a day,
    so the result has one row per interval and one count column per file.
    Intervals without arrivals are filled with 0.

    Parameters:
    - file_list: list of str - List of file paths.
    - precision: int - Time precision for resampling.
    - service_lookup: bool - Whether to filter by service.
    - service: int - Service type to filter.
    - node: int - Node number to filter.

    Returns:
    - DataFrame: Counts indexed by interval label, one column per file.

    Raises:
    - ValueError: If file_list is empty.
    """
    if not file_list:
        raise ValueError("The file list is empty.")

    # Start from an empty frame whose index lists every interval of the day.
    merged = pd.DataFrame(index=pd.Index(create_time_intervals()))

    # The first day is joined against that index; the join leaves the interval
    # labels in an "index" column, which the later joins use as key.
    first_day = filt_arrivals(file_list[0], precision, service_lookup, service, node)
    merged = merged.merge(
        first_day,
        how="left",
        left_on=merged.index,
        right_on="index",
        suffixes=('', '_0'),
    )
    merged = merged.fillna(0)

    # Remaining days: one additional count column each, suffixed by position.
    for position in range(1, len(file_list)):
        day_counts = filt_arrivals(file_list[position], precision, service_lookup, service, node)
        merged = merged.merge(
            day_counts,
            how="left",
            left_on="index",
            right_on="index",
            suffixes=('', f'_{position}'),
        )

    # Intervals that had no arrivals on a given day count as zero.
    merged = merged.fillna(0)

    return merged.set_index("index")
        

def means_totals(file_list, precision, service_lookup, service, node=0):
    """
    Averages the per-interval arrival counts over all given days.

    Parameters:
    - file_list: list of str - List of file paths.
    - precision: int - Time precision for resampling.
    - service_lookup: bool - Whether to filter by service.
    - service: int - Service type to filter.
    - node: int - Node number to filter, default is 0.

    Returns:
    - Series: Mean number of arrivals for each time interval.
    """
    counts_per_day = merge_daily_data(file_list, precision, service_lookup, service, node)

    # axis=1 averages across the day columns, giving one value per interval row.
    interval_means = counts_per_day.mean(axis=1)
    return interval_means


In [None]:
def process_class_data(class_index, file_list, precision, nodes=None):
    """
    Computes the mean arrival profile of one class, optionally split by node.

    Parameters:
    - class_index: int - The class index to process.
    - file_list: list - List of file paths for data processing.
    - precision: int - Time precision for resampling.
    - nodes: list or None - Node numbers to process for the class. If None (or
      empty), the class is processed once with node=0.

    Returns:
    - dict: Maps 'class_<index>_node_<node>' (one entry per node) or
      'class_<index>' (no nodes) to the Series returned by means_totals.
    """
    # Without nodes there is a single entry for the whole class.
    if not nodes:
        return {
            f'class_{class_index}': means_totals(
                file_list, precision, service_lookup=True, service=class_index, node=0
            )
        }

    # Otherwise one entry per requested node, in the order given.
    return {
        f'class_{class_index}_node_{node}': means_totals(
            file_list, precision, service_lookup=True, service=class_index, node=node
        )
        for node in nodes
    }

In [None]:
def save_data_to_csv(data, filename, time_slice=None):
    """
    Writes data to a comma-separated text file, optionally only a slice of it.

    Parameters:
    - data: DataFrame or Series - The data to be saved.
    - filename: str - The name of the file to save the data.
    - time_slice: tuple of int, optional - (start, end) bounds of the slice
      data[start:end] to save. If None, all data is saved.

    Errors raised while writing are not propagated; a message is printed instead.
    """
    payload = data[time_slice[0]:time_slice[1]] if time_slice else data

    try:
        np.savetxt(filename, payload, delimiter=",")
    except Exception as e:
        print(f"Error saving data to {filename}: {e}")

In [None]:
# Mean arrivals per interval over all days, without filtering by class.
# The configured `precision` is passed (not a literal 5) so that the bin width
# always agrees with the width encoded in the output file name.
total_arrivals = means_totals(file_list_cust_subcalls, precision=precision, service_lookup=False, service=0, node=0)
file_name = os.path.join(main_test_folder_path, f"main_test_total_arrivals_partial_{precision}min.csv")
# Only the focus window of the day is saved.
save_data_to_csv(total_arrivals, file_name, time_slice=(time_focus_start, time_focus_end))

In [None]:
# Class indices as they appear in the US Bank data set (15 entries; class 1 is
# split into three nodes below, which gives 17 arrival profiles in total).
class_indices = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17]

# Mean arrival profile per class, keyed 'class_<i>' or 'class_1_node_<n>'.
# Entries are inserted in class order; later code relies on that order.
results_by_class = {}

for class_ind in class_indices:
    if class_ind == 1:  # class 1 is the retail class, processed per node
        results_by_class.update(process_class_data(class_ind, file_list_cust_subcalls, precision, nodes=[1, 2, 3]))
    else:
        results_by_class.update(process_class_data(class_ind, file_list_cust_subcalls, precision))

In [None]:
# Save every arrival profile twice: "_all_" holds the whole day, "_partial_"
# only the focus window [time_focus_start, time_focus_end).
for class_ind in class_indices:
    if class_ind == 1:  # Assuming class 1 is the retail class
        # Retail class is divided into three nodes, each saved to its own files
        for node_num in [1, 2, 3]:
            file_name_all = os.path.join(main_test_folder_path, f"main_test_arrivals_class{class_ind}_node{node_num}_all_{precision}min.csv")
            file_name_partial = os.path.join(main_test_folder_path, f"main_test_arrivals_class{class_ind}_node{node_num}_partial_{precision}min.csv")
            save_data_to_csv(results_by_class[f"class_{class_ind}_node_{node_num}"], file_name_all)
            save_data_to_csv(results_by_class[f"class_{class_ind}_node_{node_num}"], file_name_partial, time_slice=(time_focus_start, time_focus_end))
    # remaining classes other than the Retail class: one pair of files per class
    else:
        file_name_all = os.path.join(main_test_folder_path, f"main_test_arrivals_class{class_ind}_all_{precision}min.csv")
        file_name_partial = os.path.join(main_test_folder_path, f"main_test_arrivals_class{class_ind}_partial_{precision}min.csv")
        save_data_to_csv(results_by_class[f"class_{class_ind}"], file_name_all)
        save_data_to_csv(results_by_class[f"class_{class_ind}"], file_name_partial, time_slice=(time_focus_start, time_focus_end))

### 1.2 Service Times/Rates

In [None]:
# Agent record files found in the data folder (matched by file-name suffix).
file_list_agent_records = get_file_list(data_folder_path, file_pattern="*_agent_records.csv")

In [None]:
def filt_service(file_path,service,node=0):
    """
    Extracts the work times of one service (and node) from an agent-records file.

    Rows are removed when they are outliers (queue_time >= 900 or
    work_time >= 1800) or belong to a call that has an abnormal outcome; only
    first customer sub-calls of the requested service are kept.

    Parameters:
    - file_path: str - Path to the data file.
    - service: int - Service type to filter.
    - node: int - Node number to filter, default is 0. Only used when service is 1.

    Returns:
    - Series: The 'work_time' column (in seconds) of the remaining rows.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    try:
        records = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading file {file_path}: {e}")

    # Outliers, with thresholds taken from the distribution plots of queueing
    # time and work time.
    within_limits = (records["queue_time"] < 900) & (records["work_time"] < 1800)
    records = records[within_limits]

    # A call is discarded entirely when any of its remaining records has an
    # abnormal outcome.
    abnormal_outcomes = [4,13,14,23,30,40,50]
    tainted_ids = records.loc[records["outcome"].isin(abnormal_outcomes), "call_id"].unique()
    records = records[~records["call_id"].isin(tainted_ids)]

    # First customer sub-calls of the requested service ...
    wanted = (records["cust_subcall"] == 1) & (records["service"] == service)

    # ... restricted to a single node for the Retail class (service 1).
    if service == 1:
        wanted &= records["node"] == node

    return records.loc[wanted, "work_time"]

In [None]:
def means_service(file_list, service, node):
    """
    Calculates the average service time across multiple days for a given service and node.

    The work times of all files are pooled and averaged, so every record has
    the same weight regardless of the day it comes from.

    Parameters:
    - file_list: list of str - List of file paths.
    - service: int - Service type to filter.
    - node: int - Node number to filter (only relevant for service 1).

    Returns:
    - float: Average service time in seconds, or 0 if no record is left after filtering.

    Raises:
    - ValueError: If file_list is empty.
    """
    if not file_list:
        raise ValueError("The file list is empty.")

    # Every file, including the first one, must be filtered with the requested
    # node; otherwise the first day of a Retail node is filtered with node 0
    # and contributes no records.
    work_times = pd.concat(
        [filt_service(file_path, service, node) for file_path in file_list],
        axis=0,
    )

    if work_times.empty:
        return 0  # no data available for this service/node

    return float(work_times.mean())

In [None]:
def calculate_service_times(file_list, class_indices, class_names):
    """
    Builds the table of mean service times (in seconds) per class.

    Parameters:
    - file_list: list of str - List of file paths.
    - class_indices: list of int - Class index belonging to each class name.
    - class_names: list of str - Class names; they become the row labels.

    Returns:
    - DataFrame: Single column "Total" holding the service time of every
      class. A class whose computation fails gets None and a message is printed.
    """
    service_times_df = pd.DataFrame(columns=["Total"], index=class_names)

    for position, class_name in enumerate(class_names):
        # The first three rows are the retail class split by node (1, 2, 3);
        # all other rows are not split by node.
        if position < 3:
            node = position + 1
        else:
            node = 0

        try:
            service_times_df.loc[class_name, "Total"] = means_service(
                file_list, class_indices[position], node
            )
        except Exception as e:
            print(f"Error calculating service time for {class_name}: {e}")
            service_times_df.loc[class_name, "Total"] = None

    return service_times_df

In [None]:
# Names of the 17 classes; the first three are the retail class split by node.
class_names = ["Retail_Node1", "Retail_Node2", "Retail_Node3", "Premier", "Business", "Platinum", "Consumer_Loans", "Online_Banking", "EBO", "Telesales", "Subanco", "Case_Quality", "Priority_Service", "AST", "CCO", "Brokerage", "BPS"]

# Class index of each name above (same order); retail (1) appears once per node.
service_class_indices = [1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17]

service_times_17dim = calculate_service_times(file_list_agent_records, service_class_indices, class_names) #service times in seconds

In [None]:
# Save the mean service times (one value per class, in seconds)
file_name_service_times = os.path.join(main_test_folder_path, f"service_times_{class_no}dim.csv")
np.savetxt(file_name_service_times, service_times_17dim, delimiter = ",") #in seconds

# Save the hourly service rates: seconds per hour divided by the mean service
# time in seconds, written with two decimals
file_name_hourly_mu = os.path.join(main_test_folder_path, f"mu_hourly_{class_no}dim.csv")
np.savetxt(file_name_hourly_mu, SECONDS_IN_HOUR/service_times_17dim, delimiter = ",", fmt="%.2f")


### 1.3 Abandonment Times/Rates 

In [None]:
def filt_abandon(file_path,service,node=0):
    """
    Extracts queue times with an abandonment indicator for one service and node.

    The returned queue times serve as (possibly censored) observations of the
    time until abandonment: the observation is uncensored when the call
    outcome is 11 or 12 and censored otherwise.

    Parameters:
    - file_path: str - Path to the data file.
    - service: int - Service type to filter.
    - node: int - Node number to filter, default is 0. Only used when service is 1.

    Returns:
    - DataFrame: Columns ['call_id', 'segment_start', 'queue_time', 'abandon'],
      one row per call, where 'abandon' is 1 for outcomes 11/12 and 0 otherwise.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading file {file_path}: {e}")
    
    # Eliminating outliers through distribution plot of queueing and service times
    # (service 10 uses a larger queue-time threshold than the other services)
    queue_time_threshold = 1100 if service == 10 else 900
    data = data[(data["queue_time"] < queue_time_threshold) & (data["service_time"] < 1800)]

    # Removing calls with abnormal outcomes: every record of a call is dropped
    # as soon as one of its records has an abnormal outcome
    abnormal_outcomes = [4,13,14,23,30,40,50]
    calls_with_abnormal_outcome = data[data["outcome"].isin(abnormal_outcomes)]
    call_ids_abnormal_outcome = calls_with_abnormal_outcome["call_id"].unique()
    drop_index = data[data["call_id"].isin(call_ids_abnormal_outcome)].index
    data = data.drop(index = drop_index)
    
    # Focusing only on 1st customer subcalls
    data = data[data["cust_subcall"] == 1]
    
    # Keep one row per call: the one with the largest segment_start
    data = data.sort_values(["segment_start"])
    data = data.drop_duplicates("call_id", keep = "last")
    # Keep calls that either abandoned (outcome 11 or 12) or have party_answered > 1000
    # NOTE(review): presumably party_answered > 1000 identifies calls answered
    # by an agent — confirm against the data dictionary
    data = data[(data["party_answered"] > 1000) | (data["outcome"].isin([11,12]))]
    
    data = data[data["service"] == service]
    
    # If service is 1, focusing on a specific node
    if service == 1:
        data = data[data["node"] == node]
    
    # Recording outcome based on abandonment or not
    data.loc[data["outcome"].isin([11,12]),"abandon"] = 1 #if abandoned the abandonment time is not censored
    data.loc[~data["outcome"].isin([11,12]),"abandon"] = 0 #if not abandoned the abandonment time is censored
    
    return data[["call_id","segment_start","queue_time","abandon"]]

In [None]:
# Bias-corrected Kaplan-Meier estimator

# Kaplan-Meier integrator

def kmi(survival_times, censored_flags): 
    """
    Calculates the Kaplan-Meier Integrator (KMI) for survival times.

    Each observation, taken in ascending order of time, receives a
    Kaplan-Meier weight; censored observations get weight zero. If the largest
    observation is censored, the probability mass that is still unassigned is
    placed on it. The estimate is the weighted sum of the observed times.

    Parameters:
    - survival_times: array-like - Observed (possibly censored) survival times.
      Lists, NumPy arrays and pandas Series are accepted.
    - censored_flags: array-like - Binary flags of the same length: 1 if the
      time is uncensored, 0 if it is censored.

    Returns:
    - float: The estimated mean survival time.

    Raises:
    - ValueError: If the inputs differ in length or are empty.
    """
    # Coerce to plain arrays so that the positional indexing below also works
    # for lists and is not mistaken for label-based lookup on a pandas Series.
    times = np.asarray(survival_times, dtype=float)
    flags = np.asarray(censored_flags, dtype=float)

    if len(times) != len(flags):
        raise ValueError("Survival times and censored flags must be the same length.")

    n = len(times)
    if n == 0:
        raise ValueError("At least one observation is required.")

    # One permutation orders both arrays, keeping each time with its flag.
    order = np.argsort(times)
    sorted_times = times[order]
    sorted_flags = flags[order]

    # Weight of the i-th ordered observation (0-based), before applying its flag:
    #   1 / (n - i) * prod_{k < i} ((n - k - 1) / (n - k)) ** flag_k
    # The product is accumulated with cumprod; the factor of the last
    # observation is never needed.
    ranks = np.arange(n)
    step_factors = ((n - 1 - ranks[:-1]) / (n - ranks[:-1])) ** sorted_flags[:-1]
    running_product = np.concatenate(([1.0], np.cumprod(step_factors)))
    km_weights = running_product / (n - ranks)

    weighted_flags = km_weights * sorted_flags

    # Largest observation censored: give it the remaining mass so that the
    # weights sum to one.
    if sorted_flags[-1] == 0:
        weighted_flags[-1] = 1 - np.sum(weighted_flags)

    kmi_estimate = np.sum(weighted_flags * sorted_times)
    return float(kmi_estimate)

In [None]:
def calculate_bias(survival_times, censored_flags):
    """
    Calculates the jackknife estimate of the bias of the Kaplan-Meier integral.

    With observations ordered by time (1-based) and flags d_i, the value is

        -(n - 1) / n * T_n * d_n * (1 - d_{n-1})
            * prod_{i=1}^{n-2} ((n - 1 - i) / (n - i)) ** d_i

    so it is non-zero only when the largest observation is uncensored and the
    second largest is censored.

    Parameters:
    - survival_times: array-like - Observed (possibly censored) survival times.
      Lists, NumPy arrays and pandas Series are accepted.
    - censored_flags: array-like - Binary flags of the same length: 1 if the
      time is uncensored, 0 if it is censored.

    Returns:
    - float: The calculated bias (0.0 when there are fewer than two observations).

    Raises:
    - ValueError: If the inputs differ in length.
    """
    # Coerce to plain arrays so that the positional indexing below also works
    # for lists and is not mistaken for label-based lookup on a pandas Series.
    times = np.asarray(survival_times, dtype=float)
    flags = np.asarray(censored_flags, dtype=float)

    if len(times) != len(flags):
        raise ValueError("Survival times and censored flags must be the same length.")

    n = len(times)
    if n < 2:
        # The correction needs a largest and a second-largest observation.
        return 0.0

    order = np.argsort(times)
    sorted_times = times[order]
    sorted_flags = flags[order]

    # Product over the n - 2 smallest observations (0-based k = 0 .. n - 3):
    #   ((n - k - 2) / (n - k - 1)) ** flag_k
    # All n - 2 factors are required; stopping two factors early gives a wrong
    # value. For n == 2 the product is empty and equals 1.
    ranks = np.arange(n - 2)
    factors = ((n - ranks - 2) / (n - ranks - 1)) ** sorted_flags[:n - 2]
    product = np.prod(factors)

    bias_value = -(n - 1) / n * sorted_times[-1] * sorted_flags[-1] * (1 - sorted_flags[-2]) * product
    return float(bias_value)

In [None]:
def jackknife_estimation(survival_times, censored_flags):
    """
    Returns the bias-corrected Kaplan-Meier estimate of the mean survival time.

    The correction subtracts the bias given by calculate_bias from the plain
    Kaplan-Meier integral given by kmi.

    Parameters:
    - survival_times: array-like - An array of survival times.
    - censored_flags: array-like - A binary array indicating whether each survival time is censored (0) or not (1).

    Returns:
    - float: The bias-corrected Kaplan-Meier estimate.
    """
    uncorrected = kmi(survival_times, censored_flags)
    correction = calculate_bias(survival_times, censored_flags)
    return uncorrected - correction

In [None]:
def km_estimation(file_list, service, node):
    """
    Estimates the mean abandonment time of one service/node from several files.

    The filtered rows of all files are pooled; their queue times and
    abandonment flags are then passed to the bias-corrected Kaplan-Meier
    estimator.

    Parameters:
    - file_list: list of str - List of file paths.
    - service: int - Service type to filter.
    - node: int - Node number to filter.

    Returns:
    - float: Bias-corrected Kaplan-Meier estimate of mean abandonment times.

    Raises:
    - ValueError: If file_list is empty.
    """
    if not file_list:
        raise ValueError("File list is empty.")

    # One filtered frame per file, stacked row-wise in file order.
    per_file_frames = [
        pd.DataFrame(filt_abandon(file_path, service, node))
        for file_path in file_list
    ]
    pooled = pd.concat(per_file_frames, axis=0)

    # Observed times and their flags (1 = abandoned, 0 = censored).
    observed_times = pooled["queue_time"].to_numpy()
    observed_flags = pooled["abandon"].to_numpy()

    return jackknife_estimation(observed_times, observed_flags)

In [None]:
# Class names and class indices as they appear in the dataset (same order in
# both lists; the first three entries are the retail class split by node)
class_names = ["Retail_Node1", "Retail_Node2", "Retail_Node3", "Premier", "Business", "Platinum", "Consumer_Loans", "Online_Banking", "EBO", "Telesales", "Subanco", "Case_Quality", "Priority_Service", "AST", "CCO", "Brokerage", "BPS"]
service_class_indices = [1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17]

# DataFrame that stores the mean abandonment time of every class (column "Total")
abandonment_times_17dim = pd.DataFrame(index=class_names, columns=["Total"])

# Estimate the mean abandonment time of each class from the customer sub-call files
for i, class_name in enumerate(class_names):
    
    node = i + 1 if i < 3 else 0  # First three are retail classes with specific nodes
    abandonment_times_17dim.loc[class_name, "Total"] = km_estimation(file_list_cust_subcalls, service_class_indices[i], node) #abandonment times in second

In [None]:
# Save the mean abandonment times (one value per class, in seconds)
file_name_abandonment_times = os.path.join(main_test_folder_path, f"abandonment_times_{class_no}dim.csv")
np.savetxt(file_name_abandonment_times, abandonment_times_17dim, delimiter = ",")

# Save the hourly abandonment rates: seconds per hour divided by the mean
# abandonment time in seconds, written with two decimals
file_name_hourly_theta = os.path.join(main_test_folder_path, f"theta_hourly_{class_no}dim.csv")
np.savetxt(file_name_hourly_theta, SECONDS_IN_HOUR/abandonment_times_17dim, delimiter = ",", fmt="%.2f")


### 1.4 Cost rate 

In [None]:
# Holding costs are imputed from the relative importance of the classes and
# their call volume; please refer to the paper for how they are determined.

# One entry per class, in the same order as class_names
holding_cost_17dim = np.array([24,24,24,26,30,32,22,22,20,22,20,20,32,22,22,22,20]) #hourly holding cost
abandonment_cost_17dim = holding_cost_17dim/12 #abandonment penalty: one twelfth of the hourly holding cost

file_name_holding_cost = os.path.join(main_test_folder_path, f"hourly_holding_cost_{class_no}dim.csv")
file_name_abandonment_cost = os.path.join(main_test_folder_path,f"abandonment_cost_{class_no}dim.csv")

# Both cost vectors are written with two decimals
np.savetxt(file_name_holding_cost, holding_cost_17dim, delimiter = ",", fmt="%.2f")
np.savetxt(file_name_abandonment_cost, abandonment_cost_17dim, delimiter = ",", fmt="%.2f")

In [None]:
# Total cost rate per hour and class:
#   holding cost + hourly abandonment rate * abandonment penalty
# where the hourly abandonment rate is seconds per hour divided by the mean
# abandonment time in seconds.
theta_hourly = SECONDS_IN_HOUR / abandonment_times_17dim["Total"].to_numpy()
total_costs = holding_cost_17dim + theta_hourly * abandonment_cost_17dim

In [None]:
# Save the total hourly cost rates (one value per class, two decimals)
file_name_total_cost = os.path.join(main_test_folder_path, f"hourly_total_cost_{class_no}dim.csv")
np.savetxt(file_name_total_cost, total_costs, delimiter = ",", fmt="%.2f")

### 1.5 Arrival Percentages

In [None]:
def daily_rate(file_path,service,node):  
    """
    Counts the calls of one service (and node) in a single data file.

    Rows are removed when they are outliers (queue_time >= 900 or
    service_time >= 1800) or belong to a call that has an abnormal outcome;
    only first customer sub-calls are kept, one row per call.

    Parameters:
    - file_path: str - Path to the data file.
    - service: int - Service type to filter.
    - node: int - Node number to filter. Only used when service is 1.

    Returns:
    - int: Number of calls after filtering.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    try:
        calls = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading file {file_path}: {e}")

    # Outliers, with thresholds taken from the distribution plots of queueing
    # times and service times.
    calls = calls[(calls["queue_time"] < 900) & (calls["service_time"] < 1800)]

    # A call is discarded entirely when any of its remaining records has an
    # abnormal outcome.
    abnormal_outcomes = [4,13,14,23,30,40,50]
    tainted_ids = calls.loc[calls["outcome"].isin(abnormal_outcomes), "call_id"].unique()
    calls = calls[~calls["call_id"].isin(tainted_ids)]

    # First customer sub-calls only, reduced to one row per call (the record
    # with the largest segment_start).
    calls = calls[calls["cust_subcall"] == 1]
    calls = calls.sort_values(["segment_start"]).drop_duplicates("call_id",keep = "last")

    # Requested service; the Retail class (service 1) is also restricted to one node.
    selected = calls["service"] == service
    if service == 1:
        selected = selected & calls["node"].isin([node])

    return len(calls[selected])

In [None]:
def daily_rate_means(file_list, service, node):
    """
    Averages the daily number of calls of a service and node over several files.

    Parameters:
    - file_list: list of str - List of file paths, one file per day.
    - service: int - Service type to filter.
    - node: int - Node number to filter.

    Returns:
    - float: Mean number of calls per file, or 0 when file_list is empty.
    """
    call_total = 0
    for file_path in file_list:
        call_total += daily_rate(file_path, service, node)

    # An empty list has no mean; report 0 instead of dividing by zero.
    if not file_list:
        return 0

    return call_total / len(file_list)

# DataFrame to store the daily rates.
# NOTE: this rebinds class_indices to the 17-entry list in which the retail
# class (1) appears once per node.
class_indices = [1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17]
# 'Total' is created as a float column: the mean daily rates are fractional,
# and writing floats into an integer column is an incompatible-dtype
# assignment in pandas.
d_rates = pd.DataFrame({'index': class_indices, 'Total': 0.0})

# Calculate the mean daily rates for each class
for i, index in enumerate(d_rates['index']):
    node = i + 1 if i < 3 else 0  # First three are for specific nodes
    d_rates.loc[i, 'Total'] = daily_rate_means(file_list_cust_subcalls, index, node)



In [None]:
# Share of each class in the total mean daily call volume, saved in percent
# with two decimals.
observations = np.array(d_rates["Total"])
percentages_17dim = observations/np.sum(observations)
file_name_percentage = os.path.join(main_test_folder_path, f"percentages_{class_no}dim.csv")
np.savetxt(file_name_percentage, percentages_17dim*100, delimiter=",", fmt="%.2f")

### 1.6 Cumulative distribution function of arrivals

In [None]:
# The CDF of arrivals over the classes is used in the discrete event simulation
# to determine which class an arrival belongs to.
# One row per time interval of the focus window (length_time rows) and one
# column per class (class_no columns).
results = np.zeros((length_time, class_no))

# Column i holds the mean arrivals of the i-th entry of results_by_class
# (dictionary insertion order), restricted to the focus window
for i, key in enumerate(results_by_class.keys()):
    results[:, i] = results_by_class[key][time_focus_start:time_focus_end]

# Cumulative sum across the classes (axis=1) within each time interval
cumsum = np.cumsum(results, axis=1)

# The last cumulative column is the total over all classes; keep it as a
# column vector so that it broadcasts in the division below
total_arrivals_reshaped  = np.array(cumsum[:,class_no-1]).reshape(-1, 1)

# Check if total arrivals are zero to prevent division by zero
if np.any(total_arrivals_reshaped == 0):
    raise ValueError("Total arrivals contain zero values, cannot calculate CDF.")

# Normalise every row by its total, so that the last column equals 1
cdf = cumsum / total_arrivals_reshaped

# Save the CDF to a CSV file (three decimals)
file_name_cdf = os.path.join(main_test_folder_path, f"cdf_{class_no}dim.csv")
np.savetxt(file_name_cdf, cdf, delimiter=",", fmt="%.3f")

### 1.7 Agents

In [None]:
# Agent event files in sorted order; they are later paired by position with the
# sorted agent record files.
file_list_agent_events = np.sort(get_file_list(data_folder_path,"*_agent_events.csv"))

In [None]:
def reader(file_path):
    """
    Returns the distinct record IDs of the second customer sub-calls in a file.

    Parameters:
    - file_path: str - Path to the data file.

    Returns:
    - ndarray: Unique 'record_id' values of the rows whose 'cust_subcall' is 2.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    try:
        records = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading file {file_path}: {e}")

    # Rows that belong to a second sub-call.
    is_second_subcall = records["cust_subcall"] == 2
    return records.loc[is_second_subcall, "record_id"].unique()

In [None]:
# For every agent record file (in sorted order), collect the record IDs of the
# second sub-calls. data_sets[k] belongs to the k-th sorted file and is looked
# up by position in filt_agents.
file_list_agent_records_sorted = np.sort(file_list_agent_records)
data_sets = [reader(file) for file in file_list_agent_records_sorted]

In [None]:
def seconds(array: Union[List[timedelta], pd.Series]) -> np.ndarray:
    """
    Extracts the seconds component of every timedelta in an array-like.

    Only the seconds component is taken (0 to 86399); whole days and
    microseconds of a timedelta are not part of the result.

    Parameters:
    - array (Union[List[timedelta], pd.Series]): An array-like structure of timedelta objects.

    Returns:
    - np.ndarray: The seconds component of each element, in input order.

    Raises:
    - ValueError: If any element is not a timedelta object.
    """
    for item in array:
        if not isinstance(item, timedelta):
            raise ValueError("All elements of the array must be timedelta objects")

    # Plain sequences are converted element by element ...
    if not isinstance(array, pd.Series):
        return np.array([item.seconds for item in array])

    # ... while a pandas Series offers the vectorised accessor.
    return array.dt.seconds.to_numpy()

In [None]:
def filt_agents(file_path,precision,num):
    """
    Calculates the average number of available agents per time interval of one day.

    A per-second counter over the day (86400 entries) is increased for every
    second between an agent's sign-in and sign-out, and decreased for every
    second of a break and of every event that belongs to a second customer
    sub-call. The counter is then averaged over blocks of `precision` minutes.

    Parameters:
    - file_path (str): Path to the CSV file containing the agent events.
    - precision (int): Interval width in minutes used for averaging.
    - num (int): Position in the module-level list `data_sets` of the record
      IDs of second sub-calls that belong to this file.

    Returns:
    - np.ndarray: Average agent availability, one value per interval of the day.

    Raises:
    - FileNotFoundError: If the file cannot be read (any read error is
      re-raised as this type).
    """
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error reading the file: {e}")

    
    # Sign-in times, ordered by agent and time. seconds() keeps only the
    # seconds component of each timestamp, i.e. the second within the day.
    data_sign_in = data[((data["event_id"] == 20) | (data["event_id"] == 21))] #events 20 and 21 are sign-in times
    data_sign_in = data_sign_in.sort_values(["agent","event_start"])
    data_sign_in = data_sign_in.reset_index()
    sign_in = pd.to_timedelta(data_sign_in["event_start"], unit = 'seconds')
    sign_in_times = seconds(sign_in) #converting sign_in times to seconds
    
    # Sign-out times, ordered in the same way
    data_sign_out = data[((data["event_id"] == 30) | (data["event_id"] == 31))] #events 30 and 31 are sign-out times
    data_sign_out = data_sign_out.sort_values(["agent","event_start"])
    data_sign_out = data_sign_out.reset_index()
    sign_out = pd.to_timedelta(data_sign_out["event_start"], unit = 'seconds')
    sign_out_times = seconds(sign_out) #converting sign_out times to seconds
    
    # Number of signed-in agents for every second of the day.
    # NOTE(review): the i-th sign-in is paired with the i-th sign-out, which
    # assumes both lists line up one-to-one after sorting — confirm.
    available = np.zeros(86400)
    for i in range(len(data_sign_in)):
        # A sign-out that falls before its sign-in is replaced by the last
        # second of the day.
        if sign_in_times[i] > sign_out_times[i]:
            sign_out_times[i] = 86399
        for j in range(int(sign_in_times[i]),int(sign_out_times[i])+1):
            available[int(j)] += 1
    # Breaks: an agent on a break is not available
    data_break = data[((data["event_id"] == 60) | (data["event_id"] == 61) | (data["event_id"] == 62))] #events 60, 61,62 are break times
    data_break = data_break.sort_values(["agent","event_start"])
    data_break = data_break.reset_index()
    
    break_start = pd.to_timedelta(data_break["event_start"], unit = 'seconds')
    break_end = pd.to_timedelta(data_break["event_end"], unit = 'seconds')
    
    break_start = seconds(break_start)
    break_end = seconds(break_end)
    
    for i in range(len(break_start)):
        for j in range(int(break_start[i]),int(break_end[i]) + 1):
            available[int(j)] -= 1
            
    # Events that belong to second customer sub-calls (record IDs taken from
    # data_sets[num]) also reduce availability
    data_ss = data[(data["record_id"].isin(data_sets[num]))]
    data_ss = data_ss.sort_values(["agent", "event_start"])
    data_ss = data_ss.reset_index()
    
    ss_start = pd.to_timedelta(data_ss["event_start"], unit = "seconds")
    ss_end = pd.to_timedelta(data_ss["event_end"], unit = "seconds")
    
    ss_start = seconds(ss_start)
    ss_end = seconds(ss_end)
    
    for k in range(len(ss_start)):
        for l in range(int(ss_start[k]), int(ss_end[k]) + 1):
            available[int(l)] -= 1
    
    # One row per interval of precision*60 seconds
    available = np.reshape(available, (-1,precision*60))
    
    mean = np.mean(available, axis = 1) #mean availability within each interval
    
    return mean

In [None]:
def calculate_average_agent_availability(file_list, precision):
    """
    Calculates the average agent availability across multiple data files.

    Parameters:
    - file_list (list of str): List of file paths to process. The position of each
      file in the list is forwarded to filt_agents as its third argument.
    - precision (int): Time precision parameter to pass to the filt_agents function.

    Returns:
    - np.ndarray: Element-wise average of the availability profiles returned by
      filt_agents for the given files.

    Raises:
    - ValueError: If file_list is empty (there is nothing to average).
    """

    # Fail early with a clear message instead of an opaque ZeroDivisionError below.
    if len(file_list) == 0:
        raise ValueError("The file list is empty.")

    total_mean = 0
    for i, file in enumerate(file_list):
        # i is the position of the file; filt_agents receives it as its third argument
        total_mean = total_mean + filt_agents(file, precision, i)

    return total_mean / len(file_list)

In [None]:
# average, over all agent-event files, of the availability profile computed per file
agents = np.array(calculate_average_agent_availability(file_list_agent_events,precision))
# round to whole agents first, then keep only the slots in [time_focus_start, time_focus_end)
agents = agents.round().astype(int)[time_focus_start:time_focus_end] #focus on the period between 7 am till midnight

In [None]:
# save the staffing profile of the main test problem (np.savetxt writes one value per line for a 1-D array)
file_name_agents = os.path.join(main_test_folder_path, f"main_test_agents.csv")
np.savetxt(file_name_agents, agents, delimiter=",")

### 1.8 Limiting arrival process and zeta calculation 

In [None]:
def compute_limiting_hourly_rates(class_no, lambda_prelimit, time_focus_start, time_focus_end, precision, mu_hourly, percentages, agents, length_time, SCALING_FACTOR):
    """
    Builds the limiting hourly arrival rates and the zeta terms for every class.

    For class i the limiting rate is
        percentages[i] * agents / (SCALING_FACTOR * sum(percentages / mu_hourly))
    and zeta is
        (lambda_prelimit[i] - SCALING_FACTOR * limiting_rate[i]) / sqrt(SCALING_FACTOR).

    Parameters:
    - class_no (int): Number of classes (rows of the outputs).
    - lambda_prelimit: 2-D array of prelimit arrival rates, indexed [class, time].
    - time_focus_start (int): Accepted for interface compatibility; not used here.
    - time_focus_end (int): Accepted for interface compatibility; not used here.
    - precision (int): Accepted for interface compatibility; not used here.
    - mu_hourly: Hourly service rate(s); divided into percentages element-wise.
    - percentages: Per-class percentages, indexed by class.
    - agents: Number of agents; a scalar or a per-interval array that is broadcast
      over the length_time columns.
    - length_time (int): Number of time intervals (columns of the outputs).
    - SCALING_FACTOR (int): Factor to scale the system.

    Returns:
    - tuple: (limiting lambda, zeta), both arrays of shape (class_no, length_time).
    """

    # Shared normalisation constant and square-root scaling, computed once
    offered_load_scale = SCALING_FACTOR * np.sum(percentages / mu_hourly)
    root_scale = np.sqrt(SCALING_FACTOR)

    output_shape = (class_no, length_time)
    lambd_limit_hourly = np.zeros(output_shape)
    zeta_hourly = np.zeros(output_shape)

    for k in range(class_no):
        lambd_limit_hourly[k, :] = percentages[k] * agents / offered_load_scale
        # deviation of the prelimit rate from the scaled limiting rate
        deviation = lambda_prelimit[k, :] - SCALING_FACTOR * lambd_limit_hourly[k, :]
        zeta_hourly[k, :] = deviation / root_scale

    return lambd_limit_hourly, zeta_hourly

In [None]:
lambd_prelimit_hourly = np.zeros((class_no, length_time)) #hourly prelimit arrival rate
# hourly service rate: SECONDS_IN_HOUR divided by the "Total" service time
# (presumably a mean service time in seconds - confirm where service_times_17dim is built)
mu_hourly = SECONDS_IN_HOUR / service_times_17dim["Total"]

# rows follow the iteration order of results_by_class; each per-`precision`-minute value
# is rescaled to an hourly rate with the factor MINUTES_IN_HOUR/precision
for i, key in enumerate(results_by_class.keys()):
    lambd_prelimit_hourly[i, :] = results_by_class[key][time_focus_start:time_focus_end]*MINUTES_IN_HOUR/precision

# limiting arrival rates and zeta terms of the main test problem
lambd_limit_hourly, zeta_hourly = compute_limiting_hourly_rates(class_no, lambd_prelimit_hourly, time_focus_start, time_focus_end, precision, mu_hourly, percentages_17dim, agents, length_time, SCALING_FACTOR)

In [None]:
# save the limiting arrival rates and zeta of the main test problem (one row per class, one column per interval)
file_name_lambda_limiting = os.path.join(main_test_folder_path, f"main_test_hourly_limiting_lambda.csv")
file_name_zeta = os.path.join(main_test_folder_path, f"main_test_hourly_zeta.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly, delimiter = ",")

## 2. Low Dimensional Test Problems

### 2.1 Pre-limit arrival process

In [None]:
# (class_no, service) -> (service codes forming the class,
#                         nodes of service code 1 that are EXCLUDED from the class)
_LOW_DIM_CLASS_FILTERS = {
    (2, 1): ([1, 3, 5, 6, 8, 13], [1, 3, 5, 6, 7]),                  # service code 1: keep only node 2
    (2, 2): ([1, 2, 4, 7, 9, 10, 11, 12, 15, 17], [2, 5, 6, 7]),     # service code 1: keep only nodes 1,3
    (3, 1): ([1, 3, 8], [1, 3, 5, 6, 7]),                            # service code 1: keep only node 2
    (3, 2): ([1, 5, 6, 13], [2, 3, 5, 6, 7]),                        # service code 1: keep only node 1
    (3, 3): ([1, 2, 4, 7, 9, 10, 11, 12, 15, 17], [1, 2, 5, 6, 7]),  # service code 1: keep only node 3
}


def _filter_low_dim_class(data, service, class_no):
    """Keeps only the calls that belong to class `service` of the `class_no`-class problem."""
    rule = _LOW_DIM_CLASS_FILTERS.get((class_no, service))
    if rule is None:
        # no rule for this combination: the data is returned unfiltered
        return data

    service_codes, excluded_nodes = rule
    data = data[data["service"].isin(service_codes)]
    # service code 1 is split between classes by node; drop the nodes owned by other classes
    excluded = (data["service"] == 1) & (data["node"].isin(excluded_nodes))
    return data[~excluded]


def filt_arrivals_low_dim(file_path,precision,service_lookup,service,class_no):
    """
    Processes and filters call arrival data and counts arrivals per time interval.

    Parameters:
    - file_path (str): Path to the CSV file containing the data.
    - precision (int): Time precision (in minutes) for resampling.
    - service_lookup (bool): Whether to restrict the data to one class.
    - service (int): The class (1-based) to keep when service_lookup is set.
    - class_no (int): Number of classes of the low dimensional problem (2 or 3);
      any other (class_no, service) combination leaves the data unfiltered.

    Returns:
    - pd.DataFrame: Two columns: "index", the interval label "H:M" (no zero padding),
      and "call_id", the number of calls that arrived in that interval.

    Raises:
    - FileNotFoundError: If the CSV file cannot be read (the original error is chained).
    """
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        # same exception type as before, but the real cause is kept in the traceback
        raise FileNotFoundError(f"Error reading the file: {e}") from e

    data = data[data["queue_time"] < 900] #removing outliers based on the distribution of queueing times
    data = data[data["service_time"] < 1800] #removing outliers based on the distribution of service times

    #removing every record of a call that has at least one abnormal outcome
    abnormal_outcomes = [4,13,14,23,30,40,50]
    abnormal_call_ids = data.loc[data["outcome"].isin(abnormal_outcomes), "call_id"].unique()
    data = data[~data["call_id"].isin(abnormal_call_ids)]

    #focusing only on 1st customer subcalls
    data = data[data["cust_subcall"] == 1]

    #removing the multiple records of the same call (the latest segment is kept)
    data = data.sort_values(["segment_start"])
    data = data.drop_duplicates("call_id",keep = "last")

    if service_lookup:
        data = _filter_low_dim_class(data, service, class_no)

    data = data.reset_index(drop=True)

    #defining the start time of the events
    # NOTE(review): segment_start is shifted by 5 hours and converted with
    # datetime.fromtimestamp, which uses the local time zone of the machine - confirm
    # that this matches the time zone the data was recorded in.
    local_starts = [
        datetime.fromtimestamp(stamp + 5*60*60).replace(microsecond=0)
        for stamp in data["segment_start"]
    ]
    data["Datetime"] = pd.to_datetime(local_starts)
    data["day"] = data["Datetime"].dt.date

    # when the records span more than one calendar day, the second distinct day
    # (in order of appearance) is discarded
    distinct_days = data["day"].unique()
    if len(distinct_days) > 1:
        data = data[data["day"] != distinct_days[1]]

    data = data.set_index('Datetime')

    #how many arrivals happened within the specific time interval
    # "min" is the supported minute alias; the former "T" alias is deprecated in pandas
    data = data.resample(f"{precision}min").count()

    #setting up a new column to show the time interval
    data["index"] = [f"{stamp.hour}:{stamp.minute}" for stamp in data.index]
    data = data.reset_index()

    return data[["index","call_id"]]

In [None]:
def merge_daily_data_low_dim(file_list, precision, service_lookup, service, class_no):
    """
    Combines the per-day arrival counts of several files into one table.

    Each file is processed with filt_arrivals_low_dim and left-joined, on the
    interval label, onto the grid produced by create_time_intervals. Intervals
    without arrivals on a given day are filled with 0.

    Parameters:
    - file_list: list of str - One file path per day.
    - precision: int - Time precision for resampling.
    - service_lookup: bool - Whether to filter by service.
    - service: int - Service type to filter.
    - class_no (int): Number of classes.

    Returns:
    - DataFrame: Indexed by interval label, one count column per day.

    Raises:
    - ValueError: If file_list is empty.
    """

    if not file_list:
        raise ValueError("The file list is empty.")

    def load_day(path):
        # arrival counts per interval for a single day
        return filt_arrivals_low_dim(path, precision, service_lookup, service, class_no)

    # empty frame whose index is the full grid of interval labels
    grid = pd.DataFrame(index=pd.Index(create_time_intervals()))

    merged = grid.merge(load_day(file_list[0]), how = "left", left_on = grid.index, right_on = "index", suffixes=('', '_0'))
    merged = merged.fillna(0)

    # remaining days are joined on the "index" column; the day number disambiguates the count columns
    for day_no, path in enumerate(file_list[1:], start=1):
        merged = merged.merge(load_day(path), how = "left", left_on = "index", right_on = "index", suffixes=('', f'_{day_no}'))

    # intervals missing on a day count as zero arrivals
    merged = merged.fillna(0)

    return merged.set_index("index")
        

def means_totals_low_dim(file_list, precision, service_lookup, service, class_no):
    """
    Averages the arrival counts of every time interval over all given days.

    Parameters:
    - file_list: list of str - One file path per day.
    - precision: int - Time precision for resampling.
    - service_lookup: bool - Whether to filter by service.
    - service: int - Service type to filter.
    - class_no (int): Number of classes.

    Returns:
    - Series: Row-wise mean of the table built by merge_daily_data_low_dim,
      i.e. one average per time interval.
    """
    # one row per interval, one column per day -> average across the columns
    return merge_daily_data_low_dim(file_list, precision, service_lookup, service, class_no).mean(axis=1)


In [None]:
def save_to_csv_low_dim(folder_path, file_list, class_indices, class_no, precision, time_focus_start, time_focus_end):
    """
    Computes the average arrivals of every class and writes them to CSV files.

    Two files are written per class: the full day ("all") and the slice
    [time_focus_start:time_focus_end] ("partial").

    Parameters:
    - folder_path: str - Folder path to save the arrivals
    - file_list: list of str - List of file paths.
    - class_indices (list): One entry per class; only its length is used here.
    - class_no (int): Number of classes.
    - precision (int): Precision used in calculations.
    - time_focus_start (int): Start index for focused time.
    - time_focus_end (int): End index for focused time.

    Returns:
    - dict: Average arrivals per class, keyed "results_class{k}_{class_no}dim".
    """

    results_by_class = {}

    # classes are numbered from 1, which is also the `service` value used for filtering
    for class_number in range(1, len(class_indices) + 1):
        result_key = f"results_class{class_number}_{class_no}dim"
        class_means = means_totals_low_dim(file_list, precision, service_lookup=True, service=class_number, class_no=class_no)
        results_by_class[result_key] = class_means

        file_name_all = os.path.join(folder_path, f"arrivals_class{class_number}_all_{precision}min_{class_no}dim.csv")
        file_name_partial = os.path.join(folder_path, f"arrivals_class{class_number}_partial_{precision}min_{class_no}dim.csv")

        np.savetxt(file_name_all, class_means, delimiter=",")
        np.savetxt(file_name_partial, class_means[time_focus_start:time_focus_end], delimiter=",")

    return results_by_class

In [None]:
# Positions in the 17-class problem that are split by node:
# 0 is Retail (node: 1)
# 1 is Retail (node: 2)
# 2 is Retail (node: 3)
# Each inner list of class_indices_*dim holds the positions of the 17 classes that are
# merged into one low dimensional class (low_dim_parameters indexes the 17-dim inputs
# with them). save_to_csv_low_dim itself only uses the number of inner lists.

# 2 dimensional problem
class_no = 2
class_indices_2dim = [[1, 4, 6, 7, 9, 14], [0, 2, 3, 5, 8, 10, 11, 12, 13, 15, 16]]

# output folder of the 2 dimensional problem and its average pre-limit arrivals per class
folder_path_2dim = os.path.join(data_folder_path,f"problem_{class_no}dim")
os.makedirs(folder_path_2dim, exist_ok=True)
prelimit_arrivals_2dim = save_to_csv_low_dim(folder_path_2dim, file_list_cust_subcalls, class_indices_2dim, 2, precision, time_focus_start, time_focus_end)

# 3 dimensional problem
class_no = 3
class_indices_3dim = [[1, 4, 9], [0, 6, 7, 14], [2, 3, 5, 8, 10, 11, 12, 13, 15, 16]]

# output folder of the 3 dimensional problem and its average pre-limit arrivals per class
folder_path_3dim = os.path.join(data_folder_path,f"problem_{class_no}dim")
os.makedirs(folder_path_3dim, exist_ok=True)
prelimit_arrivals_3dim = save_to_csv_low_dim(folder_path_3dim, file_list_cust_subcalls, class_indices_3dim, 3, precision, time_focus_start, time_focus_end)


### 2.2 System parameters (Service, Abandonment and Cost rates) 

In [None]:
def low_dim_parameters(folder_path, class_indices,class_no):
    """
    Aggregates the 17-class system parameters into a low dimensional problem.

    Every low dimensional class is a group of 17-dim classes. Its percentage is the
    sum of the group's percentages; its service rate, abandonment rate and costs are
    averages over the group, weighted by each member's share of the group percentage.
    The inputs are read from the module-level names percentages_17dim,
    service_times_17dim, abandonment_times_17dim, holding_cost_17dim and
    abandonment_cost_17dim, and all results are also written to CSV files.

    Parameters:
    - folder_path: str - Folder path to save the system parameters
    - class_indices (list): For each low dimensional class, the positions of its
      members in the 17-dim inputs.
    - class_no (int): Number of classes.

    Returns:
    - tuple: Returns mu_low, theta_low, cost_low, and percentages_low arrays.

    Raises:
    - ValueError: If class_indices is not a list or class_no is not an int.
    """

    inputs_valid = isinstance(class_indices, list) and isinstance(class_no, int)
    if not inputs_valid:
        raise ValueError("Invalid input types for class_indices and class_no")

    percentages_low, mu_low, theta_low, holding_cost_low, abandonment_cost_low, cost_low = (
        np.zeros(class_no) for _ in range(6)
    )

    # Group percentages
    for k in range(class_no):
        percentages_low[k] = np.sum(percentages_17dim[class_indices[k]])

    # Weighted averages of the remaining system parameters (per hour)
    for k, members in enumerate(class_indices):
        weights = percentages_17dim[members] / percentages_low[k]
        service_rates = SECONDS_IN_HOUR / service_times_17dim["Total"][members]
        abandonment_rates = SECONDS_IN_HOUR / abandonment_times_17dim["Total"][members]

        mu_low[k] = np.sum(service_rates * weights)
        theta_low[k] = np.sum(abandonment_rates * weights)
        holding_cost_low[k] = np.sum(holding_cost_17dim[members] * weights)
        abandonment_cost_low[k] = np.sum(abandonment_cost_17dim[members] * weights)
        # total cost rate: holding cost plus abandonment rate times abandonment cost
        cost_low[k] = holding_cost_low[k] + theta_low[k] * abandonment_cost_low[k]

    # Save outputs to CSV (two decimals each)
    outputs = [
        (f"mu_hourly_{class_no}dim.csv", mu_low),
        (f"theta_hourly_{class_no}dim.csv", theta_low),
        (f"abandonment_cost_{class_no}dim.csv", abandonment_cost_low),
        (f"hourly_holding_cost_{class_no}dim.csv", holding_cost_low),
        (f"hourly_total_cost_{class_no}dim.csv", cost_low),
        (f"percentages_{class_no}dim.csv", percentages_low),
    ]
    for file_name, values in outputs:
        np.savetxt(os.path.join(folder_path, file_name), values, delimiter = ",", fmt="%.2f")

    return mu_low, theta_low, cost_low, percentages_low

### 2.3 Limiting arrival rates and zeta calculation

In [None]:
#2 dim test problem parameters
class_no = 2

lambd_prelimit_hourly_2dim = np.zeros((class_no, length_time)) #hourly prelimit arrival rate
# aggregated service rates, abandonment rates, cost rates and percentages (also written to folder_path_2dim)
mu_hourly_2dim, theta_hourly_2dim, cost_hourly_2dim, percentages_2dim = low_dim_parameters(folder_path_2dim,class_indices_2dim,class_no)

# rows follow the key order of prelimit_arrivals_2dim; values per `precision` minutes
# are rescaled to hourly rates with the factor MINUTES_IN_HOUR/precision
for i, key in enumerate(prelimit_arrivals_2dim.keys()):
    lambd_prelimit_hourly_2dim[i, :] = prelimit_arrivals_2dim[key][time_focus_start:time_focus_end]*MINUTES_IN_HOUR/precision

# the staffing profile `agents` of the main test problem is reused here
lambd_limit_hourly_2dim, zeta_hourly_2dim = compute_limiting_hourly_rates(class_no, lambd_prelimit_hourly_2dim, time_focus_start, time_focus_end, precision, mu_hourly_2dim, percentages_2dim, agents, length_time, SCALING_FACTOR)


# save the limiting arrival rates and zeta of the 2 dimensional problem
file_name_lambda_limiting = os.path.join(folder_path_2dim, f"hourly_limiting_lambda_{class_no}dim.csv")
file_name_zeta = os.path.join(folder_path_2dim, f"hourly_zeta_{class_no}dim.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly_2dim, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly_2dim, delimiter = ",")

In [None]:
#3 dim test problem parameters
class_no = 3

lambd_prelimit_hourly_3dim = np.zeros((class_no, length_time)) #hourly prelimit arrival rate
# aggregated service rates, abandonment rates, cost rates and percentages (also written to folder_path_3dim)
mu_hourly_3dim, theta_hourly_3dim, cost_hourly_3dim, percentages_3dim = low_dim_parameters(folder_path_3dim, class_indices_3dim,class_no)

# rows follow the key order of prelimit_arrivals_3dim; values per `precision` minutes
# are rescaled to hourly rates with the factor MINUTES_IN_HOUR/precision
for i, key in enumerate(prelimit_arrivals_3dim.keys()):
    lambd_prelimit_hourly_3dim[i, :] = prelimit_arrivals_3dim[key][time_focus_start:time_focus_end]*MINUTES_IN_HOUR/precision

# the staffing profile `agents` of the main test problem is reused here
lambd_limit_hourly_3dim, zeta_hourly_3dim = compute_limiting_hourly_rates(class_no, lambd_prelimit_hourly_3dim, time_focus_start, time_focus_end, precision, mu_hourly_3dim, percentages_3dim, agents, length_time, SCALING_FACTOR)

# save the limiting arrival rates and zeta of the 3 dimensional problem
file_name_lambda_limiting = os.path.join(folder_path_3dim, f"hourly_limiting_lambda_{class_no}dim.csv")
file_name_zeta = os.path.join(folder_path_3dim, f"hourly_zeta_{class_no}dim.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly_3dim, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly_3dim, delimiter = ",")

### 2.4 Cumulative distribution function of arrivals

In [None]:
# we need to calculate the cdf of arrivals to use in our discrete event simulation to determine which class arrives
class_no = 2

# Calculate cumulative sum of arrivals for each time interval
# (after the transpose: one row per interval, running sum across the classes)
cumsum_2dim = np.cumsum(np.transpose(lambd_prelimit_hourly_2dim), axis=1)
# the last column of the running sum is the total arrival rate of the interval
total_arrivals_reshaped  = np.array(cumsum_2dim[:,class_no-1]).reshape(-1, 1)

# Calculate CDF by normalizing cumulative sums with total arrivals
# NOTE(review): an interval whose total arrival rate is 0 divides by zero here - confirm this cannot occur
cdf_2dim = cumsum_2dim / total_arrivals_reshaped

# Save the CDF to a CSV file
file_name_cdf = os.path.join(folder_path_2dim, f"cdf_{class_no}dim.csv")
np.savetxt(file_name_cdf, cdf_2dim, delimiter=",", fmt="%.3f")

In [None]:
# cdf of arrivals over the classes of the 3 dimensional problem
class_no = 3

# Calculate cumulative sum of arrivals for each time interval
# (after the transpose: one row per interval, running sum across the classes)
cumsum_3dim = np.cumsum(np.transpose(lambd_prelimit_hourly_3dim), axis=1)
# the last column of the running sum is the total arrival rate of the interval
total_arrivals_reshaped  = np.array(cumsum_3dim[:,class_no-1]).reshape(-1, 1)

# Calculate CDF by normalizing cumulative sums with total arrivals
# NOTE(review): an interval whose total arrival rate is 0 divides by zero here - confirm this cannot occur
cdf_3dim = cumsum_3dim / total_arrivals_reshaped

# Save the CDF to a CSV file
file_name_cdf = os.path.join(folder_path_3dim, f"cdf_{class_no}dim.csv")
np.savetxt(file_name_cdf, cdf_3dim, delimiter=",", fmt="%.3f")

## 3. High Dimensional Test Problems 

In [None]:
# number of classes of the main test problem, from which the high dimensional classes are resampled
main_test_class_no = 17

### 3.1 Pre-limit arrival process and system parameters

In [None]:
def calculate_high_dimensional_parameters(folder_path, high_dim_class_no, main_test_class_no, arrivals_main, mu_hourly_main, theta_hourly_main, agents, old_class_no, old_scaling_factor, precision, seed_value):
    """
    Calculate and save the parameters of a high-dimensional test problem.

    The high-dimensional classes are obtained by resampling, with replacement, the
    classes of the main test problem (once for the arrival rates and once more for
    the service/abandonment rates). Staffing is rescaled so that the utilization
    gap of the main test problem shrinks with sqrt(high_dim_class_no / main_test_class_no).
    All results are also written to CSV files in folder_path.

    Side effect: reseeds the global NumPy random generator with seed_value.

    Parameters:
    - folder_path (str): Folder path to save parameters.
    - high_dim_class_no (int): Number of high-dimensional classes.
    - main_test_class_no (int): Number of main test classes.
    - arrivals_main (np.ndarray): Hourly arrival rates of the main test problem,
      indexed [class, time interval].
    - mu_hourly_main: Main hourly service rates, indexable by class.
    - theta_hourly_main: Main hourly abandonment rates, indexable by class.
    - agents (np.ndarray): Number of agents per time interval in the main test problem.
    - old_class_no (int): Number of old classes.
    - old_scaling_factor (float): Old scaling factor.
    - precision (int): Length of a time interval in minutes (used to convert the
      saved arrival rates back to per-interval values).
    - seed_value (int) : Random seed

    Returns:
    - tuple: (arrivals_high_dim, high_dim_agents, common_mu, common_theta,
      common_abandonment_rate, high_dim_scaling_factor, percentage_high_dim,
      cost_pathwise).
    """
    np.random.seed(seed_value)

    #Step 1: arrival rates (each high-dim class copies the profile of a random main class)
    sampled_arrival_classes = np.random.choice(np.arange(0, main_test_class_no), size = high_dim_class_no, replace = True)
    # the number of intervals comes from arrivals_main itself, not from a module-level global
    arrivals_high_dim = np.asarray(arrivals_main, dtype = float)[sampled_arrival_classes, :]

    total_arrivals_high_dim = np.sum(arrivals_high_dim, axis = 0)

    #Step 2: arrival percentages
    arrivals_per_class = np.sum(arrivals_high_dim, axis = 1)
    percentage_high_dim = arrivals_per_class / np.sum(arrivals_per_class)

    #Step 3: service and abandonment rates (drawn independently of the arrival profiles)
    sampled_parameter_classes = np.random.choice(np.arange(0, main_test_class_no), size = high_dim_class_no, replace = True)
    mu_high_dimensional = np.array([mu_hourly_main[j] for j in sampled_parameter_classes], dtype = float)
    theta_high_dimensional = np.array([theta_hourly_main[j] for j in sampled_parameter_classes], dtype = float)

    #Step 4: main test utilization
    main_test_workload = np.sum(1/mu_hourly_main * np.sum(arrivals_main, axis = 1))
    main_test_utilization = main_test_workload / np.sum(agents)

    #Step 5: new system scaling parameter
    high_dim_scaling_factor = np.round(high_dim_class_no/old_class_no*old_scaling_factor, 2)
    high_dim_utilization = 1 - ((1 - main_test_utilization)/np.sqrt(high_dim_class_no/main_test_class_no))

    #Step 6: new staffing level (same shape over time as the main problem, rescaled to the new workload)
    first_term = agents/np.sum(agents)
    second_term = 1/high_dim_utilization
    third_term = np.sum(1/mu_high_dimensional * arrivals_per_class)
    high_dim_agents = np.ceil(first_term * second_term * third_term).astype(int) # Convert to integer

    #Step 7: system parameters
    # Holding costs are drawn without replacement from a grid on [14, 34). The grid step
    # starts at 0.5 and is halved until the grid has at least high_dim_class_no points,
    # which gives 0.5 / 0.25 / 0.125 for 30 / 50 / 100 classes and works for any other size.
    step = 0.5
    while (34 - 14) / step < high_dim_class_no:
        step /= 2
    numbers = np.arange(14, 34, step)

    holding_cost_rate_high_dim = np.random.choice(numbers, size = high_dim_class_no, replace = False)
    abandonment_cost_rate_high_dim = holding_cost_rate_high_dim/12

    common_mu = np.round(np.sum(percentage_high_dim*mu_high_dimensional),2)
    common_theta = np.round(np.sum(percentage_high_dim*theta_high_dimensional),2)
    # deliberately not rounded: this is the value the computation has always used
    common_abandonment_rate = np.sum(percentage_high_dim * abandonment_cost_rate_high_dim)

    # class-specific holding cost plus the common abandonment penalty
    cost_pathwise = holding_cost_rate_high_dim + common_abandonment_rate * common_theta

    file_name_hourly_mu = os.path.join(folder_path, f"mu_hourly_{high_dim_class_no}dim.csv")
    file_name_hourly_theta = os.path.join(folder_path, f"theta_hourly_{high_dim_class_no}dim.csv")
    file_name_holding_cost = os.path.join(folder_path, f"hourly_holding_cost_{high_dim_class_no}dim.csv")
    file_name_abandonment_cost = os.path.join(folder_path,f"abandonment_cost_{high_dim_class_no}dim.csv")
    file_name_total_cost = os.path.join(folder_path, f"hourly_total_cost_{high_dim_class_no}dim.csv")
    file_name_percentage = os.path.join(folder_path, f"percentages_{high_dim_class_no}dim.csv")
    file_name_agents = os.path.join(folder_path, f"agents_{high_dim_class_no}dim.csv")
    file_name_arrivals = os.path.join(folder_path, f"arrivals_partial_{precision}min_{high_dim_class_no}dim.csv")
    file_name_total_arrivals = os.path.join(folder_path, f"total_arrivals_partial_{precision}min_{high_dim_class_no}dim.csv")

    intervals_per_hour = MINUTES_IN_HOUR/precision

    np.savetxt(file_name_total_cost, cost_pathwise, delimiter = ",", fmt="%.2f")
    np.savetxt(file_name_agents, high_dim_agents, delimiter = ",")
    np.savetxt(file_name_holding_cost, holding_cost_rate_high_dim, delimiter = ",", fmt="%.2f")
    np.savetxt(file_name_abandonment_cost, np.array([common_abandonment_rate]), delimiter = ",", fmt="%.2f")
    np.savetxt(file_name_arrivals, arrivals_high_dim / intervals_per_hour, delimiter = ",") ##arrival rate per `precision` minutes
    np.savetxt(file_name_percentage, percentage_high_dim * 100, delimiter = ",", fmt="%.2f")
    np.savetxt(file_name_total_arrivals, total_arrivals_high_dim / intervals_per_hour, delimiter = ",")
    np.savetxt(file_name_hourly_mu, np.array([common_mu]), delimiter = ",", fmt="%.2f")
    np.savetxt(file_name_hourly_theta, np.array([common_theta]), delimiter = ",", fmt="%.2f")

    return arrivals_high_dim, high_dim_agents, common_mu, common_theta, common_abandonment_rate, high_dim_scaling_factor, percentage_high_dim, cost_pathwise

### 3.2 Limiting arrival rates, zeta calculation, and CDF

In [None]:
# Constants
# scaling factor of the main test problem, from which the high dimensional one is derived
OLD_SCALING_FACTOR = 400

# hourly service and abandonment rates of the main test problem
mu_hourly_main = SECONDS_IN_HOUR / service_times_17dim["Total"]
theta_hourly_main = SECONDS_IN_HOUR / abandonment_times_17dim["Total"]

#30 dimensional test problem
high_dim_class_no = 30

folder_path_30dim = os.path.join(data_folder_path,f"problem_{high_dim_class_no}dim")
os.makedirs(folder_path_30dim, exist_ok=True)

# resample the main test problem into 30 classes (parameters are also written to folder_path_30dim)
arrivals_high_dim_30dim, high_dim_agents_30dim, common_mu_30dim, common_theta_30dim, common_abandonment_rate_30dim, high_dim_scaling_factor_30dim, percentage_high_dim_30dim, cost_pathwise_30dim = calculate_high_dimensional_parameters(folder_path_30dim, high_dim_class_no, main_test_class_no, lambd_prelimit_hourly, mu_hourly_main, theta_hourly_main, agents, main_test_class_no, OLD_SCALING_FACTOR, precision, seed_value)
# common_mu_30dim is a single value, so the same service rate is applied to every class
lambd_limit_hourly_30dim, zeta_hourly_30dim = compute_limiting_hourly_rates(high_dim_class_no, arrivals_high_dim_30dim, time_focus_start, time_focus_end, precision, common_mu_30dim, percentage_high_dim_30dim, high_dim_agents_30dim, length_time, high_dim_scaling_factor_30dim)

# Calculate cumulative sum of arrivals for each time interval
# (running sum across the classes, then transposed to one row per interval)
cumsum_30dim = np.cumsum(arrivals_high_dim_30dim, axis = 0)
cumsum_30dim = cumsum_30dim.T
# the last column of the running sum is the total arrival rate of the interval
total_arrivals_reshaped  = np.array(cumsum_30dim[:,high_dim_class_no-1]).reshape(-1, 1)

# Calculate CDF by normalizing cumulative sums with total arrivals
cdf_30dim = cumsum_30dim / total_arrivals_reshaped

# Save the CDF to a CSV file
file_name_cdf = os.path.join(folder_path_30dim, f"cdf_{high_dim_class_no}dim.csv")
np.savetxt(file_name_cdf, cdf_30dim, delimiter=",", fmt="%.3f")

# save the limiting arrival rates and zeta of the 30 dimensional problem
file_name_lambda_limiting = os.path.join(folder_path_30dim, f"hourly_limiting_lambda_{high_dim_class_no}dim.csv")
file_name_zeta = os.path.join(folder_path_30dim, f"hourly_zeta_{high_dim_class_no}dim.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly_30dim, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly_30dim, delimiter = ",")

In [None]:
#50 dimensional test problem
high_dim_class_no = 50

folder_path_50dim = os.path.join(data_folder_path,f"problem_{high_dim_class_no}dim")
os.makedirs(folder_path_50dim, exist_ok=True)

# resample the main test problem into 50 classes (parameters are also written to folder_path_50dim)
arrivals_high_dim_50dim, high_dim_agents_50dim, common_mu_50dim, common_theta_50dim, common_abandonment_rate_50dim, high_dim_scaling_factor_50dim, percentage_high_dim_50dim, cost_pathwise_50dim = calculate_high_dimensional_parameters(folder_path_50dim, high_dim_class_no, main_test_class_no, lambd_prelimit_hourly, mu_hourly_main, theta_hourly_main, agents, main_test_class_no, OLD_SCALING_FACTOR, precision, seed_value)
# common_mu_50dim is a single value, so the same service rate is applied to every class
lambd_limit_hourly_50dim, zeta_hourly_50dim = compute_limiting_hourly_rates(high_dim_class_no, arrivals_high_dim_50dim, time_focus_start, time_focus_end, precision, common_mu_50dim, percentage_high_dim_50dim, high_dim_agents_50dim, length_time, high_dim_scaling_factor_50dim)

# Calculate cumulative sum of arrivals for each time interval
# (running sum across the classes, then transposed to one row per interval)
cumsum_50dim = np.cumsum(arrivals_high_dim_50dim, axis = 0)
cumsum_50dim = cumsum_50dim.T
# the last column of the running sum is the total arrival rate of the interval
total_arrivals_reshaped  = np.array(cumsum_50dim[:,high_dim_class_no-1]).reshape(-1, 1)

# Calculate CDF by normalizing cumulative sums with total arrivals
cdf_50dim = cumsum_50dim / total_arrivals_reshaped

# Save the CDF to a CSV file
file_name_cdf = os.path.join(folder_path_50dim, f"cdf_{high_dim_class_no}dim.csv")
np.savetxt(file_name_cdf, cdf_50dim, delimiter=",", fmt="%.3f")

# save the limiting arrival rates and zeta of the 50 dimensional problem
file_name_lambda_limiting = os.path.join(folder_path_50dim, f"hourly_limiting_lambda_{high_dim_class_no}dim.csv")
file_name_zeta = os.path.join(folder_path_50dim, f"hourly_zeta_{high_dim_class_no}dim.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly_50dim, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly_50dim, delimiter = ",")

In [None]:
#100 dimensional test problem
high_dim_class_no = 100

folder_path_100dim = os.path.join(data_folder_path,f"problem_{high_dim_class_no}dim")
os.makedirs(folder_path_100dim, exist_ok=True)

# resample the main test problem into 100 classes (parameters are also written to folder_path_100dim)
arrivals_high_dim_100dim, high_dim_agents_100dim, common_mu_100dim, common_theta_100dim, common_abandonment_rate_100dim, high_dim_scaling_factor_100dim, percentage_high_dim_100dim, cost_pathwise_100dim = calculate_high_dimensional_parameters(folder_path_100dim, high_dim_class_no, main_test_class_no, lambd_prelimit_hourly, mu_hourly_main, theta_hourly_main, agents, main_test_class_no, OLD_SCALING_FACTOR, precision, seed_value)
# common_mu_100dim is a single value, so the same service rate is applied to every class
lambd_limit_hourly_100dim, zeta_hourly_100dim = compute_limiting_hourly_rates(high_dim_class_no, arrivals_high_dim_100dim, time_focus_start, time_focus_end, precision, common_mu_100dim, percentage_high_dim_100dim, high_dim_agents_100dim, length_time, high_dim_scaling_factor_100dim)

# we need to calculate the cdf of arrivals to use in our discrete event simulation to determine which class arrives

# Calculate cumulative sum of arrivals for each time interval
# (running sum across the classes, then transposed to one row per interval)
cumsum_100dim = np.cumsum(arrivals_high_dim_100dim, axis = 0)
cumsum_100dim = cumsum_100dim.T
# the last column of the running sum is the total arrival rate of the interval
total_arrivals_reshaped  = np.array(cumsum_100dim[:,high_dim_class_no-1]).reshape(-1, 1)

# Calculate CDF by normalizing cumulative sums with total arrivals
cdf_100dim = cumsum_100dim / total_arrivals_reshaped

# Save the CDF to a CSV file
file_name_cdf = os.path.join(folder_path_100dim, f"cdf_{high_dim_class_no}dim.csv")
np.savetxt(file_name_cdf, cdf_100dim, delimiter=",", fmt="%.3f")

# save the limiting arrival rates and zeta of the 100 dimensional problem
file_name_lambda_limiting = os.path.join(folder_path_100dim, f"hourly_limiting_lambda_{high_dim_class_no}dim.csv")
file_name_zeta = os.path.join(folder_path_100dim, f"hourly_zeta_{high_dim_class_no}dim.csv")

np.savetxt(file_name_lambda_limiting, lambd_limit_hourly_100dim, delimiter = ",")
np.savetxt(file_name_zeta, zeta_hourly_100dim, delimiter = ",")