
# PORT-CITY SIMULATION DATA - DATA PREPROCESSING

## Import Dependencies & Mount Google Drive





In [1]:
import pickle
import os
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive
# The loading and saving code in this notebook reads from and writes to paths
# under /content/drive/MyDrive/, so the drive must be mounted before the
# pipeline is run. `google.colab` requires a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Define variables

In [3]:
# Road-segment indices (k) that are extracted from the raw simulation arrays
interesting_k = [6, 43, 62, 70]

# Variable names, in the order in which they are stacked for every segment
variable_names = [
    "Speed Cars",
    "Speed Trucks",
    "Number of Cars",
    "Number of Trucks",
    "Flow Cars",
    "Flow Trucks",
]

# Position of each variable inside a segment's group of rows
variable_name_to_index = {name: position for position, name in enumerate(variable_names)}

# Aggregation keyword associated with each variable:
# speeds and vehicle counts use "mean", flows use "sum"
aggregation_types = {
    **dict.fromkeys(variable_names[:4], "mean"),
    **dict.fromkeys(variable_names[4:], "sum"),
}

# Segment name -> position of the segment's group of rows in the reshaped data
# (same order as the entries of `interesting_k`)
index_segments_k = {
    name: position
    for position, name in enumerate(
        ["Elicoidale Downstream", "Lungomare Canepa", "Elicoidale Upstream", "Via di Francia"]
    )
}

# Number of variables kept per segment after preprocessing; "Via di Francia"
# keeps only three because the pipeline removes its truck variables
segment_variable_count = {
    **dict.fromkeys(index_segments_k, 6),
    "Via di Francia": 3,
}

## Functions

In [4]:
def import_pickle(path):
    """Load and return the object pickled at `path`.

    Args:
        path: Location of the pickle file.

    Returns:
        The unpickled object, or None when `path` does not exist (a notice
        naming the missing path is printed in that case).

    Raises:
        Whatever `pickle.load` raises if the file exists but cannot be
        unpickled (e.g. EOFError for an empty file).
    """
    if not os.path.exists(path):
        # Name the path so the caller can tell which file is missing.
        print(f"File not found: {path}")
        return None

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        loaded = pickle.load(f)

    # Report success only after the load has actually succeeded.
    print("Pickle file imported from: ", path)
    return loaded

In [5]:
def get_key_by_value(dictionary, value):
    """Return the first key of `dictionary` whose value equals `value`, or None if none does."""
    matching_keys = (key for key, candidate in dictionary.items() if candidate == value)
    return next(matching_keys, None)

In [6]:
def get_shuffled_indices(N, B):
    """Draw 2*B distinct indices from 0..N-1 and return them as two disjoint halves of size B.

    Sampling uses NumPy's global random state (`np.random.choice` without replacement).
    """
    drawn = np.random.choice(np.arange(N), size=2 * B, replace=False)
    # `drawn` holds exactly 2*B entries, so the second half runs to the end.
    first_half, second_half = drawn[:B], drawn[B:]
    return first_half, second_half

In [7]:
def reshape_raw_data(data, interesting_k, variable_list):
    """Stack the selected segments of every variable along a new first axis.

    Args:
    - data: Not used by this function.
    - interesting_k (list): Segment indices to extract (second axis of each variable).
    - variable_list (list): Arrays indexed as [run, segment, time chunk]; the shape
      of the first one is used for all of them.

    Returns:
    - numpy.ndarray: Array of shape (len(interesting_k) * len(variable_list), runs, time chunks).
      Row `i * len(variable_list) + j` holds variable `j` of segment `interesting_k[i]`.
    """
    run_count, _, chunk_count = variable_list[0].shape
    variable_count = len(variable_list)

    stacked = np.zeros((len(interesting_k) * variable_count, run_count, chunk_count))

    # Walk the output rows directly: each row maps back to one (segment, variable) pair.
    for row in range(stacked.shape[0]):
        segment_position, variable_position = divmod(row, variable_count)
        segment = interesting_k[segment_position]
        stacked[row, :, :] = variable_list[variable_position][:, segment, :]

    return stacked


In [8]:
def print_first_dimension_organization(reaggregated_data):
    """
    Print which rows of the first dimension belong to which segment.

    The first dimension is divided evenly among the segments listed in
    `index_segments_k`; every printed line lists all names in `variable_names`.

    Args:
    - reaggregated_data (numpy.ndarray): Array whose first dimension is described.
    """
    segment_ids = list(index_segments_k.values())
    total_rows = reaggregated_data.shape[0]

    if segment_ids:
        rows_per_segment = total_rows // len(segment_ids)
        segment_total = len(segment_ids)
    else:
        rows_per_segment = 1
        segment_total = total_rows // rows_per_segment

    for position in range(segment_total):
        first_row = position * rows_per_segment
        last_row = first_row + rows_per_segment - 1
        segment_id = segment_ids[position]
        variables = ", ".join(variable_names)
        segment_name = get_key_by_value(index_segments_k, segment_id)
        print(f"{first_row} to {last_row}: Segment [{segment_name}] (Variables {variables})")


In [9]:
def remove_variables_for_segment(data, segment_key, variables_to_remove):
    """
    Drop selected variables of one segment from the first dimension of `data`.

    Each segment is expected to occupy `len(variable_names)` consecutive rows,
    positioned according to its value in `index_segments_k`. Rows belonging to
    other segments are kept unchanged.

    Args:
    - data (numpy.ndarray): Array indexed as data[row, :, :].
    - segment_key (str): Key of the segment in `index_segments_k`.
    - variables_to_remove (list): Variable names (keys of `variable_name_to_index`) to drop.

    Returns:
    - numpy.ndarray: New array without the rows of the removed variables.
    """
    block_size = len(variable_names)
    block_start = index_segments_k[segment_key] * block_size
    block_stop = block_start + block_size

    # Offsets (within the segment's block) of the variables to drop
    dropped_offsets = {variable_name_to_index[name] for name in variables_to_remove}

    rows_before = list(range(block_start))
    rows_inside = [
        block_start + offset
        for offset in range(block_size)
        if offset not in dropped_offsets
    ]
    rows_after = list(range(block_stop, data.shape[0]))

    # Keep only the selected rows along the first dimension
    return data[rows_before + rows_inside + rows_after, :, :]


In [10]:
def print_segment_index_and_variables_in_data(data):
    """
    Print, for every segment, the range of rows it occupies in `data` and the
    shape of that slice.

    Segments are visited in the order of `index_segments_k`; the number of rows
    per segment is read from `segment_variable_count`.

    Args:
    - data (numpy.ndarray): Array whose first dimension is split among the segments.
    """
    separator = "-" * 50

    print(f"Data shape: {data.shape}")
    print(separator)

    next_row = 0
    for segment_name in index_segments_k:
        row_count = segment_variable_count[segment_name]

        # Inclusive row range of this segment
        first_row = next_row
        last_row = first_row + row_count - 1
        segment_slice = data[first_row:first_row + row_count]

        print(f"Segment: {segment_name}")
        print(f"Indexes: {first_row} to {last_row}")
        print(f"Data slice shape: {segment_slice.shape}")
        print(separator)

        # The next segment starts right after this one
        next_row += row_count


In [11]:
def save_dataset(dataset, filename, directory="/content/drive/MyDrive/TESE/data/preprocessed/sequential/"):
    """Pickle `dataset` to `directory/filename`.

    The directory (including missing parents) is created first; an existing
    file with the same name is overwritten.

    Args:
        dataset: Object to pickle (e.g., a NumPy array).
        filename: Name of the output file.
        directory: Folder that receives the file.
    """
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, filename), 'wb') as out_file:
        pickle.dump(dataset, out_file)

In [12]:
def run_pipeline(filename_scenario0, filename_scenario1):
    """Preprocess two raw scenario pickles and save one file per time chunk.

    For each of the two scenarios the pipeline:
      1. loads the raw pickle from the raw-data directory on Google Drive,
      2. stacks the six traffic variables of the segments in `interesting_k`
         with `reshape_raw_data`,
      3. removes the three truck variables of "Via di Francia",
      4. pickles every slice along the last axis (time chunk) to
         `<preprocessed root>/<raw filename without extension>/data_s{0|1}_{chunk}.pkl`,
         with chunks numbered from 1.

    Args:
        filename_scenario0: File name of the first scenario inside the raw-data directory.
        filename_scenario1: File name of the second scenario inside the raw-data directory.

    Raises:
        FileNotFoundError: If either raw file could not be loaded
            (`import_pickle` returned None).
    """
    raw_directory = '/content/drive/MyDrive/TESE/data/raw/'
    output_root = "/content/drive/MyDrive/TESE/data/preprocessed/sequential/"

    # Keys of the raw dictionaries, listed in the same order as `variable_names`
    raw_keys = ['speed_cars', 'speed_trucks', 'num_cars', 'num_trucks', 'flow_cars', 'flow_trucks']
    removed_variables = ["Speed Trucks", "Number of Trucks", "Flow Trucks"]

    filenames = [filename_scenario0, filename_scenario1]
    raw_paths = [raw_directory + name for name in filenames]
    raw_scenarios = [import_pickle(path) for path in raw_paths]

    # Fail early with a clear message instead of a TypeError on None further down.
    missing_paths = [path for path, raw in zip(raw_paths, raw_scenarios) if raw is None]
    if missing_paths:
        raise FileNotFoundError("Raw data could not be loaded from: " + ", ".join(missing_paths))

    print()

    reshaped_scenarios = [
        reshape_raw_data(raw, interesting_k, [raw[key] for key in raw_keys])
        for raw in raw_scenarios
    ]

    for reshaped in reshaped_scenarios:
        print(reshaped.shape)
        print_first_dimension_organization(reshaped)
        print()

    # After this step "Via di Francia" holds three rows, which is what
    # `segment_variable_count` describes for the report printed below.
    trimmed_scenarios = [
        remove_variables_for_segment(reshaped, "Via di Francia", removed_variables)
        for reshaped in reshaped_scenarios
    ]

    for trimmed in trimmed_scenarios:
        print_segment_index_and_variables_in_data(trimmed)

    print()

    for scenario_id, (raw_filename, trimmed) in enumerate(zip(filenames, trimmed_scenarios)):
        # splitext removes the extension whatever its length (the earlier [:-4]
        # slice was only correct for four-character extensions such as ".pkl").
        output_directory = output_root + os.path.splitext(raw_filename)[0] + "/"
        for timechunk in range(trimmed.shape[2]):
            save_dataset(trimmed[:, :, timechunk],
                         f"data_s{scenario_id}_{timechunk + 1}.pkl",
                         directory=output_directory)

    print("End of pipeline! Files are saved!")


## Data pipeline

In [13]:
# Raw pickle file names; each is resolved against the raw-data directory
# inside `run_pipeline`.

# Earlier file names, kept commented out
#filename_scenario_0 = 'data_scenario0.pkl'
#filename_scenario_1 = 'data_scenario1.pkl'

# --- Gaussian ---
filename_scenario_0_new = "data_scenario0_new.pkl"
filename_scenario_1_new = "data_scenario1_new.pkl"

# --- Uniform (normal / alternative, weekday and weekend) ---
filename_scenario_2_normal = "data_scenario2_unif.pkl"
filename_scenario_2_alt = "data_scenario2_alt_unif.pkl"
filename_scenario_2_weekend_normal = "data_scenario2_unif_weekend.pkl"
filename_scenario_2_weekend_alt = "data_scenario2_alt_unif_weekend.pkl"

# --- Poisson ---
# Note: the variables named scenario_0 / scenario_1 point at the
# "data_scenario2" / "data_scenario2_alt" files respectively.
filename_scenario_0_poisson_weekday = "data_scenario2_poisson_feriale.pkl"
filename_scenario_0_poisson_weekend = "data_scenario2_poisson_weekend.pkl"
filename_scenario_1_poisson_weekday = "data_scenario2_alt_poisson_feriale.pkl"
filename_scenario_1_poisson_weekend = "data_scenario2_alt_poisson_weekend.pkl"

In [14]:
# Preprocess the Poisson weekday pair ("..._poisson_feriale.pkl" files);
# one pickle per time chunk is written for each of the two scenarios.
run_pipeline(filename_scenario_0_poisson_weekday, filename_scenario_1_poisson_weekday)

Pickle file imported from:  /content/drive/MyDrive/TESE/data/raw/data_scenario2_poisson_feriale.pkl
Pickle file imported from:  /content/drive/MyDrive/TESE/data/raw/data_scenario2_alt_poisson_feriale.pkl

(24, 1000, 12)
0 to 5: Segment [Elicoidale Downstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
6 to 11: Segment [Lungomare Canepa] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
12 to 17: Segment [Elicoidale Upstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
18 to 23: Segment [Via di Francia] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)

(24, 1000, 12)
0 to 5: Segment [Elicoidale Downstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
6 to 11: Segment [Lungomare Canepa] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of

In [15]:
# Preprocess the Poisson weekend pair ("..._poisson_weekend.pkl" files);
# one pickle per time chunk is written for each of the two scenarios.
run_pipeline(filename_scenario_0_poisson_weekend, filename_scenario_1_poisson_weekend)

Pickle file imported from:  /content/drive/MyDrive/TESE/data/raw/data_scenario2_poisson_weekend.pkl
Pickle file imported from:  /content/drive/MyDrive/TESE/data/raw/data_scenario2_alt_poisson_weekend.pkl

(24, 1000, 12)
0 to 5: Segment [Elicoidale Downstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
6 to 11: Segment [Lungomare Canepa] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
12 to 17: Segment [Elicoidale Upstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
18 to 23: Segment [Via di Francia] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)

(24, 1000, 12)
0 to 5: Segment [Elicoidale Downstream] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of Trucks, Flow Cars, Flow Trucks)
6 to 11: Segment [Lungomare Canepa] (Variables Speed Cars, Speed Trucks, Number of Cars, Number of

In [16]:
#run_pipeline(filename_scenario_0_new, filename_scenario_1_new)

In [17]:
#run_pipeline(filename_scenario_2_normal, filename_scenario_2_alt)

In [18]:
#run_pipeline(filename_scenario_2_weekend_normal, filename_scenario_2_weekend_alt)