In [1]:
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import matplotlib.axes as axe
import pandas as pd
import datetime as dt
import gurobipy as gp
from gurobipy import GRB
import cvxpy as cp
import yaml

import random
from itertools import chain, combinations, tee
import time

import os


# Data Capping

In [2]:
start_time = time.time()

directory_path = '../data/pems_flow___101_N_Sep_to_Nov_2024/'
file_name_list = [f for f in os.listdir(directory_path) \
                  if os.path.isfile(os.path.join(directory_path, f)) and f[-1] == 'x' and f[0] != "~"]
file_name_list.sort()

thresh_lower = 0.1

# df = pd.read_excel(directory_path + file_name)
df_data_filtered = pd.read_csv(directory_path + 'data_filtered.csv', index_col=0)

dict_data_capped = {}
# dict_data_capped["Data Category"] = \
#     ["Station Type", "Number (data points)", "Mean (capped flow)", "Std Dev (capped flow)"]

for column_name_full in list(df_data_filtered.columns):
    print("column_name_full:", column_name_full)
    
    vals_list_str = df_data_filtered[column_name_full].tolist()
    
#     print("vals_list_str:", vals_list_str)

    column_name = column_name_full[:-5]
    
    dict_data_capped[column_name] = {}
    dict_data_capped[column_name]["Station Type"] = vals_list_str[0]
    
    vals_list = [float(val) for val in vals_list_str[1:]]
    max_val = max(vals_list)
    dict_data_capped[column_name]["Flows (capped)"] = \
        [val for val in vals_list if val != 0.0 and val >= max_val * thresh_lower]
    
    
#     print()
#     print("vals_list:", vals_list)
#     print()
#     print("vals_array_capped:", vals_array_capped)
    
#     dict_data_capped[column_name] += [vals_array_capped.shape[0]]
#     dict_data_capped[column_name] += [np.mean(vals_array_capped)]
#     dict_data_capped[column_name] += [np.std(vals_array_capped)]

    
    
#     print("dict_data_capped[column_name]:", dict_data_capped[column_name])
#     print()


end_time = time.time()

print()
print("Time:", end_time - start_time)



column_name_full: 001___402376_Palo_Alto___main.xlsx
column_name_full: 002___402377_Palo_Alto___main.xlsx
column_name_full: 003___402379_Palo_Alto___main.xlsx
column_name_full: 004___425696_Palo_Alto___off.xlsx
column_name_full: 005___425697_Palo_Alto___main.xlsx
column_name_full: 006___425897_Palo_Alto___on.xlsx
column_name_full: 007___402380_Palo_Alto___main.xlsx
column_name_full: 008___404529_East_Palo_Alto___main.xlsx
column_name_full: 009___422116_East_Palo_Alto___main.xlsx
column_name_full: 010___403059_East_Palo_Alto___off.xlsx
column_name_full: 011___403060_East_Palo_Alto___on.xlsx
column_name_full: 012___400859_East_Palo_Alto___main.xlsx
column_name_full: 013___403061_East_Palo_Alto___on.xlsx
column_name_full: 014___404533_East_Palo_Alto___main.xlsx
column_name_full: 015___408267_East_Palo_Alto___on.xlsx
column_name_full: 016___400981_East_Palo_Alto___main.xlsx
column_name_full: 017___402398_East_Palo_Alto___main.xlsx
column_name_full: 018___404534_Menlo_Park___main.xlsx
colum

# Data Consolidation

In [3]:
# Assumption: The data starts and ends with main.
# Assumption: There is > 1 column.

column_name_list = list(dict_data_capped.keys())[1:]

dict_data_consolidated = {}

counter = 0
for column_index, column_name in enumerate(list(dict_data_capped.keys())):
#     print("column_index:", column_index)
    
    if column_index == 0:
        station_type_current = dict_data_capped[column_name]["Station Type"]
        
        dict_data_consolidated[counter] = {}
        dict_data_consolidated[counter]["Station Type"] = dict_data_capped[column_name]["Station Type"]
        dict_data_consolidated[counter]["Start Station"] = column_name
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] = dict_data_capped[column_name]["Flows (capped)"]
        
    elif column_index != 0 and dict_data_capped[column_name]["Station Type"] != station_type_current:
        counter += 1
        
        station_type_current = dict_data_capped[column_name]["Station Type"]
        
        dict_data_consolidated[counter] = {}
        dict_data_consolidated[counter]["Station Type"] = dict_data_capped[column_name]["Station Type"]
        dict_data_consolidated[counter]["Start Station"] = column_name
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] = dict_data_capped[column_name]["Flows (capped)"]
        
    elif column_index != 0 and dict_data_capped[column_name]["Station Type"] == station_type_current:
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] += dict_data_capped[column_name]["Flows (capped)"]
        
    else:
        assert 1 == 0, "There should be no such case."


In [4]:
len(dict_data_consolidated)

63

In [5]:
# Add (1) Number of data points, (2) Mean of capped flow, (3) Std Dev of capped flow, from flow data.

for counter in list(dict_data_consolidated.keys()):
    print("counter:", counter)
    print()
    
    flow_data_array = np.array(dict_data_consolidated[counter]["Flow Data"])
#     print("flow_data_array:", flow_data_array)
#     print()
    
    dict_data_consolidated[counter]["Number (data points)"] = flow_data_array.shape[0]
    dict_data_consolidated[counter]["Mean (capped flow)"] = np.mean(flow_data_array)
    dict_data_consolidated[counter]["Std Dev (capped flow)"] = np.std(flow_data_array)

dict_data_consolidated[0]

counter: 0

counter: 1

counter: 2

counter: 3

counter: 4

counter: 5

counter: 6

counter: 7

counter: 8

counter: 9

counter: 10

counter: 11

counter: 12

counter: 13

counter: 14

counter: 15

counter: 16

counter: 17

counter: 18

counter: 19

counter: 20

counter: 21

counter: 22

counter: 23

counter: 24

counter: 25

counter: 26

counter: 27

counter: 28

counter: 29

counter: 30

counter: 31

counter: 32

counter: 33

counter: 34

counter: 35

counter: 36

counter: 37

counter: 38

counter: 39

counter: 40

counter: 41

counter: 42

counter: 43

counter: 44

counter: 45

counter: 46

counter: 47

counter: 48

counter: 49

counter: 50

counter: 51

counter: 52

counter: 53

counter: 54

counter: 55

counter: 56

counter: 57

counter: 58

counter: 59

counter: 60

counter: 61

counter: 62



{'Station Type': 'main',
 'Start Station': '001___402376_Palo_Alto___main',
 'End Station': '003___402379_Palo_Alto___main',
 'Flow Data': [3258.0,
  4324.0,
  4520.0,
  4025.0,
  3506.0,
  3300.0,
  3159.0,
  3331.0,
  4228.0,
  4318.0,
  3837.0,
  3518.0,
  3422.0,
  3125.0,
  3253.0,
  4243.0,
  4455.0,
  4202.0,
  3663.0,
  3296.0,
  3435.0,
  3103.0,
  4053.0,
  3998.0,
  3775.0,
  3342.0,
  3287.0,
  3325.0,
  4039.0,
  4944.0,
  5117.0,
  4760.0,
  4195.0,
  4008.0,
  3966.0,
  3391.0,
  4452.0,
  4653.0,
  4150.0,
  3718.0,
  3407.0,
  3098.0,
  3026.0,
  3996.0,
  4518.0,
  4005.0,
  3342.0,
  3517.0,
  3229.0,
  3800.0,
  4706.0,
  4612.0,
  4146.0,
  4000.0,
  4024.0,
  4065.0,
  3886.0,
  4681.0,
  4786.0,
  4430.0,
  3941.0,
  3680.0,
  3602.0,
  3502.0,
  4460.0,
  4688.0,
  4229.0,
  3461.0,
  3037.0,
  3188.0,
  3574.0,
  4488.0,
  4606.0,
  3984.0,
  3608.0,
  3216.0,
  3158.0,
  4143.0,
  4289.0,
  4595.0,
  4272.0,
  3658.0,
  3419.0,
  3163.0,
  3931.0,
  4674.0,
  

# Data Consistency Enforcement

In [6]:
dict_data_consolidated_to_save = {}
dict_data_consolidated_to_save["Data Category"] \
    = ["Station Type", "Start Station", "End Station", \
       "Number (data points)", "Mean (capped flow)", "Std Dev (capped flow)"]

for counter in list(dict_data_consolidated.keys()):
    dict_data_consolidated_to_save[counter] = []
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Station Type"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Start Station"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["End Station"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Number (data points)"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Mean (capped flow)"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Std Dev (capped flow)"]]

# df_dict_data_consolidated_to_save = pd.DataFrame(dict_data_consolidated_to_save)
# df_dict_data_consolidated_to_save.to_csv(directory_path + 'data_capped_consolidated.csv', index = False)



## Optimization Problem for Data Consistency Enforcement

In [7]:
## Define od_pairs_dict:

num_data = len(list(dict_data_consolidated.keys()))
# print(num_data)

# flow_data_consistent = np.float_(df_data_consistent.iloc[-1].tolist())

station_type_list = [dict_data_consolidated[index]["Station Type"] for index in range(num_data)]
origin_list = [0] + [index for index in range(num_data) if station_type_list[index] == "on"]
destination_list = [index for index in range(num_data) if station_type_list[index] == "off"] + [num_data - 1]
mainline_list = [index for index in range(num_data) if station_type_list[index] == "main"][1:-1]

print("origin_list:", origin_list)
print("destination_list:", destination_list)

counter = 0
od_pairs_dict = {}
for origin_index in origin_list:
    for destination_index in destination_list:
        if destination_index > origin_index:
            od_pairs_dict[counter] = [origin_index, destination_index]
            counter += 1
    
od_pairs_dict


origin_list: [0, 3, 6, 8, 10, 15, 17, 19, 21, 24, 26, 30, 32, 34, 39, 42, 46, 49, 51, 53, 57]
destination_list: [1, 5, 12, 14, 23, 28, 36, 38, 41, 44, 48, 55, 59, 61, 62]


{0: [0, 1],
 1: [0, 5],
 2: [0, 12],
 3: [0, 14],
 4: [0, 23],
 5: [0, 28],
 6: [0, 36],
 7: [0, 38],
 8: [0, 41],
 9: [0, 44],
 10: [0, 48],
 11: [0, 55],
 12: [0, 59],
 13: [0, 61],
 14: [0, 62],
 15: [3, 5],
 16: [3, 12],
 17: [3, 14],
 18: [3, 23],
 19: [3, 28],
 20: [3, 36],
 21: [3, 38],
 22: [3, 41],
 23: [3, 44],
 24: [3, 48],
 25: [3, 55],
 26: [3, 59],
 27: [3, 61],
 28: [3, 62],
 29: [6, 12],
 30: [6, 14],
 31: [6, 23],
 32: [6, 28],
 33: [6, 36],
 34: [6, 38],
 35: [6, 41],
 36: [6, 44],
 37: [6, 48],
 38: [6, 55],
 39: [6, 59],
 40: [6, 61],
 41: [6, 62],
 42: [8, 12],
 43: [8, 14],
 44: [8, 23],
 45: [8, 28],
 46: [8, 36],
 47: [8, 38],
 48: [8, 41],
 49: [8, 44],
 50: [8, 48],
 51: [8, 55],
 52: [8, 59],
 53: [8, 61],
 54: [8, 62],
 55: [10, 12],
 56: [10, 14],
 57: [10, 23],
 58: [10, 28],
 59: [10, 36],
 60: [10, 38],
 61: [10, 41],
 62: [10, 44],
 63: [10, 48],
 64: [10, 55],
 65: [10, 59],
 66: [10, 61],
 67: [10, 62],
 68: [15, 23],
 69: [15, 28],
 70: [15, 36],
 71

In [8]:
## Define constraint_matrix:

num_od = len(od_pairs_dict)

constraint_matrix = np.zeros((num_data, num_od))
for i in range(num_data):
    if station_type_list[i] == "on" or i == 0:
        indices_set_to_one = [counter for counter in range(num_od) if od_pairs_dict[counter][0] == i]
        constraint_matrix[i, indices_set_to_one] = 1
    elif station_type_list[i] == "off" or i == num_data - 1:
        indices_set_to_one = [counter for counter in range(num_od) if od_pairs_dict[counter][1] == i]
        constraint_matrix[i, indices_set_to_one] = 1
    else:
        indices_set_to_one = [counter for counter in range(num_od) \
                              if od_pairs_dict[counter][0] < i and od_pairs_dict[counter][1] > i]
        constraint_matrix[i, indices_set_to_one] = 1
        
constraint_matrix


array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [9]:
## Optimization Problem:

# Variables:    
data_var = cp.Variable(num_data)
data_od = cp.Variable(num_od)

# Cost Function:
func = 0.0
data_mean = np.array([dict_data_consolidated[counter]["Mean (capped flow)"] for counter in range(num_data)])
kernel = np.diag([1/dict_data_consolidated[counter]["Mean (capped flow)"]**2 for counter in range(num_data)])

func += cp.quad_form(data_var - data_mean, kernel)

# Objective:
objective = cp.Minimize(func)

# Constraints:
constraints = []
constraints += [constraint_matrix @ data_od == data_var]
constraints += [data_od >= 0.0]

# Problem:
prob = cp.Problem(objective, constraints)

# Solve:
result = prob.solve()

# Extract Values:

data_var_values = data_var.value
data_od_values = data_od.value

print("data_var_values.shape:", data_var_values.shape)
print("data_var_values:", data_var_values)
print()
print("data_od_values.shape:", data_od_values.shape)
print("data_od_values:", data_od_values)


data_var_values.shape: (63,)
data_var_values: [4077.14716085  612.53961885 3464.60754682  770.95648744 4235.56403649
   98.03492783  257.07363436 4394.60275434  449.31250701 4843.91526048
  232.28069199 5076.1959469   620.42320417 4455.77273523  775.76575302
  948.80689894 4628.8138733    57.07490815 4685.88876136  175.6527938
 4861.54154731  535.98743678 5397.52898765  460.44075392  424.22728858
 5361.31552175  226.53279865 5587.8483275   867.07591575 4720.77241048
  409.6886826  5130.46109553  879.73655869 6010.19765987  194.91262553
 6205.11029256  123.89549894 6081.21474468  752.27456028  341.14220979
 5670.08239062  546.69982717  439.61469119 5562.99724019   77.59449562
 5485.40273127   90.21267462 5575.61536494  762.39618781  608.66753039
 5421.88669588  459.24512608 5881.131811    596.87506494 6478.00686218
  789.73387148 5688.27297942  635.30465818 6323.57762691  573.46293878
 5750.11466385  818.01401693 4932.1006322 ]

data_od_values.shape: (192,)
data_od_values: [ 6.12539617e

In [10]:
print("Residual:", np.linalg.norm(constraint_matrix @ data_od.value - data_var.value))

Residual: 7.823835578637513e-05


In [11]:
dict_data_consolidated_to_save["Data Category"] += ["Consistent Flow"]

for counter in list(dict_data_consolidated.keys()):
    print("data_var_value[counter]:", data_var_values[counter])
    dict_data_consolidated_to_save[counter].append(data_var_values[counter])

# print()
# print("dict_data_consolidated_to_save[10]:", dict_data_consolidated_to_save[10])
# print()
# print("dict_data_consolidated_to_save:", dict_data_consolidated_to_save)

# print()
# for counter in list(dict_data_consolidated.keys()):
#     print("counter")
#     print("len(list(dict_data_consolidated_to_save[counter])):", \
#           len(list(dict_data_consolidated_to_save[counter])))

# print("len(list(dict_data_consolidated_to_save[Data Category]))", \
#       len(list(dict_data_consolidated_to_save["Data Category"] )))



data_var_value[counter]: 4077.1471608500797
data_var_value[counter]: 612.5396188457191
data_var_value[counter]: 3464.6075468234853
data_var_value[counter]: 770.956487436798
data_var_value[counter]: 4235.564036487306
data_var_value[counter]: 98.03492782898499
data_var_value[counter]: 257.07363436211415
data_var_value[counter]: 4394.602754336838
data_var_value[counter]: 449.31250700551277
data_var_value[counter]: 4843.915260479178
data_var_value[counter]: 232.2806919862353
data_var_value[counter]: 5076.195946904687
data_var_value[counter]: 620.4232041667378
data_var_value[counter]: 4455.772735229941
data_var_value[counter]: 775.7657530162196
data_var_value[counter]: 948.8068989399695
data_var_value[counter]: 4628.813873303177
data_var_value[counter]: 57.07490815211697
data_var_value[counter]: 4685.888761359458
data_var_value[counter]: 175.65279380219042
data_var_value[counter]: 4861.541547312207
data_var_value[counter]: 535.9874367796592
data_var_value[counter]: 5397.528987652317
data_va

In [12]:
df_dict_data_consolidated_to_save = pd.DataFrame(dict_data_consolidated_to_save)
df_dict_data_consolidated_to_save.to_csv(directory_path + 'data_capped_consolidated.csv', index = False)


# Origin-Destination Pair Generation

In [13]:
# ## Optimization Problem:

# Variables:
num_od = len(od_pairs_dict)
data_od_max_entr = cp.Variable(num_od)

# Cost Function:
func = 0.0
for k in range(num_od):
    func += cp.entr(data_od_max_entr[k]) + data_od_max_entr[k]

# Objective:
objective = cp.Maximize(func)

# Constraints:
constraints = []
# constraints += [constraint_matrix @ data_od_max_entr == data_var.value]
constraints += [constraint_matrix @ data_od_max_entr == constraint_matrix @ data_od.value]
constraints += [data_od_max_entr >= 0.0]

# Problem:
prob = cp.Problem(objective, constraints)

# Solve:
result = prob.solve()

# Test feasibility:
if prob.status == "infeasible":
    print("Problem is infeasible") 
else:
    print("Problem is feasible") 
print()

# Extract Values:

data_od_max_entr_values = data_od_max_entr.value
print("data_od_max_entr_values.shape:", data_od_max_entr_values.shape)
print("data_od_max_entr_values:", data_od_max_entr_values)


Problem is feasible

data_od_max_entr_values.shape: (192,)
data_od_max_entr_values: [6.12539617e+02 8.01904526e+01 4.13649586e+02 5.17217749e+02
 2.09301150e+02 3.48246701e+02 3.78605145e+01 2.29864661e+02
 1.56996252e+02 2.05290503e+01 1.98371494e+02 1.52678919e+02
 9.97271841e+01 1.42257758e+02 8.57716057e+02 1.78444673e+01
 9.20465484e+01 1.15094185e+02 4.65752608e+01 7.74914905e+01
 8.42388957e+00 5.11492318e+01 3.49355046e+01 4.56620766e+00
 4.41432391e+01 3.39748215e+01 2.21920003e+01 3.16558981e+01
 1.90863743e+02 3.14204624e+01 3.92880828e+01 1.58982308e+01
 2.64519797e+01 2.87505091e+00 1.74597387e+01 1.19251273e+01
 1.55815742e+00 1.50682040e+01 1.15972002e+01 7.57510869e+00
 1.08056361e+01 6.51506561e+01 5.49164210e+01 6.86666933e+01
 2.77868808e+01 4.62322624e+01 5.02558059e+00 3.05160865e+01
 2.08426424e+01 2.72404740e+00 2.63361645e+01 2.02695037e+01
 1.32398659e+01 1.88860180e+01 1.13870338e+02 2.83901917e+01
 3.54990463e+01 1.43649438e+01 2.39008689e+01 2.59765172e+00
 

In [14]:
## Store data in pandas:
# Start counter
# End counter
# Start station
# End station
# O-D flow

dict_od_data_to_save = {}
dict_od_data_to_save["Data Category"] \
    = ["Start Index", "End Index", "Start Station", "End Station", \
       "O-D Flow (Initialization)", "O-D Flow (Max Entropy)"]

for index in range(num_od):
    print("index:", index)
    dict_od_data_to_save[index] = []
    
    start_counter = od_pairs_dict[index][0]
    end_counter = od_pairs_dict[index][1]
    print("start_counter:", start_counter)
    print("end_counter:", end_counter)
    print()

    dict_od_data_to_save[index] += [start_counter]
    dict_od_data_to_save[index] += [end_counter]
    dict_od_data_to_save[index] += [dict_data_consolidated[start_counter]["Start Station"]]
    dict_od_data_to_save[index] += [dict_data_consolidated[end_counter]["End Station"]]
    dict_od_data_to_save[index] += [max(data_od_values[index], 0.0) ]
    dict_od_data_to_save[index] += [max(data_od_max_entr_values[index], 0.0)]

dict_od_data_to_save

index: 0
start_counter: 0
end_counter: 1

index: 1
start_counter: 0
end_counter: 5

index: 2
start_counter: 0
end_counter: 12

index: 3
start_counter: 0
end_counter: 14

index: 4
start_counter: 0
end_counter: 23

index: 5
start_counter: 0
end_counter: 28

index: 6
start_counter: 0
end_counter: 36

index: 7
start_counter: 0
end_counter: 38

index: 8
start_counter: 0
end_counter: 41

index: 9
start_counter: 0
end_counter: 44

index: 10
start_counter: 0
end_counter: 48

index: 11
start_counter: 0
end_counter: 55

index: 12
start_counter: 0
end_counter: 59

index: 13
start_counter: 0
end_counter: 61

index: 14
start_counter: 0
end_counter: 62

index: 15
start_counter: 3
end_counter: 5

index: 16
start_counter: 3
end_counter: 12

index: 17
start_counter: 3
end_counter: 14

index: 18
start_counter: 3
end_counter: 23

index: 19
start_counter: 3
end_counter: 28

index: 20
start_counter: 3
end_counter: 36

index: 21
start_counter: 3
end_counter: 38

index: 22
start_counter: 3
end_counter: 41

i

{'Data Category': ['Start Index',
  'End Index',
  'Start Station',
  'End Station',
  'O-D Flow (Initialization)',
  'O-D Flow (Max Entropy)'],
 0: [0,
  1,
  '001___402376_Palo_Alto___main',
  '004___425696_Palo_Alto___off',
  612.539616936298,
  612.539616936298],
 1: [0,
  5,
  '001___402376_Palo_Alto___main',
  '010___403059_East_Palo_Alto___off',
  98.02842969746226,
  80.19045260172992],
 2: [0,
  12,
  '001___402376_Palo_Alto___main',
  '027___403206_Redwood_City___off',
  309.21104560854025,
  413.6495858413996],
 3: [0,
  14,
  '001___402376_Palo_Alto___main',
  '032___410094_Redwood_City___off',
  340.9217997502863,
  517.2177492843396],
 4: [0,
  23,
  '001___402376_Palo_Alto___main',
  '045___410111_Belmont___off',
  225.44612634033228,
  209.30114967791195],
 5: [0,
  28,
  '001___402376_Palo_Alto___main',
  '051___409888_San_Mateo___off',
  268.39336877576807,
  348.2467008641436],
 6: [0,
  36,
  '001___402376_Palo_Alto___main',
  '067___405847_San_Mateo___off',
  124.0

In [15]:
df_od_data_to_save = pd.DataFrame(dict_od_data_to_save)
df_od_data_to_save.to_csv(directory_path + 'data_od.csv', index = False)


In [16]:
print("Residual:", np.linalg.norm(constraint_matrix @ data_od_max_entr.value - data_var.value))


Residual: 0.00015070692506787074


# Scratch Work

In [None]:
ell = [1, 2, 3, 4, 5]
# ell[1:-1]
ell += [6]
ell