In [1]:
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import matplotlib.axes as axe
import pandas as pd
import datetime as dt
import gurobipy as gp
from gurobipy import GRB
import cvxpy as cp
import yaml

import random
from itertools import chain, combinations, tee
import time

import os


# Data Capping

In [2]:
start_time = time.time()

directory_path = '../data/pems___101_N_Sep_2024/'
file_name_list = [f for f in os.listdir(directory_path) \
                  if os.path.isfile(os.path.join(directory_path, f)) and f[-1] == 'x' and f[0] != "~"]
file_name_list.sort()

thresh_lower = 0.1

# df = pd.read_excel(directory_path + file_name)
df_data_filtered = pd.read_csv(directory_path + 'data_filtered.csv', index_col=0)

dict_data_capped = {}
# dict_data_capped["Data Category"] = \
#     ["Station Type", "Number (data points)", "Mean (capped flow)", "Std Dev (capped flow)"]

times_dict = {}
for column_name_full in list(df_data_filtered.columns):
    print("column_name_full:", column_name_full)
    
    vals_list_str = df_data_filtered[column_name_full].tolist()
    
#     print("vals_list_str:", vals_list_str)

    column_name = column_name_full[:-5]
    
    dict_data_capped[column_name] = {}
    dict_data_capped[column_name]["Station Type"] = vals_list_str[0]
    
    vals_list = [float(val) for val in vals_list_str[1:]]
    max_val = max(vals_list)
    dict_data_capped[column_name]["Flows (capped)"] = \
        [val for val in vals_list if val != 0.0 and val >= max_val * thresh_lower]
    
    
#     print()
#     print("vals_list:", vals_list)
#     print()
#     print("vals_array_capped:", vals_array_capped)
    
#     dict_data_capped[column_name] += [vals_array_capped.shape[0]]
#     dict_data_capped[column_name] += [np.mean(vals_array_capped)]
#     dict_data_capped[column_name] += [np.std(vals_array_capped)]

    
    
#     print("dict_data_capped[column_name]:", dict_data_capped[column_name])
#     print()


end_time = time.time()

print()
print("Time:", end_time - start_time)



column_name_full: 001___402376_Palo_Alto___main.xlsx
column_name_full: 002___402377_Palo_Alto___main.xlsx
column_name_full: 003___402379_Palo_Alto___main.xlsx
column_name_full: 004___425696_Palo_Alto___off.xlsx
column_name_full: 005___425697_Palo_Alto___main.xlsx
column_name_full: 006___425897_Palo_Alto___on.xlsx
column_name_full: 007___402380_Palo_Alto___main.xlsx
column_name_full: 008___404259_East_Palo_Alto___main.xlsx
column_name_full: 009___422116_East_Palo_Alto___main.xlsx
column_name_full: 010___403059_East_Palo_Alto___off.xlsx
column_name_full: 011___400859_East_Palo_Alto___main.xlsx
column_name_full: 012___403060_East_Palo_Alto___on.xlsx
column_name_full: 013___403061_East_Palo_Alto___on.xlsx
column_name_full: 014___404533_East_Palo_Alto___main.xlsx
column_name_full: 015___408267_East_Palo_Alto___on.xlsx
column_name_full: 016___400981_East_Palo_Alto___main.xlsx
column_name_full: 017___402398_East_Palo_Alto___main.xlsx
column_name_full: 018___404534_Menlo_Park___main.xlsx
colum

# Data Consolidation

In [3]:
# Assumption: The data starts and ends with main.
# Assumption: There is > 1 column.

column_name_list = list(dict_data_capped.keys())[1:]

dict_data_consolidated = {}

counter = 0
for column_index, column_name in enumerate(list(dict_data_capped.keys())):
#     print("column_index:", column_index)
    
    if column_index == 0:
        station_type_current = dict_data_capped[column_name]["Station Type"]
        
        dict_data_consolidated[counter] = {}
        dict_data_consolidated[counter]["Station Type"] = dict_data_capped[column_name]["Station Type"]
        dict_data_consolidated[counter]["Start Station"] = column_name
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] = dict_data_capped[column_name]["Flows (capped)"]
        
    elif column_index != 0 and dict_data_capped[column_name]["Station Type"] != station_type_current:
        counter += 1
        
        station_type_current = dict_data_capped[column_name]["Station Type"]
        
        dict_data_consolidated[counter] = {}
        dict_data_consolidated[counter]["Station Type"] = dict_data_capped[column_name]["Station Type"]
        dict_data_consolidated[counter]["Start Station"] = column_name
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] = dict_data_capped[column_name]["Flows (capped)"]
        
    elif column_index != 0 and dict_data_capped[column_name]["Station Type"] == station_type_current:
        dict_data_consolidated[counter]["End Station"] = column_name
        dict_data_consolidated[counter]["Flow Data"] += dict_data_capped[column_name]["Flows (capped)"]
        
    else:
        assert 1 == 0, "There should be no such case."


In [4]:
len(dict_data_consolidated)

61

In [5]:
# Add (1) Number of data points, (2) Mean of capped flow, (3) Std Dev of capped flow, from flow data.

for counter in list(dict_data_consolidated.keys()):
    print("counter:", counter)
    print()
    
    flow_data_array = np.array(dict_data_consolidated[counter]["Flow Data"])
#     print("flow_data_array:", flow_data_array)
#     print()
    
    dict_data_consolidated[counter]["Number (data points)"] = flow_data_array.shape[0]
    dict_data_consolidated[counter]["Mean (capped flow)"] = np.mean(flow_data_array)
    dict_data_consolidated[counter]["Std Dev (capped flow)"] = np.std(flow_data_array)

dict_data_consolidated[0]

counter: 0

counter: 1

counter: 2

counter: 3

counter: 4

counter: 5

counter: 6

counter: 7

counter: 8

counter: 9

counter: 10

counter: 11

counter: 12

counter: 13

counter: 14

counter: 15

counter: 16

counter: 17

counter: 18

counter: 19

counter: 20

counter: 21

counter: 22

counter: 23

counter: 24

counter: 25

counter: 26

counter: 27

counter: 28

counter: 29

counter: 30

counter: 31

counter: 32

counter: 33

counter: 34

counter: 35

counter: 36

counter: 37

counter: 38

counter: 39

counter: 40

counter: 41

counter: 42

counter: 43

counter: 44

counter: 45

counter: 46

counter: 47

counter: 48

counter: 49

counter: 50

counter: 51

counter: 52

counter: 53

counter: 54

counter: 55

counter: 56

counter: 57

counter: 58

counter: 59

counter: 60



{'Station Type': 'main',
 'Start Station': '001___402376_Palo_Alto___main',
 'End Station': '003___402379_Palo_Alto___main',
 'Flow Data': [3258.0,
  4324.0,
  4520.0,
  4025.0,
  3506.0,
  3300.0,
  3159.0,
  3331.0,
  4228.0,
  4318.0,
  3837.0,
  3518.0,
  3422.0,
  3125.0,
  3253.0,
  4243.0,
  4455.0,
  4202.0,
  3663.0,
  3296.0,
  3435.0,
  3103.0,
  4053.0,
  3998.0,
  3775.0,
  3342.0,
  3287.0,
  3325.0,
  4039.0,
  4944.0,
  5117.0,
  4760.0,
  4195.0,
  4008.0,
  3966.0,
  3391.0,
  4452.0,
  4653.0,
  4150.0,
  3718.0,
  3407.0,
  3098.0,
  3026.0,
  3996.0,
  4518.0,
  4005.0,
  3342.0,
  3517.0,
  3229.0,
  3800.0,
  4706.0,
  4612.0,
  4146.0,
  4000.0,
  4024.0,
  4065.0,
  3886.0,
  4681.0,
  4786.0,
  4430.0,
  3941.0,
  3680.0,
  3602.0,
  3502.0,
  4460.0,
  4688.0,
  4229.0,
  3461.0,
  3037.0,
  3188.0,
  3574.0,
  4488.0,
  4606.0,
  3984.0,
  3608.0,
  3216.0,
  3158.0,
  4143.0,
  4289.0,
  4595.0,
  4272.0,
  3658.0,
  3419.0,
  3163.0,
  3931.0,
  4674.0,
  

# Data Consistency Enforcement

In [6]:
dict_data_consolidated_to_save = {}
dict_data_consolidated_to_save["Data Category"] \
    = ["Station Type", "Start Station", "End Station", \
       "Number (data points)", "Mean (capped flow)", "Std Dev (capped flow)"]

for counter in list(dict_data_consolidated.keys()):
    dict_data_consolidated_to_save[counter] = []
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Station Type"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Start Station"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["End Station"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Number (data points)"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Mean (capped flow)"]]
    dict_data_consolidated_to_save[counter] += [dict_data_consolidated[counter]["Std Dev (capped flow)"]]

# df_dict_data_consolidated_to_save = pd.DataFrame(dict_data_consolidated_to_save)
# df_dict_data_consolidated_to_save.to_csv(directory_path + 'data_capped_consolidated.csv', index = False)



## Optimization Problem for Data Consistency Enforcement

In [7]:
## Define od_pairs_dict:

num_data = len(list(dict_data_consolidated.keys()))
# print(num_data)

# flow_data_consistent = np.float_(df_data_consistent.iloc[-1].tolist())

station_type_list = [dict_data_consolidated[index]["Station Type"] for index in range(num_data)]
origin_list = [0] + [index for index in range(num_data) if station_type_list[index] == "on"]
destination_list = [index for index in range(num_data) if station_type_list[index] == "off"] + [num_data - 1]
mainline_list = [index for index in range(num_data) if station_type_list[index] == "main"][1:-1]

print("origin_list:", origin_list)
print("destination_list:", destination_list)

counter = 0
od_pairs_dict = {}
for origin_index in origin_list:
    for destination_index in destination_list:
        if destination_index > origin_index:
            od_pairs_dict[counter] = [origin_index, destination_index]
            counter += 1
    
od_pairs_dict


origin_list: [0, 3, 7, 9, 14, 16, 18, 20, 23, 25, 29, 31, 33, 37, 40, 44, 47, 49, 51, 55]
destination_list: [1, 5, 11, 13, 22, 27, 35, 39, 42, 46, 53, 57, 59, 60]


{0: [0, 1],
 1: [0, 5],
 2: [0, 11],
 3: [0, 13],
 4: [0, 22],
 5: [0, 27],
 6: [0, 35],
 7: [0, 39],
 8: [0, 42],
 9: [0, 46],
 10: [0, 53],
 11: [0, 57],
 12: [0, 59],
 13: [0, 60],
 14: [3, 5],
 15: [3, 11],
 16: [3, 13],
 17: [3, 22],
 18: [3, 27],
 19: [3, 35],
 20: [3, 39],
 21: [3, 42],
 22: [3, 46],
 23: [3, 53],
 24: [3, 57],
 25: [3, 59],
 26: [3, 60],
 27: [7, 11],
 28: [7, 13],
 29: [7, 22],
 30: [7, 27],
 31: [7, 35],
 32: [7, 39],
 33: [7, 42],
 34: [7, 46],
 35: [7, 53],
 36: [7, 57],
 37: [7, 59],
 38: [7, 60],
 39: [9, 11],
 40: [9, 13],
 41: [9, 22],
 42: [9, 27],
 43: [9, 35],
 44: [9, 39],
 45: [9, 42],
 46: [9, 46],
 47: [9, 53],
 48: [9, 57],
 49: [9, 59],
 50: [9, 60],
 51: [14, 22],
 52: [14, 27],
 53: [14, 35],
 54: [14, 39],
 55: [14, 42],
 56: [14, 46],
 57: [14, 53],
 58: [14, 57],
 59: [14, 59],
 60: [14, 60],
 61: [16, 22],
 62: [16, 27],
 63: [16, 35],
 64: [16, 39],
 65: [16, 42],
 66: [16, 46],
 67: [16, 53],
 68: [16, 57],
 69: [16, 59],
 70: [16, 60],

In [8]:
## Define constraint_matrix:

num_od = len(od_pairs_dict)

constraint_matrix = np.zeros((num_data, num_od))
for i in range(num_data):
    if station_type_list[i] == "on" or i == 0:
        indices_set_to_one = [counter for counter in range(num_od) if od_pairs_dict[counter][0] == i]
        constraint_matrix[i, indices_set_to_one] = 1
    elif station_type_list[i] == "off" or i == num_data - 1:
        indices_set_to_one = [counter for counter in range(num_od) if od_pairs_dict[counter][1] == i]
        constraint_matrix[i, indices_set_to_one] = 1
    else:
        indices_set_to_one = [counter for counter in range(num_od) \
                              if od_pairs_dict[counter][0] < i and od_pairs_dict[counter][1] > i]
        constraint_matrix[i, indices_set_to_one] = 1
        
constraint_matrix


array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [9]:
## Optimization Problem:

# Variables:    
data_var = cp.Variable(num_data)
data_od = cp.Variable(num_od)

# Cost Function:
func = 0.0
data_mean = np.array([dict_data_consolidated[counter]["Mean (capped flow)"] for counter in range(num_data)])
kernel = np.diag([1/dict_data_consolidated[counter]["Mean (capped flow)"]**2 for counter in range(num_data)])

func += cp.quad_form(data_var - data_mean, kernel)

# Objective:
objective = cp.Minimize(func)

# Constraints:
constraints = []
constraints += [constraint_matrix @ data_od == data_var]
constraints += [data_od >= 0.0]

# Problem:
prob = cp.Problem(objective, constraints)

# Solve:
result = prob.solve()

# Extract Values:

data_var_values = data_var.value
data_od_values = data_od.value

print("data_var_values.shape:", data_var_values.shape)
print("data_var_values:", data_var_values)
print()
print("data_od_values.shape:", data_od_values.shape)
print("data_od_values:", data_od_values)


data_var_values.shape: (61,)
data_var_values: [4226.17436841  617.30880252 3608.8655702   779.57380444 4388.43937392
   92.22995581 4296.20942578  362.43774233 4658.64716709  239.93126614
 4898.57843015  631.52736231 4267.05106375  777.21749257 1011.07486517
 4500.90843559   57.92187328 4558.83030004  178.27876872 4737.10906797
  581.50123414 5318.61031007  469.11703415  427.16000281 5276.65328116
  231.75963012 5508.41291707  951.620059   4556.79285917  408.90459804
 4965.69746131  753.56253358 5719.26000281  199.07880958 5918.33880087
  425.39995148 5492.93885127  336.07782135 5829.01667314  585.80519942
  431.73798622 5674.94944916   81.62460783 5593.32484863   93.83574437
 5687.16055502  803.59321727  595.33317428 5478.90050484  448.88054962
 5927.78104486  583.03601669 6510.81704914  521.65298179 5989.16406015
  691.2273649  6680.39141559  590.96022151 6089.43117196  828.89970946
 5260.53145432]

data_od_values.shape: (166,)
data_od_values: [ 6.17308800e+02  9.22265850e+01  3.8590

In [10]:
print("Residual:", np.linalg.norm(constraint_matrix @ data_od.value - data_var.value))

Residual: 4.6581248510977225e-05


In [11]:
dict_data_consolidated_to_save["Data Category"] += ["Consistent Flow"]

for counter in list(dict_data_consolidated.keys()):
    print("data_var_value[counter]:", data_var_values[counter])
    dict_data_consolidated_to_save[counter].append(data_var_values[counter])

# print()
# print("dict_data_consolidated_to_save[10]:", dict_data_consolidated_to_save[10])
# print()
# print("dict_data_consolidated_to_save:", dict_data_consolidated_to_save)

# print()
# for counter in list(dict_data_consolidated.keys()):
#     print("counter")
#     print("len(list(dict_data_consolidated_to_save[counter])):", \
#           len(list(dict_data_consolidated_to_save[counter])))

# print("len(list(dict_data_consolidated_to_save[Data Category]))", \
#       len(list(dict_data_consolidated_to_save["Data Category"] )))



data_var_value[counter]: 4226.174368409473
data_var_value[counter]: 617.3088025176335
data_var_value[counter]: 3608.8655702022315
data_var_value[counter]: 779.5738044387366
data_var_value[counter]: 4388.439373919963
data_var_value[counter]: 92.22995581242077
data_var_value[counter]: 4296.209425784956
data_var_value[counter]: 362.43774233227487
data_var_value[counter]: 4658.647167086254
data_var_value[counter]: 239.93126613573207
data_var_value[counter]: 4898.578430154392
data_var_value[counter]: 631.527362307067
data_var_value[counter]: 4267.051063747431
data_var_value[counter]: 777.2174925664822
data_var_value[counter]: 1011.0748651668436
data_var_value[counter]: 4500.908435593845
data_var_value[counter]: 57.921873281925386
data_var_value[counter]: 4558.830300044743
data_var_value[counter]: 178.27876872096647
data_var_value[counter]: 4737.109067967182
data_var_value[counter]: 581.5012341410388
data_var_value[counter]: 5318.610310066242
data_var_value[counter]: 469.1170341451288
data_v

In [13]:
df_dict_data_consolidated_to_save = pd.DataFrame(dict_data_consolidated_to_save)
df_dict_data_consolidated_to_save.to_csv(directory_path + 'data_capped_consolidated.csv', index = False)


# Origin-Destination Pair Generation

In [14]:
# ## Optimization Problem:

# Variables:
num_od = len(od_pairs_dict)
data_od_max_entr = cp.Variable(num_od)

# Cost Function:
func = 0.0
for k in range(num_od):
    func += cp.entr(data_od_max_entr[k]) + data_od_max_entr[k]

# Objective:
objective = cp.Maximize(func)

# Constraints:
constraints = []
# constraints += [constraint_matrix @ data_od_max_entr == data_var.value]
constraints += [constraint_matrix @ data_od_max_entr == constraint_matrix @ data_od.value]
constraints += [data_od_max_entr >= 0.0]

# Problem:
prob = cp.Problem(objective, constraints)

# Solve:
result = prob.solve()

# Test feasibility:
if prob.status == "infeasible":
    print("Problem is infeasible") 
else:
    print("Problem is feasible") 
print()

# Extract Values:

data_od_max_entr_values = data_od_max_entr.value
print("data_od_max_entr_values.shape:", data_od_max_entr_values.shape)
print("data_od_max_entr_values:", data_od_max_entr_values)


Problem is feasible

data_od_max_entr_values.shape: (166,)
data_od_max_entr_values: [6.17308800e+02 7.58459159e+01 4.55479211e+02 5.60552178e+02
 2.22006206e+02 3.96487175e+02 1.36462244e+02 1.77083308e+02
 2.27991432e+01 2.20730718e+02 1.07471254e+02 1.09152024e+02
 1.53110051e+02 9.71686130e+02 1.63840353e+01 9.83896266e+01
 1.21088987e+02 4.79578996e+01 8.56431712e+01 2.94781220e+01
 3.82530238e+01 4.92456206e+00 4.76826931e+01 2.32172517e+01
 2.35802622e+01 3.30744044e+01 2.09899763e+02 4.67259719e+01
 5.75074274e+01 2.27747378e+01 4.06726139e+01 1.39987730e+01
 1.81660786e+01 2.33808620e+00 2.26444479e+01 1.10255598e+01
 1.11978958e+01 1.57066539e+01 9.96794936e+01 3.09325552e+01
 3.80689019e+01 1.50765292e+01 2.69248591e+01 9.26698959e+00
 1.20256857e+01 1.54720642e+00 1.49902817e+01 7.29876822e+00
 7.41285147e+00 1.03975593e+01 6.59890811e+01 8.91778778e+01
 1.59258341e+02 5.48161818e+01 7.11327447e+01 9.15814122e+00
 8.86678560e+01 4.31740634e+01 4.38492528e+01 6.15033074e+01
 

In [19]:
## Store data in pandas:
# Start counter
# End counter
# Start station
# End station
# O-D flow

dict_od_data_to_save = {}
dict_od_data_to_save["Data Category"] \
    = ["Start Index", "End Index", "Start Station", "End Station", \
       "O-D Flow (Initialization)", "O-D Flow (Max Entropy)"]

for index in range(num_od):
    print("index:", index)
    dict_od_data_to_save[index] = []
    
    start_counter = od_pairs_dict[index][0]
    end_counter = od_pairs_dict[index][1]
    print("start_counter:", start_counter)
    print("end_counter:", end_counter)
    print()

    dict_od_data_to_save[index] += [start_counter]
    dict_od_data_to_save[index] += [end_counter]
    dict_od_data_to_save[index] += [dict_data_consolidated[start_counter]["Start Station"]]
    dict_od_data_to_save[index] += [dict_data_consolidated[end_counter]["End Station"]]
    dict_od_data_to_save[index] += [max(data_od_values[index], 0.0) ]
    dict_od_data_to_save[index] += [max(data_od_max_entr_values[index], 0.0)]

dict_od_data_to_save

index: 0
start_counter: 0
end_counter: 1

index: 1
start_counter: 0
end_counter: 5

index: 2
start_counter: 0
end_counter: 11

index: 3
start_counter: 0
end_counter: 13

index: 4
start_counter: 0
end_counter: 22

index: 5
start_counter: 0
end_counter: 27

index: 6
start_counter: 0
end_counter: 35

index: 7
start_counter: 0
end_counter: 39

index: 8
start_counter: 0
end_counter: 42

index: 9
start_counter: 0
end_counter: 46

index: 10
start_counter: 0
end_counter: 53

index: 11
start_counter: 0
end_counter: 57

index: 12
start_counter: 0
end_counter: 59

index: 13
start_counter: 0
end_counter: 60

index: 14
start_counter: 3
end_counter: 5

index: 15
start_counter: 3
end_counter: 11

index: 16
start_counter: 3
end_counter: 13

index: 17
start_counter: 3
end_counter: 22

index: 18
start_counter: 3
end_counter: 27

index: 19
start_counter: 3
end_counter: 35

index: 20
start_counter: 3
end_counter: 39

index: 21
start_counter: 3
end_counter: 42

index: 22
start_counter: 3
end_counter: 46

i

{'Data Category': ['Start Index',
  'End Index',
  'Start Station',
  'End Station',
  'O-D Flow (Initialization)',
  'O-D Flow (Max Entropy)'],
 0: [0,
  1,
  '001___402376_Palo_Alto___main',
  '004___425696_Palo_Alto___off',
  617.3087999272502,
  617.3087999272502],
 1: [0,
  5,
  '001___402376_Palo_Alto___main',
  '010___403059_East_Palo_Alto___off',
  92.22658501802857,
  75.84591591466587],
 2: [0,
  11,
  '001___402376_Palo_Alto___main',
  '027___403206_Redwood_City___off',
  385.905663056197,
  455.4792105872506],
 3: [0,
  13,
  '001___402376_Palo_Alto___main',
  '032___410094_Redwood_City___off',
  432.53377859521345,
  560.5521775060843],
 4: [0,
  22,
  '001___402376_Palo_Alto___main',
  '045___410111_Redwood_City___off',
  239.6434763535406,
  222.00620622582167],
 5: [0,
  27,
  '001___402376_Palo_Alto___main',
  '051___409888_San_Mateo___off',
  286.1078537610307,
  396.48717463724034],
 6: [0,
  35,
  '001___402376_Palo_Alto___main',
  '068___405845_San_Mateo___off',
  

In [20]:
df_od_data_to_save = pd.DataFrame(dict_od_data_to_save)
df_od_data_to_save.to_csv(directory_path + 'data_od.csv', index = False)


In [21]:
print("Residual:", np.linalg.norm(constraint_matrix @ data_od_max_entr.value - data_var.value))


Residual: 0.00011043629928761559


# Scratch Work

In [None]:
ell = [1, 2, 3, 4, 5]
# ell[1:-1]
ell += [6]
ell