In [1]:
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import matplotlib.axes as axe
import pandas as pd
import datetime as dt
import gurobipy as gp
from gurobipy import GRB
import cvxpy as cp
import yaml

import random
from itertools import chain, combinations, tee
import time

import os


# Import O-D Flow Data + Income Data:

In [None]:
str_1 = "027___403206_Redwood_City___off"
"Redwood_City" in str_1

In [23]:
def station_to_city(station_name):
    if "Palo_Alto" in station_name:
        city_name = "Palo Alto"
    elif "East_Palo_Alto" in station_name:
        city_name = "East Palo Alto"
    elif "Menlo_Park" in station_name:
        city_name = "Menlo Park"
    elif "Redwood_City" in station_name:
        city_name = "Redwood City"
    elif "Belmont" in station_name:
        city_name = "Belmont"
    elif "San_Mateo" in station_name:
        city_name = "San Mateo"
    elif "Burlingame" in station_name:
        city_name = "Burlingame"
    elif "Millbrae" in station_name:
        city_name = "Millbrae"
    else:
        assert 1 == 0, "There should be no other case."
    return city_name

def income_range_avg(income_range_str):
    if income_range == "< $15,000":
        mean_income = 15000/2

    elif income_range == "$15,000 to $35,000":
        mean_income = (15000 + 35000)/2

    elif income_range == "$35,000 to $100,000":
        mean_income = (35000 + 100000)/2

    elif income_range == "$100,000 to $200,000":
        mean_income = (100000 + 200000)/2
        
    # TODO: Add the other cases.

    else:
        assert 1 == 0, "This case should not occur."
    
    
    return mean_income

In [24]:
directory_path = '../data/pems___101_N_Sep_2024/'
df_od_flow_data = pd.read_csv(directory_path + 'data_od.csv')

dict_df_od_flow_data = {}

for column_name_full in list(df_od_flow_data.columns):
    if column_name_full == "Data Category":
        categories_list = df_od_flow_data[column_name_full].tolist()
    else:
        dict_df_od_flow_data[int(column_name_full)] = {}
        for category_index, category in enumerate(categories_list):
            if category == "Start Index" or category == "End Index":
                dict_df_od_flow_data[int(column_name_full)][category] \
                    = int(df_od_flow_data[column_name_full].tolist()[category_index])
            elif category == "O-D Flow (Initialization)" or category == "O-D Flow (Max Entropy)":
                dict_df_od_flow_data[int(column_name_full)][category] \
                    = float(df_od_flow_data[column_name_full].tolist()[category_index])
            else:
                dict_df_od_flow_data[int(column_name_full)][category] \
                    = df_od_flow_data[column_name_full].tolist()[category_index]
        
        dict_df_od_flow_data[int(column_name_full)]["Start City"] \
            = station_to_city(dict_df_od_flow_data[int(column_name_full)]["Start Station"])
        dict_df_od_flow_data[int(column_name_full)]["End City"] \
            = station_to_city(dict_df_od_flow_data[int(column_name_full)]["End Station"])
        
dict_df_od_flow_data
# dict_df_od_flow_data[0]["O-D Flow (Initialization)"]


{0: {'Start Index': 0,
  'End Index': 1,
  'Start Station': '001___402376_Palo_Alto___main',
  'End Station': '004___425696_Palo_Alto___off',
  'O-D Flow (Initialization)': 617.3087999,
  'O-D Flow (Max Entropy)': 617.3087999,
  'Start City': 'Palo Alto',
  'End City': 'Palo Alto'},
 1: {'Start Index': 0,
  'End Index': 5,
  'Start Station': '001___402376_Palo_Alto___main',
  'End Station': '010___403059_East_Palo_Alto___off',
  'O-D Flow (Initialization)': 92.22658502,
  'O-D Flow (Max Entropy)': 75.84591591,
  'Start City': 'Palo Alto',
  'End City': 'Palo Alto'},
 2: {'Start Index': 0,
  'End Index': 11,
  'Start Station': '001___402376_Palo_Alto___main',
  'End Station': '027___403206_Redwood_City___off',
  'O-D Flow (Initialization)': 385.9056631,
  'O-D Flow (Max Entropy)': 455.4792106,
  'Start City': 'Palo Alto',
  'End City': 'Redwood City'},
 3: {'Start Index': 0,
  'End Index': 13,
  'Start Station': '001___402376_Palo_Alto___main',
  'End Station': '032___410094_Redwood_Cit

In [25]:
directory_path = '../data/data_ACS_income/'
df_income_data = pd.read_csv(directory_path + 'data_income_raw___1_year_2023.csv')

dict_df_income_data = {}

for column_name_full in list(df_income_data.columns):
    if column_name_full == "Income":
        categories_list = df_income_data[column_name_full].tolist()
    else:
        dict_df_income_data[column_name_full] = {}
        for category_index, category in enumerate(categories_list):
            dict_df_income_data[column_name_full][category] \
                = float(df_income_data[column_name_full].tolist()[category_index])
                
dict_df_income_data


{'Palo Alto': {'< $10,000': 4.7,
  '$10,000 to $14,999': 3.1,
  '$15,000 to $24,999': 2.0,
  '$25,000 to $34,999': 2.5,
  '$35,000 to $49,999': 3.5,
  '$50,000 to $74,999': 7.8,
  '$75,000 to $99,999': 7.5,
  '$100,000 to $149,999': 13.2,
  '$150,000 to $199,999': 8.8,
  '>= $200,000': 46.9,
  'Median': 184068.0,
  'Mean': 264292.0},
 'East Palo Alto': {'< $10,000': 3.6,
  '$10,000 to $14,999': 2.3,
  '$15,000 to $24,999': 3.9,
  '$25,000 to $34,999': 4.9,
  '$35,000 to $49,999': 8.8,
  '$50,000 to $74,999': 14.9,
  '$75,000 to $99,999': 9.5,
  '$100,000 to $149,999': 19.3,
  '$150,000 to $199,999': 9.7,
  '>= $200,000': 23.1,
  'Median': 104832.0,
  'Mean': 138379.0},
 'Menlo Park': {'< $10,000': 1.9,
  '$10,000 to $14,999': 0.3,
  '$15,000 to $24,999': 2.2,
  '$25,000 to $34,999': 2.7,
  '$35,000 to $49,999': 5.0,
  '$50,000 to $74,999': 6.2,
  '$75,000 to $99,999': 5.6,
  '$100,000 to $149,999': 13.4,
  '$150,000 to $199,999': 9.0,
  '>= $200,000': 53.6,
  'Median': 206588.0,
  'Mea

## Case 1: (3 eligible groups, 2 ineligible groups)

In [26]:
city_names_list = list(dict_df_income_data.keys())

dict_income_groups = {}
income_range_list = ["< $15,000", "$15,000 to $35,000", "$35,000 to $100,000", \
                    "$100,000 to $200,000", ">= $200,000"]

for city_name in city_names_list:
    dict_income_groups[city_name] = {}
    
    for income_range in income_range_list:
        dict_income_groups[city_name][income_range] = np.zeros(2)
        
        if income_range == "< $15,000":
            percent_of_population = dict_df_income_data[city_name]["< $10,000"] \
                + dict_df_income_data[city_name]["$10,000 to $14,999"]
            mean_income = income_range_avg(income_range)
            dict_income_groups[city_name][income_range] = [mean_income, percent_of_population]
            
        elif income_range == "$15,000 to $35,000":
            percent_of_population = dict_df_income_data[city_name]["$15,000 to $24,999"] \
                + dict_df_income_data[city_name]["$25,000 to $34,999"]
            mean_income = income_range_avg(income_range)
            dict_income_groups[city_name][income_range] = [mean_income, percent_of_population]
            
        elif income_range == "$35,000 to $100,000":
            percent_of_population = dict_df_income_data[city_name]["$35,000 to $49,999"] \
                + dict_df_income_data[city_name]["$50,000 to $74,999"] \
                + dict_df_income_data[city_name]["$75,000 to $99,999"]
            mean_income = income_range_avg(income_range)
            dict_income_groups[city_name][income_range] = [mean_income, percent_of_population]
            
        elif income_range == "$100,000 to $200,000":
            percent_of_population = dict_df_income_data[city_name]["$100,000 to $149,999"] \
                + dict_df_income_data[city_name]["$150,000 to $199,999"]
            mean_income = income_range_avg(income_range)
            dict_income_groups[city_name][income_range] = [mean_income, percent_of_population]
            
        elif income_range == ">= $200,000":
            percent_of_population = dict_df_income_data[city_name][">= $200,000"]          
            dict_income_groups[city_name][income_range] = [0, percent_of_population]
        
        else:
            assert 1 == 0, "This case should not occur."
    
    if income_range == ">= $200,000":
        sum_of_income_mean_percent_products \
            = sum([dict_income_groups[city_name][income_range][0] * dict_income_groups[city_name][income_range][1] \
                   for income_range in income_range_list if income_range != ">= 200,000"])
        mean_income = (dict_df_income_data[city_name]["Mean"] * 100 - sum_of_income_mean_percent_products) \
            / dict_income_groups[city_name][">= $200,000"][1]
        dict_income_groups[city_name][income_range][0] = mean_income

dict_income_groups


{'Palo Alto': {'< $15,000': [7500.0, 7.800000000000001],
  '$15,000 to $35,000': [25000.0, 4.5],
  '$35,000 to $100,000': [67500.0, 18.8],
  '$100,000 to $200,000': [150000.0, 22.0],
  '>= $200,000': [462456.28997867805, 46.9]},
 'East Palo Alto': {'< $15,000': [7500.0, 5.9],
  '$15,000 to $35,000': [25000.0, 8.8],
  '$35,000 to $100,000': [67500.0, 33.2],
  '$100,000 to $200,000': [150000.0, 29.0],
  '>= $200,000': [302279.22077922075, 23.1]},
 'Menlo Park': {'< $15,000': [7500.0, 2.1999999999999997],
  '$15,000 to $35,000': [25000.0, 4.9],
  '$35,000 to $100,000': [67500.0, 16.799999999999997],
  '$100,000 to $200,000': [150000.0, 22.4],
  '>= $200,000': [546800.3731343284, 53.6]},
 'Redwood City': {'< $15,000': [7500.0, 5.2],
  '$15,000 to $35,000': [25000.0, 6.300000000000001],
  '$35,000 to $100,000': [67500.0, 21.2],
  '$100,000 to $200,000': [150000.0, 25.5],
  '>= $200,000': [412454.54545454547, 41.8]},
 'Belmont': {'< $15,000': [7500.0, 4.699999999999999],
  '$15,000 to $35,00

# <font color='red'>From previous code:</font>

# Data Capping

In [None]:
start_time = time.time()

directory_path = '../data/pems___101_N_Sep_2024/'
file_name_list = [f for f in os.listdir(directory_path) \
                  if os.path.isfile(os.path.join(directory_path, f)) and f[-1] == 'x' and f[0] != "~"]
file_name_list.sort()

thresh_lower = 0.1

# df = pd.read_excel(directory_path + file_name)
df_data_filtered = pd.read_csv(directory_path + 'data_filtered.csv', index_col=0)

dict_data_capped = {}
# dict_data_capped["Data Category"] = \
#     ["Station Type", "Number (data points)", "Mean (capped flow)", "Std Dev (capped flow)"]

times_dict = {}
for column_name_full in list(df_data_filtered.columns):
    print("column_name_full:", column_name_full)
    
    vals_list_str = df_data_filtered[column_name_full].tolist()
    
#     print("vals_list_str:", vals_list_str)

    column_name = column_name_full[:-5]
    
    dict_data_capped[column_name] = {}
    dict_data_capped[column_name]["Station Type"] = vals_list_str[0]
    
    vals_list = [float(val) for val in vals_list_str[1:]]
    max_val = max(vals_list)
    dict_data_capped[column_name]["Flows (capped)"] = \
        [val for val in vals_list if val != 0.0 and val >= max_val * thresh_lower]
    
    
#     print()
#     print("vals_list:", vals_list)
#     print()
#     print("vals_array_capped:", vals_array_capped)
    
#     dict_data_capped[column_name] += [vals_array_capped.shape[0]]
#     dict_data_capped[column_name] += [np.mean(vals_array_capped)]
#     dict_data_capped[column_name] += [np.std(vals_array_capped)]

    
    
#     print("dict_data_capped[column_name]:", dict_data_capped[column_name])
#     print()


end_time = time.time()

print()
print("Time:", end_time - start_time)



# Data Consolidation

# Scratch Work

In [None]:
ell = [1, 2, 3, 4, 5]
# ell[1:-1]
ell += [6]
ell