In [None]:
!pip install pandas
!pip install numpy
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install gdown




In [1]:

# Import portion of a package
import matplotlib.pyplot as plt  # Most common visualization package that a lot of others are based on

# Import full packages under custom name
import numpy as np  # Common package for numerical methods
import pandas as pd  # Common package for data storeage/manipulation
import seaborn as sns  # Common package for statistical visualizations

# Import portion of a package
import scipy.stats as stats
from sklearn.impute import SimpleImputer as Imputer  # Specific function from common machine learning package\

#more packages
import gdown


In [2]:


# File ID from your link (assuming this is the cleaned dataset)
file_id = "13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r"
url = f"https://drive.google.com/uc?id={file_id}"

# Download the file
gdown.download(url, "flight_data.csv", quiet=False)

# Now load it
flight_data = pd.read_csv("flight_data.csv")
print(flight_data.shape)
print(flight_data.head())

Downloading...
From (original): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r
From (redirected): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r&confirm=t&uuid=73690511-4d44-4b60-8590-c965ca1c16a0
To: /content/flight_data.csv
100%|██████████| 1.32G/1.32G [00:15<00:00, 84.8MB/s]


(7546988, 32)
   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_

In [3]:

df = flight_data.copy()
print (df.head())
print(df.columns.tolist())

   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_ELAPSED_TIME  

In [None]:
"""
minimize sum of (epsilon?) (frequency * average * passengers)
st.
    ...

optimization problem in words
- frequency = probability of flight being delayed
- average = expected delay time (in minutes)
- passengers = number of passengers taking that flight
- possible constraints for passengers:
  - arrival/depaarture time being between certain times
  - flight duration being shorter than x hours
  - groups of passengers staying together (ie families)
  - max number of extra seats on flights
  - need to get to destination by a certain time
  - airline?
  - any connecting flights must have at least a 1 hour buffer for connecting purposes


info needed about the set of flights:
- number of seats available (this is a made up number)
- frequency
- average
- arrival/departure time/day/month/year
- flight duration
- destination/origin airport/city
- airline

"""

In [4]:
"""
calculating average and frequency of delay for each flight with a different origin-destination city pair

"""
# Boolean series: True if there is a delay
def compute_delay_stats(group):
    delayed = group['DEP_DELAY_NEW'] > 0

    # Probability of delay
    prob_delay = delayed.sum() / len(group)

    # Average delay (only for delayed rows)
    if delayed.sum() > 0:
        avg_delay = group.loc[delayed, 'DEP_DELAY_NEW'].mean()
    else:
        avg_delay = 0.0

    return pd.Series({'freq_delay': prob_delay, 'avg_delay': avg_delay})

# Group by origin and destination airport to calculate delay stats
delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()

# Display
print(delay_stats.head())



   ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID  freq_delay  avg_delay
0              10135            10397    0.259587  51.750000
1              10135            10693    0.247619  45.307692
2              10135            11057    0.282161  76.173252
3              10135            11292    0.255814  51.181818
4              10135            11697    0.283951  38.652174


  delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()


In [5]:
"""

for the optimization problem, assuming starting point is LA. to get a network of flights, took the top 5
most frequent destinations from LA, from those 5 took the top 5, and from those took the top 2 to get a
solid variety of cities.

"""
#doing a network of flights instead of top 20
start_airport = 12892  # LA

# --- First leg: LA -> top 5 destinations ---
from_LA = df[df['ORIGIN_AIRPORT_ID'] == start_airport]
top5_first_leg = from_LA['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
print("Top 5 destinations from LA:", top5_first_leg)

# --- Second leg: from each first leg destination, top 5 destinations (excluding LA) ---
second_leg = {}
second_leg_airports = set()
for origin in top5_first_leg:
    flights_from_origin = df[df['ORIGIN_AIRPORT_ID'] == origin]
    top5 = flights_from_origin[flights_from_origin['DEST_AIRPORT_ID'] != start_airport] \
            ['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
    second_leg[origin] = top5
    second_leg_airports.update(top5)

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} -> {dests}")

# --- Third leg: from second leg destinations, top 2 destinations (excluding first leg, second leg, and return) ---
third_leg = {}
first_leg_set = set(top5_first_leg)

for origin, dests in second_leg.items():
    for dest in dests:
        flights_from_dest = df[df['ORIGIN_AIRPORT_ID'] == dest]

        # exclude first leg airports, all second leg airports, the immediate origin, AND Los Angeles
        exclude_set = first_leg_set | second_leg_airports | {dest, start_airport}

        # pick top 2 *after* removing Los Angeles
        top2 = (
            flights_from_dest[
                ~flights_from_dest['DEST_AIRPORT_ID'].isin(exclude_set)
            ]['DEST_AIRPORT_ID']
            .value_counts()
            .head(2)
            .index
            .tolist()
        )

        third_leg[dest] = top2

print("\nThird leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):")
for origin, dests in third_leg.items():
    print(f"{origin} -> {dests}")


Top 5 destinations from LA: [14771, 12889, 12478, 11292, 14747]

Second leg top 5 destinations from each first leg destination:
14771 -> [14747, 12478, 14679, 12889, 11292]
12889 -> [11292, 14107, 14679, 14747, 11298]
12478 -> [14771, 10721, 13303, 13204, 14492]
11292 -> [14107, 12889, 14869, 14747, 13930]
14747 -> [10299, 14057, 14107, 11292, 12889]

Third leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):
14747 -> [11884, 10713]
12478 -> [11697, 14843]
14679 -> [14831, 14893]
12889 -> [14893, 10397]
11292 -> [13487, 10397]
14107 -> [14908, 10397]
11298 -> [12266, 10397]
14771 -> [14908, 11618]
10721 -> [11278, 12953]
13303 -> [10397, 12953]
13204 -> [10397, 11618]
14492 -> [10397, 12953]
14869 -> [10397, 12266]
13930 -> [12953, 11278]
10299 -> [11630, 12523]
14057 -> [14831, 14893]


In [6]:
"""
converts all the airport IDs from the previous step to cities
"""

airport_to_city = (
    df[['ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME']]
    .drop_duplicates()
    .assign(ORIGIN_CITY_NAME=lambda x: x['ORIGIN_CITY_NAME'].str.split(',').str[0])
    .set_index('ORIGIN_AIRPORT_ID')['ORIGIN_CITY_NAME']
    .to_dict()
)


def id_to_city(airport_id):
    return airport_to_city.get(airport_id, f"Unknown({airport_id})")

def list_to_cities(id_list):
    return [id_to_city(i) for i in id_list]

print("Top 5 destinations from LA:")
for a in top5_first_leg:
    print(a, "->", id_to_city(a))

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")

print("\nThird leg top 2 destinations from second leg:")
for origin, dests in third_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")


Top 5 destinations from LA:
14771 -> San Francisco
12889 -> Las Vegas
12478 -> New York
11292 -> Denver
14747 -> Seattle

Second leg top 5 destinations from each first leg destination:
14771 (San Francisco) -> ['Seattle', 'New York', 'San Diego', 'Las Vegas', 'Denver']
12889 (Las Vegas) -> ['Denver', 'Phoenix', 'San Diego', 'Seattle', 'Dallas/Fort Worth']
12478 (New York) -> ['San Francisco', 'Boston', 'Miami', 'Orlando', 'Raleigh/Durham']
11292 (Denver) -> ['Phoenix', 'Las Vegas', 'Salt Lake City', 'Seattle', 'Chicago']
14747 (Seattle) -> ['Anchorage', 'Portland', 'Phoenix', 'Denver', 'Las Vegas']

Third leg top 2 destinations from second leg:
14747 (Seattle) -> ['Spokane', 'Boise']
12478 (New York) -> ['Fort Lauderdale', 'San Juan']
14679 (San Diego) -> ['San Jose', 'Sacramento']
12889 (Las Vegas) -> ['Sacramento', 'Atlanta']
11292 (Denver) -> ['Minneapolis', 'Atlanta']
14107 (Phoenix) -> ['Santa Ana', 'Atlanta']
11298 (Dallas/Fort Worth) -> ['Houston', 'Atlanta']
14771 (San Francisc

In [7]:
"""
this section lists all possible destinations for the optimization problem
"""
# Collect all city names from each leg
possible_destinations = set()

# --- First leg ---
for a in top5_first_leg:
    possible_destinations.add(id_to_city(a))

# --- Second leg ---
for origin, dests in second_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# --- Third leg ---
for origin, dests in third_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# Convert airport IDs in second/third leg to city names where needed
possible_destinations = {id_to_city(c) if isinstance(c, int) else c
                         for c in possible_destinations}

# Turn into a sorted list
possible_destinations = sorted(possible_destinations)

print(possible_destinations)
print("Number of unique cities:", len(possible_destinations))

['Anchorage', 'Atlanta', 'Boise', 'Boston', 'Chicago', 'Dallas/Fort Worth', 'Denver', 'Fairbanks', 'Fort Lauderdale', 'Houston', 'Juneau', 'Las Vegas', 'Miami', 'Minneapolis', 'New York', 'Newark', 'Orlando', 'Phoenix', 'Portland', 'Raleigh/Durham', 'Sacramento', 'Salt Lake City', 'San Diego', 'San Francisco', 'San Jose', 'San Juan', 'Santa Ana', 'Seattle', 'Spokane', 'Washington']
Number of unique cities: 30


In [8]:
"""
the output of this is turning all the "legs" into pairs of origin-destination cities
"""


# --- Gather all origin-destination pairs from 3 legs ---
# First leg pairs
first_leg_pairs = [(id_to_city(start_airport), id_to_city(dest)) for dest in top5_first_leg]

# Second leg pairs
second_leg_pairs = []
for origin_airport, dest_airports in second_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        second_leg_pairs.append((origin_city, dest_city))

# Third leg pairs
third_leg_pairs = []
for origin_airport, dest_airports in third_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        third_leg_pairs.append((origin_city, dest_city))

# Combine all pairs
all_pairs = set(first_leg_pairs + second_leg_pairs + third_leg_pairs)

print(all_pairs)
print (len(all_pairs))


{('New York', 'San Francisco'), ('Los Angeles', 'San Francisco'), ('Denver', 'Seattle'), ('Seattle', 'Boise'), ('Denver', 'Atlanta'), ('Miami', 'New York'), ('New York', 'Miami'), ('Seattle', 'Denver'), ('Orlando', 'Newark'), ('Seattle', 'Anchorage'), ('Las Vegas', 'Seattle'), ('Las Vegas', 'Atlanta'), ('Orlando', 'Atlanta'), ('San Francisco', 'San Diego'), ('New York', 'Orlando'), ('Los Angeles', 'Seattle'), ('Seattle', 'Spokane'), ('San Francisco', 'Las Vegas'), ('Raleigh/Durham', 'Atlanta'), ('Dallas/Fort Worth', 'Houston'), ('Las Vegas', 'Dallas/Fort Worth'), ('Portland', 'San Jose'), ('Raleigh/Durham', 'New York'), ('Las Vegas', 'Denver'), ('Los Angeles', 'Denver'), ('Los Angeles', 'New York'), ('Seattle', 'Phoenix'), ('Seattle', 'Portland'), ('Chicago', 'Washington'), ('San Diego', 'San Jose'), ('Denver', 'Minneapolis'), ('Denver', 'Phoenix'), ('Seattle', 'Las Vegas'), ('Denver', 'Chicago'), ('Boston', 'Washington'), ('Salt Lake City', 'Houston'), ('San Francisco', 'Newark'), ('D

In [9]:
"""
this part shows all feasible routes starting from LA to each destination in the all pairs network
the output of this is a dictionary of city: possible routes from LA to that city
"""

from collections import defaultdict, deque

# Create adjacency list from all_pairs
graph = defaultdict(list)
for origin, dest in all_pairs:
    graph[origin].append(dest)

start_city = "Los Angeles"

# Dictionary to store feasible routes: destination -> set of tuples (routes)
feasible_routes = defaultdict(set)

# BFS-like traversal for up to 3 legs (direct + 2 connections)
max_legs = 3
queue = deque()
queue.append( ([start_city], 0) )  # (current_route_list, current_depth)

while queue:
    route, depth = queue.popleft()
    current_city = route[-1]

    if depth > 0:  # exclude the starting city itself
        dest = current_city
        feasible_routes[dest].add(tuple(route))

    if depth < max_legs:
        for neighbor in graph[current_city]:
            # avoid cycles in the route
            if neighbor not in route:
                queue.append((route + [neighbor], depth + 1))

# Example: print feasible routes to Seattle
for dest, routes in feasible_routes.items():
    print(dest, routes)


San Francisco {('Los Angeles', 'New York', 'San Francisco'), ('Los Angeles', 'San Francisco')}
Seattle {('Los Angeles', 'Seattle'), ('Los Angeles', 'Denver', 'Seattle'), ('Los Angeles', 'San Francisco', 'Seattle'), ('Los Angeles', 'Las Vegas', 'Seattle'), ('Los Angeles', 'San Francisco', 'Denver', 'Seattle'), ('Los Angeles', 'New York', 'San Francisco', 'Seattle'), ('Los Angeles', 'Denver', 'Las Vegas', 'Seattle'), ('Los Angeles', 'San Francisco', 'Las Vegas', 'Seattle'), ('Los Angeles', 'Las Vegas', 'Denver', 'Seattle')}
Denver {('Los Angeles', 'San Francisco', 'Denver'), ('Los Angeles', 'New York', 'San Francisco', 'Denver'), ('Los Angeles', 'San Francisco', 'Seattle', 'Denver'), ('Los Angeles', 'Seattle', 'Denver'), ('Los Angeles', 'Seattle', 'Las Vegas', 'Denver'), ('Los Angeles', 'San Francisco', 'Las Vegas', 'Denver'), ('Los Angeles', 'Denver'), ('Los Angeles', 'Las Vegas', 'Denver'), ('Los Angeles', 'Las Vegas', 'Seattle', 'Denver')}
New York {('Los Angeles', 'San Francisco', 'N

In [10]:
"""

the output of this is turning all the "legs" into pairs of origin-destination cities, but using airport ids
for future steps like filtering the original dataframe and generating statistics for each pair
"""


# --- Gather all origin-destination pairs from 3 legs ---

# First leg pairs
first_leg_pairs_id = [(start_airport, dest) for dest in top5_first_leg]

# Second leg pairs
second_leg_pairs_id = []
for origin_airport, dest_airports in second_leg.items():
    for dest_airport in dest_airports:
        second_leg_pairs_id.append((origin_airport, dest_airport))

# Third leg pairs
third_leg_pairs_id = []
for origin_airport, dest_airports in third_leg.items():
    for dest_airport in dest_airports:
        third_leg_pairs_id.append((origin_airport, dest_airport))

# Combine all pairs
all_pairs_id = set(first_leg_pairs_id + second_leg_pairs_id + third_leg_pairs_id)

print(all_pairs_id)
print(len(all_pairs_id))


{(12889, 14893), (14771, 14908), (14747, 11884), (14679, 14831), (12892, 11292), (14747, 11292), (14771, 14679), (14869, 10397), (12889, 14679), (12478, 14843), (11292, 14107), (12478, 14492), (11292, 13930), (10299, 11630), (12478, 10721), (14771, 11292), (11292, 10397), (12892, 12478), (12889, 11292), (12478, 11697), (14747, 14107), (14107, 10397), (12889, 11298), (14771, 11618), (14492, 10397), (12478, 13204), (14057, 14893), (11292, 14747), (11292, 13487), (13204, 11618), (14869, 12266), (11292, 12889), (11292, 14869), (14771, 12478), (10299, 12523), (14747, 10299), (11298, 10397), (12892, 12889), (14747, 12889), (12892, 14747), (14492, 12953), (13204, 10397), (12889, 14107), (13303, 10397), (12889, 10397), (14679, 14893), (12892, 14771), (14057, 14831), (14747, 14057), (12478, 14771), (14771, 14747), (14107, 14908), (14771, 12889), (14747, 10713), (10721, 12953), (10721, 11278), (13303, 12953), (13930, 12953), (12889, 14747), (13930, 11278), (11298, 12266), (12478, 13303)}
62


In [11]:

"""

this filters the full dataset to only the feasible flight pairs from the 3 legs

"""

# Mask for any pair in all_pairs (airport IDs)
mask = df[['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']].apply(
    lambda row: (row['ORIGIN_AIRPORT_ID'], row['DEST_AIRPORT_ID']) in all_pairs_id, axis=1
)

# Filtered DataFrame
filtered_df = df[mask].drop_duplicates()

print(f"Number of flights after filtering: {len(filtered_df)}")
print(filtered_df.head())

Number of flights after filtering: 383598
      YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
1118  2024        1      1             1            1                   19393   
1119  2024        1      1             1            1                   19393   
1120  2024        1      1             1            1                   19393   
1121  2024        1      1             1            1                   19393   
1122  2024        1      1             1            1                   19393   

      ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  \
1118              11292                1129202       Denver, CO   
1119              11292                1129202       Denver, CO   
1120              11292                1129202       Denver, CO   
1121              11292                1129202       Denver, CO   
1122              11292                1129202       Denver, CO   

      DEST_AIRPORT_ID  ...  CANCELLATION_CODE CRS_ELAPSED_TIME  \
11

In [12]:
"""
for each of the origin-city pairs, gets all of the flight stats
"""

#making set of top 20 flights

# Assuming the dataframe is already loaded
# Group by 'ORIGIN_CITY_NAME' and 'DEST_AIRPORT_ID' and count the occurrences
route_stats = filtered_df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).agg(
    origin_city=('ORIGIN_CITY_NAME', 'first'),
    destination_city=('DEST_CITY_NAME', 'first'),
    count=('ORIGIN_CITY_NAME', 'size'),
    avg_duration=('ACTUAL_ELAPSED_TIME', 'mean'),
    most_common_departure=('CRS_DEP_TIME', lambda x: x.mode()[0] if not x.mode().empty else np.nan), #arrival time will be calculated by adding elapsed time to arrival
    most_common_airline=('MKT_CARRIER_AIRLINE_ID', lambda x: x.mode()[0] if not x.mode().empty else np.nan)
).reset_index()

# Add a "made-up" number of seats available (for now I will assume random values, e.g. between 100 and 300)
route_stats['seats_available'] = np.random.randint(1, 6, size=len(route_stats))

# Merge with your top_20_routes dataframe
route_stats = route_stats.merge(delay_stats, on=['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID'], how='left')

route_stats['origin_city'] = route_stats['origin_city'].str.split(',').str[0] #extracting city from city,state
route_stats['destination_city'] = route_stats['destination_city'].str.split(',').str[0]


#calculating the most common city to use as starting point for the group of passengers
origin_counts = route_stats['ORIGIN_AIRPORT_ID'].astype(int).value_counts()
most_common_origin_id = origin_counts.idxmax()

print("Most common origin airport ID:", most_common_origin_id) # it is 12892 (freq of 5) which is LA

# Display the top 20 routes with all the requested details
print(route_stats[['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'origin_city', 'destination_city', 'avg_duration', 'most_common_departure',
                       'most_common_airline','seats_available', 'freq_delay', 'avg_delay']])


Most common origin airport ID: 11292
    ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID     origin_city destination_city  \
0               10299            11630       Anchorage        Fairbanks   
1               10299            12523       Anchorage           Juneau   
2               10721            11278          Boston       Washington   
3               10721            12953          Boston         New York   
4               11292            10397          Denver          Atlanta   
..                ...              ...             ...              ...   
57              14771            14679   San Francisco        San Diego   
58              14771            14747   San Francisco          Seattle   
59              14771            14908   San Francisco        Santa Ana   
60              14869            10397  Salt Lake City          Atlanta   
61              14869            12266  Salt Lake City          Houston   

    avg_duration  most_common_departure  most_common_airline  

In [13]:
"""
produces a dictionary for each of the origin-city pairs including all stats. also generated a randomized
number of the same flight departing at different times to simulate real life.
"""

import random
simulated_dict = {}

for _, row in route_stats.iterrows():

    # Key base like "New York-Los Angeles"
    base_key = f"{row['origin_city']}-{row['destination_city']}"

    # Random number of simulated flights (e.g. between 1 and 5)
    num_simulated_flights = random.randint(1, 5)

    for i in range(1, num_simulated_flights + 1):

        # Random departure time in HHMM (0000–2359 but realistic ranges optional)
        allowed_times = []

        for hour in range(24):          # 0 → 23
            for minute in [0, 30]:      # :00 and :30
                time_value = hour * 100 + minute
                allowed_times.append(time_value)


        random_time = random.choice(allowed_times)
        # Random seats available (exponential)
        min_seats = 1
        max_seats = 5
        scale = 0.5  # smaller scale → more skewed toward larger numbers

        # Generate one random seat number
        rand_val = np.random.exponential(scale=scale)
        # Normalize and scale to min-max range
        random_seats = int(max_seats - (rand_val / (rand_val + 1)) * (max_seats - min_seats))
        random_seats = max(min_seats, min(random_seats, max_seats))

        # Build key with -1, -2, -3 suffix
        key = f"{base_key}-{i}"

        # Store all data exactly like your structure
        simulated_dict[key] = [
            row['origin_city'],
            row['destination_city'],
            row['ORIGIN_AIRPORT_ID'],
            row['DEST_AIRPORT_ID'],
            row['avg_duration'],
            row['freq_delay'],
            row['avg_delay'],
            random_time,
            row['most_common_airline'],
            random_seats
        ]

dictionary_df = pd.DataFrame(simulated_dict, index = ['origin_city', 'destination_city','origin_airport', 'destination_airport', 'expected_duration', 'freq_delay', 'avg_delay','departure(HHMM)', 'airline', 'num_seats'])
dictionary_df



Unnamed: 0,Anchorage-Fairbanks-1,Anchorage-Fairbanks-2,Anchorage-Fairbanks-3,Anchorage-Fairbanks-4,Anchorage-Juneau-1,Anchorage-Juneau-2,Anchorage-Juneau-3,Anchorage-Juneau-4,Anchorage-Juneau-5,Boston-Washington-1,...,San Francisco-Seattle-3,San Francisco-Seattle-4,San Francisco-Seattle-5,San Francisco-Santa Ana-1,San Francisco-Santa Ana-2,San Francisco-Santa Ana-3,Salt Lake City-Atlanta-1,Salt Lake City-Atlanta-2,Salt Lake City-Atlanta-3,Salt Lake City-Houston-1
origin_city,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Boston,...,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City
destination_city,Fairbanks,Fairbanks,Fairbanks,Fairbanks,Juneau,Juneau,Juneau,Juneau,Juneau,Washington,...,Seattle,Seattle,Seattle,Santa Ana,Santa Ana,Santa Ana,Atlanta,Atlanta,Atlanta,Houston
origin_airport,10299,10299,10299,10299,10299,10299,10299,10299,10299,10721,...,14771,14771,14771,14771,14771,14771,14869,14869,14869,14869
destination_airport,11630,11630,11630,11630,12523,12523,12523,12523,12523,11278,...,14747,14747,14747,14908,14908,14908,10397,10397,10397,12266
expected_duration,59.925075,59.925075,59.925075,59.925075,99.441055,99.441055,99.441055,99.441055,99.441055,102.933923,...,129.926744,129.926744,129.926744,92.40545,92.40545,92.40545,213.182857,213.182857,213.182857,180.664073
freq_delay,0.264935,0.264935,0.264935,0.264935,0.246305,0.246305,0.246305,0.246305,0.246305,0.253081,...,0.430967,0.430967,0.430967,0.350959,0.350959,0.350959,0.43115,0.43115,0.43115,0.279802
avg_delay,29.121849,29.121849,29.121849,29.121849,30.77,30.77,30.77,30.77,30.77,55.96072,...,36.560111,36.560111,36.560111,39.483333,39.483333,39.483333,30.7,30.7,30.7,48.057878
departure(HHMM),2300,200,530,230,300,1600,1200,2130,1030,1300,...,2230,430,1730,430,30,1500,700,1400,1800,100
airline,19930,19930,19930,19930,19930,19930,19930,19930,19930,20409,...,19930,19930,19930,19930,19930,19930,19790,19790,19790,19977
num_seats,4,3,4,4,4,4,2,4,2,3,...,2,4,4,2,3,3,4,4,4,4


In [None]:
distinct_routes = (
    dictionary_df.loc[['origin_city', 'destination_city']]
    .T
    .drop_duplicates()
    .reset_index(drop=True)
)

distinct_routes

Unnamed: 0,origin_city,destination_city
0,Anchorage,Fairbanks
1,Anchorage,Juneau
2,Boston,Washington
3,Boston,New York
4,Denver,Atlanta
...,...,...
57,San Francisco,San Diego
58,San Francisco,Seattle
59,San Francisco,Santa Ana
60,Salt Lake City,Atlanta


In [49]:
"""

starting here, im trying to build an output that takes the feasible_routes dictionary for each destination that
has the flight stats so i can use those as options in my optimization problem. the first step here is to convert departure
and arrival times to minutes
"""
def hhmm_to_minutes(hhmm):
    hh = hhmm // 100
    mm = hhmm % 100
    return hh*60 + mm

def minutes_to_hhmm(minutes):
    minutes = int(minutes) #% (24*60)
    hh = minutes // 60
    mm = minutes % 60
    return hh*100 + mm

dictionary_df = dictionary_df.T

# Convert rows from strings to proper types if needed
dictionary_df['expected_duration'] = dictionary_df['expected_duration'].astype(float)
dictionary_df['departure(HHMM)'] = dictionary_df['departure(HHMM)'].astype(int)
dictionary_df['freq_delay'] = dictionary_df['freq_delay'].astype(float)
dictionary_df['avg_delay'] = dictionary_df['avg_delay'].astype(float)
dictionary_df['origin_airport'] = dictionary_df['origin_airport'].astype(int)
dictionary_df['destination_airport'] = dictionary_df['destination_airport'].astype(int)
dictionary_df['num_seats'] = dictionary_df['num_seats'].astype(int)


dictionary_df['dep_minutes'] = dictionary_df['departure(HHMM)'].apply(hhmm_to_minutes)
dictionary_df['arr_minutes'] = dictionary_df['dep_minutes'] + dictionary_df['expected_duration']

In [50]:
""""
this code does the making of flight options that involve multiple legs, checking feasibility (ie. the min layover is 30 min)
"""

# Start city and airport
start_city = "Los Angeles"
start_airport = 12892

# Minimum layover in minutes
min_layover = 30

# Create dictionary mapping origin city to all flights leaving that city. example flights_from_city['LosAngeles'] gives all flights
#leaving from LA
flights_from_city = {}
for _, row in dictionary_df.iterrows():
    flights_from_city.setdefault(row['origin_city'], []).append(row)

# Helper function to compute total frequency and average delay of a route
def aggregate_route(route_flights):
    total_expected_duration = route_flights[-1]['arr_minutes'] - route_flights[0]['dep_minutes'] #time from dep. of flight 1 to arrival of last flight, doesn't factor in delays
    avg_freq_delay = sum(f['freq_delay'] for f in route_flights) / len(route_flights)
    avg_delay = sum(f['avg_delay'] for f in route_flights) / len(route_flights)
    return total_expected_duration, avg_freq_delay, avg_delay

# Recursive function to find all feasible routes from LA to a destination
def find_routes(current_city, destination_city, visited=set(), current_route=[]):
    """
    Goal: find all feasible routes from current_city to destination_city.

    How it works:
    1. Look up all flights leaving current_city.
    2. Avoid cycles: don’t revisit cities in the same route (visited).
    3. If this is a connecting flight, check that the layover is at least 30 minutes.
    4. If the next flight reaches destination_city, add this complete route to results.
    5. Otherwise, recurse to explore flights from next_city.

    This is effectively doing a depth-first search of all possible flight paths from LA to the destination.
    """
    if current_city not in flights_from_city:
        return []


    routes = []
    for flight in flights_from_city[current_city]:
        # Avoid cycles
        next_city = flight['destination_city']
        if next_city in visited:
            continue

        # Check layover if this is a connecting flight
        if current_route:
            prev_flight = current_route[-1]
            if flight['dep_minutes'] < prev_flight['arr_minutes'] + min_layover:
                continue

        new_route = current_route + [flight]
        if next_city == destination_city:
            routes.append(new_route)
        else:
            routes.extend(find_routes(next_city, destination_city, visited | {next_city}, new_route))
    return routes

# Example: generate routes from LA to all other cities
all_destinations = set(dictionary_df['destination_city']) - {start_city}
all_routes = {}

for dest in all_destinations:
    feasible_routes = find_routes(start_city, dest) #contains all valid paths from LA to that destination.
    summarized_routes = []
    for route in feasible_routes:
        total_duration, freq_delay, avg_delay = aggregate_route(route)
        summarized_routes.append({
            'num_legs': len(route),
            #legs is a list of [origin, destination, departure HHMM, arrival HHMM for each flight]
            'legs': [(f['origin_city'], f['destination_city'], f['departure(HHMM)'], minutes_to_hhmm(f['arr_minutes'])) for f in route],
            'total_expected_duration': total_duration,
            'avg_freq_delay': freq_delay,
            'avg_delay': avg_delay,
            'num_seats': route[0]['num_seats']  # assuming same for all legs
        })
    all_routes[dest] = summarized_routes

# Example: print routes to Seattle
import pprint
pprint.pprint(all_routes['Seattle'])

[{'avg_delay': 36.44010125613679,
  'avg_freq_delay': 0.4058059838548061,
  'legs': [('Los Angeles', 'Denver', 900, 1117),
           ('Denver', 'Las Vegas', 1900, 2055),
           ('Las Vegas', 'Seattle', 2230, 2510)],
  'num_legs': 3,
  'num_seats': 2,
  'total_expected_duration': 970.6671207992733},
 {'avg_delay': 36.44010125613679,
  'avg_freq_delay': 0.4058059838548061,
  'legs': [('Los Angeles', 'Denver', 900, 1117),
           ('Denver', 'Las Vegas', 1830, 2025),
           ('Las Vegas', 'Seattle', 2230, 2510)],
  'num_legs': 3,
  'num_seats': 2,
  'total_expected_duration': 970.6671207992733},
 {'avg_delay': 34.576346610939105,
  'avg_freq_delay': 0.39703089174894113,
  'legs': [('Los Angeles', 'Denver', 900, 1117),
           ('Denver', 'Seattle', 1800, 2055)],
  'num_legs': 2,
  'num_seats': 2,
  'total_expected_duration': 715.776962252846},
 {'avg_delay': 34.576346610939105,
  'avg_freq_delay': 0.39703089174894113,
  'legs': [('Los Angeles', 'Denver', 900, 1117),
          

In [51]:
"""
turns the previous output into a dataframe for easy viewing and manipulation
"""


# Prepare a list of rows for the DataFrame
rows = []

for dest, routes in all_routes.items():
    for route in routes:
        rows.append({
            'destination_city': dest,
            'num_legs': route['num_legs'],
            'legs': route['legs'],  # list of tuples (origin, dest, dep, arr)
            'total_expected_duration': route['total_expected_duration'],
            'avg_freq_delay': route['avg_freq_delay'],
            'avg_delay': route['avg_delay'],
            'num_seats': route['num_seats']
        })

# Create the DataFrame
routes_df = pd.DataFrame(rows)

# Optional: sort by total duration or number of legs
routes_df = routes_df.sort_values(['num_legs', 'total_expected_duration']).reset_index(drop=True)

# Show example
print(routes_df.head(10))

  destination_city  num_legs                                        legs  \
0        Las Vegas         1      [(Los Angeles, Las Vegas, 1630, 1743)]   
1        Las Vegas         1      [(Los Angeles, Las Vegas, 1000, 1113)]   
2        Las Vegas         1      [(Los Angeles, Las Vegas, 1530, 1643)]   
3        Las Vegas         1      [(Los Angeles, Las Vegas, 1130, 1243)]   
4        Las Vegas         1        [(Los Angeles, Las Vegas, 830, 943)]   
5    San Francisco         1    [(Los Angeles, San Francisco, 730, 857)]   
6    San Francisco         1  [(Los Angeles, San Francisco, 1630, 1757)]   
7    San Francisco         1  [(Los Angeles, San Francisco, 2000, 2127)]   
8    San Francisco         1  [(Los Angeles, San Francisco, 2300, 2427)]   
9           Denver         1         [(Los Angeles, Denver, 2200, 2417)]   

   total_expected_duration  avg_freq_delay  avg_delay  num_seats  
0                73.013380        0.393589  39.975113          3  
1                73.013380   

In [52]:
"""
trying to turn routes_df to a dictionary_df to put into the optimization problem
"""

expanded_rows = []

for i, row in routes_df.iterrows():
    flat = {
        'route_name': f"{row['destination_city']}-route-{i+1}",
        'destination_city': row['destination_city'],
        'num_legs': row['num_legs'],
        'total_expected_duration': row['total_expected_duration'],
        'avg_freq_delay': row['avg_freq_delay'],
        'avg_delay': row['avg_delay'],
        'num_seats': row['num_seats'],
    }

    # expand each leg
    for leg_idx, leg in enumerate(row['legs'], start=1):
        origin, dest, dep, arr = leg
        flat[f'leg{leg_idx}_origin_city'] = origin
        flat[f'leg{leg_idx}_destination_city'] = dest
        flat[f'leg{leg_idx}_departure_HHMM'] = dep
        flat[f'leg{leg_idx}_arrival_HHMM'] = arr

    expanded_rows.append(flat)

expanded_df = pd.DataFrame(expanded_rows)
expanded_df.head()

# Get unique destinations (this will be used in the optimization problem)
unique_destinations = expanded_df['destination_city'].dropna().unique().tolist()

print("Unique destinations:", unique_destinations)



Unique destinations: ['Las Vegas', 'San Francisco', 'Denver', 'Seattle', 'New York', 'Spokane', 'San Diego', 'Chicago', 'Dallas/Fort Worth', 'Atlanta', 'Phoenix', 'Boise', 'Boston', 'Minneapolis', 'Santa Ana', 'Salt Lake City', 'Orlando', 'Anchorage', 'Fort Lauderdale', 'San Juan', 'San Jose', 'Sacramento', 'Washington', 'Houston', 'Newark']


In [53]:
"""
code to see the routes that reach a certain destination
"""

# Target destination
target_destination = 'Seattle'

# Filter routes by destination_city
filtered_routes_df = routes_df[
    routes_df['destination_city'].str.strip().str.lower() == target_destination.lower()
].reset_index(drop=True)

# Optional: show the first few filtered routes
print(filtered_routes_df)

# If you want just the legs or other info
for i, row in filtered_routes_df.iterrows():
    print(f"Route {i+1}: {row['legs']} | Seats: {row['num_seats']} | Duration: {row['total_expected_duration']}")



   destination_city  num_legs  \
0           Seattle         1   
1           Seattle         1   
2           Seattle         1   
3           Seattle         1   
4           Seattle         1   
5           Seattle         2   
6           Seattle         2   
7           Seattle         2   
8           Seattle         2   
9           Seattle         2   
10          Seattle         2   
11          Seattle         2   
12          Seattle         2   
13          Seattle         2   
14          Seattle         2   
15          Seattle         2   
16          Seattle         2   
17          Seattle         2   
18          Seattle         2   
19          Seattle         2   
20          Seattle         2   
21          Seattle         2   
22          Seattle         2   
23          Seattle         2   
24          Seattle         2   
25          Seattle         2   
26          Seattle         2   
27          Seattle         2   
28          Seattle         2   
29        

In [54]:
"""
expanded the previous dataframe to show more details of each leg of the route
"""
# Determine the maximum number of legs in any route
max_legs = routes_df['num_legs'].max()

# Prepare rows for the expanded DataFrame
expanded_rows = []

for _, row in routes_df.iterrows():
    expanded_row = {
        'destination_city': row['destination_city'],
        'num_legs': row['num_legs'],
        'total_expected_duration': row['total_expected_duration'],
        'avg_freq_delay': row['avg_freq_delay'],
        'avg_delay': row['avg_delay'],
        'num_seats': row['num_seats']
    }

    # Add each leg as separate columns
    for i, leg in enumerate(row['legs'], start=1):
        expanded_row[f'leg{i}_origin'] = leg[0]
        expanded_row[f'leg{i}_destination'] = leg[1]
        expanded_row[f'leg{i}_dep'] = leg[2]
        expanded_row[f'leg{i}_arr'] = leg[3]

    # Fill missing legs with None
    for i in range(len(row['legs']) + 1, max_legs + 1):
        expanded_row[f'leg{i}_origin'] = None
        expanded_row[f'leg{i}_destination'] = None
        expanded_row[f'leg{i}_dep'] = None
        expanded_row[f'leg{i}_arr'] = None

    expanded_rows.append(expanded_row)

# Create the expanded DataFrame
expanded_routes_df = pd.DataFrame(expanded_rows)

# Show example
print(expanded_routes_df.head(10))


  destination_city  num_legs  total_expected_duration  avg_freq_delay  \
0        Las Vegas         1                73.013380        0.393589   
1        Las Vegas         1                73.013380        0.393589   
2        Las Vegas         1                73.013380        0.393589   
3        Las Vegas         1                73.013380        0.393589   
4        Las Vegas         1                73.013380        0.393589   
5    San Francisco         1                87.731371        0.395204   
6    San Francisco         1                87.731371        0.395204   
7    San Francisco         1                87.731371        0.395204   
8    San Francisco         1                87.731371        0.395204   
9           Denver         1               137.191554        0.332466   

   avg_delay  num_seats  leg1_origin leg1_destination  leg1_dep  leg1_arr  \
0  39.975113          3  Los Angeles        Las Vegas      1630      1743   
1  39.975113          4  Los Angeles      

In [62]:
"""
making passenger df with passenger preferences
"""
def generate_passengers(n_passengers):

    passengers = []

    # extract valid airports unique_destinations. CAVEAT: this might not include all cities previously, given
    #layover and flight feasibility, so for simplicity just going to include the destinations that have valid routes
    valid_destinations = unique_destinations

    # random grouping (families)
    # groups of size 1–4
    group_sizes = []
    remaining = n_passengers

    while remaining > 0:
        size = random.randint(1, min(4, remaining))
        group_sizes.append(size)
        remaining -= size

    group_id = 1
    passenger_id = 1

    for size in group_sizes:
        # pick shared destination and latest arrival for this group
        group_destination = random.choice(valid_destinations)
        group_latest_arrival = random.choice([h*100 + m for h in range(24,48) for m in [0,30]])

        for _ in range(size):

            passenger = {}
            passenger['passenger_id'] = passenger_id
            passenger['group_id'] = group_id

            # random origin/destination
            passenger['origin_airport'] = 'Los Angeles'
            passenger['destination_airport'] = group_destination
            passenger['latest_arrival'] = group_latest_arrival

            # airline preference (20% chance of having one)
            #airline_pref = random.choice(dictionary_df.loc['airline'].tolist())
            #passenger['airline_pref'] = airline_pref if random.random() < 0.2 else None

            passengers.append(passenger)
            passenger_id += 1

        group_id += 1

    return pd.DataFrame(passengers)

passengers_df = generate_passengers(10)
print(passengers_df)

   passenger_id  group_id origin_airport destination_airport  latest_arrival
0             1         1    Los Angeles             Spokane            4200
1             2         1    Los Angeles             Spokane            4200
2             3         1    Los Angeles             Spokane            4200
3             4         2    Los Angeles              Newark            3700
4             5         2    Los Angeles              Newark            3700
5             6         3    Los Angeles              Newark            4730
6             7         3    Los Angeles              Newark            4730
7             8         4    Los Angeles           Anchorage            2530
8             9         5    Los Angeles          Sacramento            4500
9            10         5    Los Angeles          Sacramento            4500


In [63]:
import cvxpy as cp
"""

updated optimization problem
"""

# ---------- Helpers ----------
def hhmm_to_extended_hhmm(hhmm):
    """
    Convert HHMM-like value to minutes past midnight.
    Accepts int or str. Accepts hours >= 24 (e.g. 2530 -> 25:30 -> next-day).
    Returns integer minutes.
    """
    if pd.isna(hhmm):
        return None
    s = str(int(hhmm))
    # ensure at least 3 digits ('0' -> '0' -> 0 minutes)
    if len(s) <= 2:
        hh = 0
        mm = int(s)
    else:
        hh = int(s[:-2])
        mm = int(s[-2:])
    return hh * 60 + mm

def hhmm_to_minutes(hhmm):
    """Convert HHMM to total minutes past midnight (used only for fallback calculations)."""
    if pd.isna(hhmm):
        return None
    s = str(int(hhmm)).zfill(4)
    if len(s) <= 2:
        hh = 0
        mm = int(s)
    else:
        hh = int(s[:-2])
        mm = int(s[-2:])
    return hh * 60 + mm

def minutes_to_extended_hhmm(minutes):
    """Convert total minutes past midnight to extended HHMM format."""
    hh = minutes // 60
    mm = minutes % 60
    return hh * 100 + mm

# ---------- Prepare flights (routes) from expanded_df ----------
# assume expanded_df exists and has 'route_name' column
expanded_df = expanded_df.copy().reset_index(drop=True)

# route identifiers
routes = expanded_df['route_name'].tolist()
num_routes = len(routes)

# route-level arrays
route_dest = expanded_df['destination_city'].tolist()
route_seats = expanded_df['num_seats'].astype(float).fillna(0).astype(int).tolist()
route_avg_freq_delay = expanded_df['avg_freq_delay'].fillna(0).astype(float).tolist()
route_avg_delay = expanded_df['avg_delay'].fillna(0).astype(float).tolist()
route_total_expected_duration = expanded_df['total_expected_duration'].fillna(0).tolist()

# compute route final arrival time by reading the arrival column for the last leg:
route_arrival_minutes = []
route_departure_minutes = []


for idx, row in expanded_df.iterrows():
    nlegs = int(row.get('num_legs', 1))
    arr_col = f'leg{nlegs}_arrival_HHMM'
    dep_col = f'leg1_departure_HHMM'
    arr_val = row.get(arr_col, None)
    dep_val = row.get(dep_col, None)

    arr_ehhmm = hhmm_to_extended_hhmm(arr_val)
    dep_ehhmm = hhmm_to_extended_hhmm(dep_val)

    # fallback: estimate arrival using departure + total_expected_duration
    if arr_ehhmm is None:
        if dep_val is not None and not pd.isna(row.get('total_expected_duration', np.nan)):
            dep_minutes = hhmm_to_minutes(dep_val)
            arr_minutes = dep_minutes + int(row['total_expected_duration'])
            hours = arr_minutes // 60
            mins = arr_minutes % 60
            arr_ehhmm = hours * 100 + mins


    if arr_ehhmm is None:
        arr_ehhmm = 999999
    if dep_ehhmm is None:
        dep_ehhmm = -999999

    route_arrival_minutes.append(arr_ehhmm)
    route_departure_minutes.append(dep_ehhmm)

route_arrival_minutes = np.array(route_arrival_minutes)
route_departure_minutes = np.array(route_departure_minutes)
# optional airline column
has_airline = 'airline' in expanded_df.columns
route_airline = expanded_df['airline'].tolist() if has_airline else [None] * num_routes

# ---------- Prepare passengers ----------
# assume passengers_df exists and has passenger_id, destination_airport (or destination_city), latest_arrival, airline_pref
pass_df = passengers_df.copy().reset_index(drop=True)
num_passengers = len(pass_df)
passenger_ids = pass_df['passenger_id'].tolist()

# normalize passenger destination to compare with route_dest
# If passenger field is 'destination_airport' but route has 'destination_city', ensure they match in your data.
pass_dest = pass_df.get('destination_airport', pass_df.get('destination_city')).tolist()

# convert passenger latest_arrival to minutes
pass_latest_arrival = []
for v in pass_df['latest_arrival'].tolist():
    if pd.isna(v):
        pass_latest_arrival.append(10**9)  # no constraint
    else:
        pass_latest_arrival.append(hhmm_to_minutes(v))
pass_latest_arrival = np.array(pass_latest_arrival, dtype=float)

# passenger airline preference
pass_air_pref = pass_df.get('airline_pref', pd.Series([None]*num_passengers)).tolist()

# ---------- Build optimization problem ----------
# Decision variable x[p, r] = 1 if passenger p assigned to route r
x = cp.Variable((num_passengers, num_routes), boolean=True)

# Objective: minimize expected delay proxy: freq_delay * avg_delay (per route)
route_expected_delay_metric = np.array(route_avg_freq_delay) * np.array(route_avg_delay)
# Expand to passenger x route shape for elementwise multiply - cvxpy will broadcast
objective = cp.Minimize(cp.sum(cp.multiply(x, route_expected_delay_metric)))

constraints = []

# 1) Each passenger must be assigned to exactly one route
for p_idx in range(num_passengers):
    constraints.append(cp.sum(x[p_idx, :]) == 1)

# 2) Seat capacity per route
for r_idx in range(num_routes):
    seats = int(route_seats[r_idx]) if not pd.isna(route_seats[r_idx]) else 0
    constraints.append(cp.sum(x[:, r_idx]) <= seats)

# 3) Passenger must end up at their requested destination
   #i.e., x[p, r] allowed only if route_dest[r] == pass_dest[p]
for p_idx in range(num_passengers):
    desired = pass_dest[p_idx]
    allowed = [r_idx for r_idx, rd in enumerate(route_dest) if rd == desired]
    if len(allowed) == 0:
        # No route reaches that destination: force infeasibility by disallowing all routes for this passenger
        # (user can detect infeasible problem). We add sum(x[p,:]) == 0 which conflicts with earlier ==1
        # So instead, print warning and relax to allow zero assignment (change equality to >=1 previously).
        print(f"WARNING: no route reaches destination {desired} for passenger_id {passenger_ids[p_idx]}")
        # relax constraint so a feasible solution may exist: allow zero assignment (replace equality with >=0)
        # To implement relaxation here, we replace the equality created above by removing it and using >=0
        # Simpler: set big-M to allow no feasible route by forcing the passenger to be unassigned if no match.
        # For now set x[p_idx, :] == 0
        constraints.append(x[p_idx, :] == 0)
    else:
        # block all other routes
        for r_idx in range(num_routes):
            if r_idx not in allowed:
                constraints.append(x[p_idx, r_idx] == 0)

# 4) Latest-arrival constraint: if route arrival > passenger latest, disallow
for p_idx in range(num_passengers):
    la = pass_latest_arrival[p_idx]
    for r_idx in range(num_routes):
        if route_arrival_minutes[r_idx] > la:
            constraints.append(x[p_idx, r_idx] == 0)

# 5) Airline preference: if passenger has an airline_pref and route has airline, block others
# if has_airline:
#     for p_idx in range(num_passengers):
#         pref = pass_air_pref[p_idx]
#         if pref is not None and not pd.isna(pref):
#             allowed = [r_idx for r_idx, a in enumerate(route_airline) if a == pref]
#             if len(allowed) == 0:
#                 # none match - prefer to ignore preference rather than make infeasible; print warning
#                 print(f"WARNING: Passenger {passenger_ids[p_idx]} prefers airline {pref} but no route matches.")
#             else:
#                 for r_idx in range(num_routes):
#                     if r_idx not in allowed:
#                         constraints.append(x[p_idx, r_idx] == 0)

# 6) Group constraint: members of same group must be on the same route
groups = pass_df['group_id'].unique()
for g in groups:
    members_idx = pass_df.index[pass_df['group_id'] == g].tolist()
    if len(members_idx) <= 1:
        continue
    # enforce all members have identical route choice using auxiliary y_g_r (binary)
    y = cp.Variable(num_routes, boolean=True)
    # link x and y: for each member p and route r: x[p,r] == y[r]
    for r_idx in range(num_routes):
        for p_idx in members_idx:
            constraints.append(x[p_idx, r_idx] == y[r_idx])
    # choose exactly one route for group
    constraints.append(cp.sum(y) == 1)

# ---------- Solve ----------
prob = cp.Problem(objective, constraints)

# choose solver - GLPK_MI is common; if not installed try CBC or ECOS_BB
try:
    prob.solve(solver=cp.GLPK_MI, verbose=True)
except Exception as e:
    print("GLPK_MI failed:", e)
    try:
        prob.solve(solver=cp.ECOS_BB, verbose=True)
    except Exception as e2:
        print("ECOS_BB failed:", e2)
        prob.solve(verbose=True)  # let cvxpy pick

print("Solver status:", prob.status)

# ---------- Extract assignment results ----------
if prob.status == 'optimal' or prob.status == 'optimal_inaccurate':
    x_val = x.value
    assigned_pairs = []
    for p_idx in range(num_passengers):
        for r_idx in range(num_routes):
            if x_val[p_idx, r_idx] > 0.5:
                assigned_pairs.append({
                    'passenger_id': passenger_ids[p_idx],
                    'route_name': routes[r_idx],
                    'route_destination': route_dest[r_idx],
                    #'route_arrival_min': route_arrival_minutes[r_idx],
                    'route_seats': route_seats[r_idx]
                })
    assignments_df = pd.DataFrame(assigned_pairs)
   # ---------- Expand assignment results with passenger data ----------

    if prob.status in ['optimal', 'optimal_inaccurate']:
        x_val = x.value
        assigned_rows = []

        for p_idx in range(num_passengers):
            for r_idx in range(num_routes):
                if x_val[p_idx, r_idx] > 0.5:
                    assigned_rows.append({
                        'passenger_id': passenger_ids[p_idx],
                        'route_name': routes[r_idx],
                        'route_destination': route_dest[r_idx],
                        #'route_arrival_min': route_arrival_minutes[r_idx],
                        'route_seats': route_seats[r_idx]
                    })

        assignments_df = pd.DataFrame(assigned_rows)

        # ---- NEW: merge with full passenger dataframe ----
        expanded_assignments_df = assignments_df.merge(
            passengers_df,
            on='passenger_id',
            how='left'
        )

        print("\nExpanded Assignments With Passenger Data:")
        print(expanded_assignments_df)

    else:
        print("Problem not solved to optimality. Status:", prob.status)



(CVXPY) Nov 25 07:13:57 PM: Your problem has 4004 variables, 5649 constraints, and 0 parameters.


                                     CVXPY                                     
                                     v1.6.7                                    


(CVXPY) Nov 25 07:13:57 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Nov 25 07:13:57 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Nov 25 07:13:57 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Nov 25 07:13:57 PM: Your problem is compiled with the CPP canonicalization backend.
(CVXPY) Nov 25 07:13:58 PM: Compiling problem (target solver=GLPK_MI).
(CVXPY) Nov 25 07:13:58 PM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> GLPK_MI
(CVXPY) Nov 25 07:13:58 PM: Applying reduction Dcp2Cone


-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------


(CVXPY) Nov 25 07:13:58 PM: Applying reduction CvxAttr2Constr
(CVXPY) Nov 25 07:13:58 PM: Applying reduction ConeMatrixStuffing
(CVXPY) Nov 25 07:14:03 PM: Applying reduction GLPK_MI
(CVXPY) Nov 25 07:14:03 PM: Finished problem compilation (took 6.017e+00 seconds).
(CVXPY) Nov 25 07:14:03 PM: Invoking solver GLPK_MI  to obtain a solution.
(CVXPY) Nov 25 07:14:03 PM: Problem status: optimal
(CVXPY) Nov 25 07:14:03 PM: Optimal value: 1.551e+02
(CVXPY) Nov 25 07:14:03 PM: Compilation took 6.017e+00 seconds
(CVXPY) Nov 25 07:14:03 PM: Solver (including time spent in interface) took 4.578e-02 seconds


-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
                                    Summary                                    
-------------------------------------------------------------------------------
Solver status: optimal

Expanded Assignments With Passenger Data:
   passenger_id            route_name route_destination  route_seats  \
0             1      Spokane-route-26           Spokane            4   
1             2      Spokane-route-26           Spokane            4   
2             3      Spokane-route-26           Spokane            4   
3             4      Newark-route-221            Newark            4   
4             5      Newark-route-221            Newark            4   
5             6      N

In [57]:
"""
code to check the route that the optimization problem suggests for passengers
"""
route_to_find = "Phoenix-route-75"

matching_rows = expanded_df[expanded_df['route_name'] == route_to_find]
print(matching_rows)

          route_name destination_city  num_legs  total_expected_duration  \
74  Phoenix-route-75          Phoenix         2               550.354092   

    avg_freq_delay  avg_delay  num_seats leg1_origin_city  \
74        0.393616  39.722776          3      Los Angeles   

   leg1_destination_city  leg1_departure_HHMM  ...  leg2_departure_HHMM  \
74             Las Vegas                  830  ...               1630.0   

   leg2_arrival_HHMM leg3_origin_city  leg3_destination_city  \
74            1740.0              NaN                    NaN   

    leg3_departure_HHMM leg3_arrival_HHMM leg4_origin_city  \
74                  NaN               NaN              NaN   

    leg4_destination_city  leg4_departure_HHMM leg4_arrival_HHMM  
74                    NaN                  NaN               NaN  

[1 rows x 23 columns]


In [None]:
"""
results of optimization

for 10 passengers, global  = 150 minutes delayed

run 1: optimization problem = 151
- for set of passengers = 3 phoenix, 4 boston, 3 LV = 160.3972
- 5.859% better

run 2: optimization problem = 150
- 2 SLC, 4 Santa Ana, 4 NY = 157.917
- 5.013% better

run 3: optimization problem: 155.1
- 3 Spokane, 4 Newark, 1 Anchorage, 2 Sacramento: 161.675
- 4.067
"""


In [59]:
"""
baseline: overall average and average by destination
"""

overall_avg_freq_delay = expanded_df['avg_freq_delay'].mean()
overall_avg_delay = expanded_df['avg_delay'].mean()

print(f"Overall average frequency of delay: {overall_avg_freq_delay:.3f}")
print(f"Overall average amount of delay: {overall_avg_delay:.3f}")


avg_by_destination = expanded_df.groupby('destination_city')[['avg_freq_delay', 'avg_delay']].mean().reset_index()

print("\nAverage frequency and amount of delay by destination:")
print(avg_by_destination)

Overall average frequency of delay: 0.385
Overall average amount of delay: 39.467

Average frequency and amount of delay by destination:
     destination_city  avg_freq_delay  avg_delay
0           Anchorage        0.423355  31.225062
1             Atlanta        0.400690  37.390347
2               Boise        0.315482  29.797701
3              Boston        0.345424  51.877750
4             Chicago        0.386758  38.598209
5   Dallas/Fort Worth        0.372983  52.248036
6              Denver        0.371959  38.583139
7     Fort Lauderdale        0.403525  46.048234
8             Houston        0.380193  53.766353
9           Las Vegas        0.395692  35.284200
10        Minneapolis        0.378812  34.872844
11           New York        0.341013  50.756829
12             Newark        0.397274  50.321488
13            Orlando        0.392119  49.491757
14            Phoenix        0.393616  39.722776
15         Sacramento        0.419846  38.516039
16     Salt Lake City        0