In [None]:
!pip install pandas
!pip install numpy
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install gdown




In [1]:

# Import portion of a package
import matplotlib.pyplot as plt  # Most common visualization package that a lot of others are based on

# Import full packages under custom name
import numpy as np  # Common package for numerical methods
import pandas as pd  # Common package for data storeage/manipulation
import seaborn as sns  # Common package for statistical visualizations

# Import portion of a package
import scipy.stats as stats
from sklearn.impute import SimpleImputer as Imputer  # Specific function from common machine learning package\

#more packages
import gdown


In [2]:


# File ID from your link (assuming this is the cleaned dataset)
file_id = "13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r"
url = f"https://drive.google.com/uc?id={file_id}"

# Download the file
gdown.download(url, "flight_data.csv", quiet=False)

# Now load it
flight_data = pd.read_csv("flight_data.csv")
print(flight_data.shape)
print(flight_data.head())

Downloading...
From (original): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r
From (redirected): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r&confirm=t&uuid=e449dc90-81d8-4b29-8ca6-e80fbd3cd2fa
To: /content/flight_data.csv
100%|██████████| 1.32G/1.32G [00:14<00:00, 91.0MB/s]


(7546988, 32)
   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_

In [3]:

df = flight_data.copy()
print (df.head())
print(df.columns.tolist())

   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_ELAPSED_TIME  

In [None]:
"""
minimize sum of (epsilon?) (frequency * average * passengers)
st.
    ...

optimization problem in words
- frequency = probability of flight being delayed
- average = expected delay time (in minutes)
- passengers = number of passengers taking that flight
- possible constraints for passengers:
  - arrival/depaarture time being between certain times
  - flight duration being shorter than x hours
  - groups of passengers staying together (ie families)
  - max number of extra seats on flights
  - need to get to destination by a certain time
  - airline?
  - any connecting flights must have at least a 1 hour buffer for connecting purposes


info needed about the set of flights:
- number of seats available (this is a made up number)
- frequency
- average
- arrival/departure time/day/month/year
- flight duration
- destination/origin airport/city
- airline

"""

In [4]:
"""
calculating average and frequency of delay for each flight with a different origin-destination city pair

"""
# Boolean series: True if there is a delay
def compute_delay_stats(group):
    delayed = group['DEP_DELAY_NEW'] > 0

    # Probability of delay
    prob_delay = delayed.sum() / len(group)

    # Average delay (only for delayed rows)
    if delayed.sum() > 0:
        avg_delay = group.loc[delayed, 'DEP_DELAY_NEW'].mean()
    else:
        avg_delay = 0.0

    return pd.Series({'freq_delay': prob_delay, 'avg_delay': avg_delay})

# Group by origin and destination airport to calculate delay stats
delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()

# Display
print(delay_stats.head())



   ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID  freq_delay  avg_delay
0              10135            10397    0.259587  51.750000
1              10135            10693    0.247619  45.307692
2              10135            11057    0.282161  76.173252
3              10135            11292    0.255814  51.181818
4              10135            11697    0.283951  38.652174


  delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()


In [5]:
"""

for the optimization problem, assuming starting point is LA. to get a network of flights, took the top 5
most frequent destinations from LA, from those 5 took the top 5, and from those took the top 2 to get a
solid variety of cities.

"""
#doing a network of flights instead of top 20
start_airport = 12892  # LA

# --- First leg: LA -> top 5 destinations ---
from_LA = df[df['ORIGIN_AIRPORT_ID'] == start_airport]
top5_first_leg = from_LA['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
print("Top 5 destinations from LA:", top5_first_leg)

# --- Second leg: from each first leg destination, top 5 destinations (excluding LA) ---
second_leg = {}
second_leg_airports = set()
for origin in top5_first_leg:
    flights_from_origin = df[df['ORIGIN_AIRPORT_ID'] == origin]
    top5 = flights_from_origin[flights_from_origin['DEST_AIRPORT_ID'] != start_airport] \
            ['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
    second_leg[origin] = top5
    second_leg_airports.update(top5)

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} -> {dests}")

# --- Third leg: from second leg destinations, top 2 destinations (excluding first leg, second leg, and return) ---
third_leg = {}
first_leg_set = set(top5_first_leg)

for origin, dests in second_leg.items():
    for dest in dests:
        flights_from_dest = df[df['ORIGIN_AIRPORT_ID'] == dest]

        # exclude first leg airports, all second leg airports, the immediate origin, AND Los Angeles
        exclude_set = first_leg_set | second_leg_airports | {dest, start_airport}

        # pick top 2 *after* removing Los Angeles
        top2 = (
            flights_from_dest[
                ~flights_from_dest['DEST_AIRPORT_ID'].isin(exclude_set)
            ]['DEST_AIRPORT_ID']
            .value_counts()
            .head(2)
            .index
            .tolist()
        )

        third_leg[dest] = top2

print("\nThird leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):")
for origin, dests in third_leg.items():
    print(f"{origin} -> {dests}")


Top 5 destinations from LA: [14771, 12889, 12478, 11292, 14747]

Second leg top 5 destinations from each first leg destination:
14771 -> [14747, 12478, 14679, 12889, 11292]
12889 -> [11292, 14107, 14679, 14747, 11298]
12478 -> [14771, 10721, 13303, 13204, 14492]
11292 -> [14107, 12889, 14869, 14747, 13930]
14747 -> [10299, 14057, 14107, 11292, 12889]

Third leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):
14747 -> [11884, 10713]
12478 -> [11697, 14843]
14679 -> [14831, 14893]
12889 -> [14893, 10397]
11292 -> [13487, 10397]
14107 -> [14908, 10397]
11298 -> [12266, 10397]
14771 -> [14908, 11618]
10721 -> [11278, 12953]
13303 -> [10397, 12953]
13204 -> [10397, 11618]
14492 -> [10397, 12953]
14869 -> [10397, 12266]
13930 -> [12953, 11278]
10299 -> [11630, 12523]
14057 -> [14831, 14893]


In [6]:
"""
converts all the airport IDs from the previous step to cities
"""

airport_to_city = (
    df[['ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME']]
    .drop_duplicates()
    .assign(ORIGIN_CITY_NAME=lambda x: x['ORIGIN_CITY_NAME'].str.split(',').str[0])
    .set_index('ORIGIN_AIRPORT_ID')['ORIGIN_CITY_NAME']
    .to_dict()
)


def id_to_city(airport_id):
    return airport_to_city.get(airport_id, f"Unknown({airport_id})")

def list_to_cities(id_list):
    return [id_to_city(i) for i in id_list]

print("Top 5 destinations from LA:")
for a in top5_first_leg:
    print(a, "->", id_to_city(a))

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")

print("\nThird leg top 2 destinations from second leg:")
for origin, dests in third_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")


Top 5 destinations from LA:
14771 -> San Francisco
12889 -> Las Vegas
12478 -> New York
11292 -> Denver
14747 -> Seattle

Second leg top 5 destinations from each first leg destination:
14771 (San Francisco) -> ['Seattle', 'New York', 'San Diego', 'Las Vegas', 'Denver']
12889 (Las Vegas) -> ['Denver', 'Phoenix', 'San Diego', 'Seattle', 'Dallas/Fort Worth']
12478 (New York) -> ['San Francisco', 'Boston', 'Miami', 'Orlando', 'Raleigh/Durham']
11292 (Denver) -> ['Phoenix', 'Las Vegas', 'Salt Lake City', 'Seattle', 'Chicago']
14747 (Seattle) -> ['Anchorage', 'Portland', 'Phoenix', 'Denver', 'Las Vegas']

Third leg top 2 destinations from second leg:
14747 (Seattle) -> ['Spokane', 'Boise']
12478 (New York) -> ['Fort Lauderdale', 'San Juan']
14679 (San Diego) -> ['San Jose', 'Sacramento']
12889 (Las Vegas) -> ['Sacramento', 'Atlanta']
11292 (Denver) -> ['Minneapolis', 'Atlanta']
14107 (Phoenix) -> ['Santa Ana', 'Atlanta']
11298 (Dallas/Fort Worth) -> ['Houston', 'Atlanta']
14771 (San Francisc

In [7]:
"""
this section lists all possible destinations for the optimization problem
"""
# Collect all city names from each leg
possible_destinations = set()

# --- First leg ---
for a in top5_first_leg:
    possible_destinations.add(id_to_city(a))

# --- Second leg ---
for origin, dests in second_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# --- Third leg ---
for origin, dests in third_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# Convert airport IDs in second/third leg to city names where needed
possible_destinations = {id_to_city(c) if isinstance(c, int) else c
                         for c in possible_destinations}

# Turn into a sorted list
possible_destinations = sorted(possible_destinations)

print(possible_destinations)
print("Number of unique cities:", len(possible_destinations))

['Anchorage', 'Atlanta', 'Boise', 'Boston', 'Chicago', 'Dallas/Fort Worth', 'Denver', 'Fairbanks', 'Fort Lauderdale', 'Houston', 'Juneau', 'Las Vegas', 'Miami', 'Minneapolis', 'New York', 'Newark', 'Orlando', 'Phoenix', 'Portland', 'Raleigh/Durham', 'Sacramento', 'Salt Lake City', 'San Diego', 'San Francisco', 'San Jose', 'San Juan', 'Santa Ana', 'Seattle', 'Spokane', 'Washington']
Number of unique cities: 30


In [13]:
"""
the output of this is turning all the "legs" into pairs of origin-destination cities
"""


# --- Gather all origin-destination pairs from 3 legs ---
# First leg pairs
first_leg_pairs = [(id_to_city(start_airport), id_to_city(dest)) for dest in top5_first_leg]

# Second leg pairs
second_leg_pairs = []
for origin_airport, dest_airports in second_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        second_leg_pairs.append((origin_city, dest_city))

# Third leg pairs
third_leg_pairs = []
for origin_airport, dest_airports in third_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        third_leg_pairs.append((origin_city, dest_city))

# Combine all pairs
all_pairs = set(first_leg_pairs + second_leg_pairs + third_leg_pairs)

print(all_pairs)
print (len(all_pairs))


{('San Francisco', 'Denver'), ('Las Vegas', 'Phoenix'), ('Denver', 'Las Vegas'), ('Seattle', 'Denver'), ('Denver', 'Seattle'), ('Chicago', 'New York'), ('Los Angeles', 'Denver'), ('New York', 'Boston'), ('Las Vegas', 'Seattle'), ('San Diego', 'San Jose'), ('Orlando', 'Newark'), ('New York', 'Fort Lauderdale'), ('Las Vegas', 'Dallas/Fort Worth'), ('New York', 'Miami'), ('Seattle', 'Phoenix'), ('San Francisco', 'Las Vegas'), ('Seattle', 'Boise'), ('Las Vegas', 'Sacramento'), ('Denver', 'Salt Lake City'), ('Salt Lake City', 'Atlanta'), ('Seattle', 'Las Vegas'), ('Phoenix', 'Atlanta'), ('Boston', 'Washington'), ('Miami', 'Atlanta'), ('Salt Lake City', 'Houston'), ('Anchorage', 'Juneau'), ('Dallas/Fort Worth', 'Atlanta'), ('San Francisco', 'Santa Ana'), ('Raleigh/Durham', 'Atlanta'), ('San Francisco', 'Seattle'), ('Dallas/Fort Worth', 'Houston'), ('Anchorage', 'Fairbanks'), ('Portland', 'San Jose'), ('Los Angeles', 'Las Vegas'), ('Denver', 'Chicago'), ('Raleigh/Durham', 'New York'), ('Los A

In [9]:
"""
this part shows all feasible routes starting from LA to each destination in the all pairs network
the output of this is a dictionary of city: possible routes from LA to that city
"""

from collections import defaultdict, deque

# Create adjacency list from all_pairs
graph = defaultdict(list)
for origin, dest in all_pairs:
    graph[origin].append(dest)

start_city = "Los Angeles"

# Dictionary to store feasible routes: destination -> set of tuples (routes)
feasible_routes = defaultdict(set)

# BFS-like traversal for up to 3 legs (direct + 2 connections)
max_legs = 3
queue = deque()
queue.append( ([start_city], 0) )  # (current_route_list, current_depth)

while queue:
    route, depth = queue.popleft()
    current_city = route[-1]

    if depth > 0:  # exclude the starting city itself
        dest = current_city
        feasible_routes[dest].add(tuple(route))

    if depth < max_legs:
        for neighbor in graph[current_city]:
            # avoid cycles in the route
            if neighbor not in route:
                queue.append((route + [neighbor], depth + 1))

# Example: print feasible routes to Seattle
for dest, routes in feasible_routes.items():
    print(dest, routes)


Denver {('Los Angeles', 'Seattle', 'Las Vegas', 'Denver'), ('Los Angeles', 'San Francisco', 'Seattle', 'Denver'), ('Los Angeles', 'Seattle', 'Denver'), ('Los Angeles', 'San Francisco', 'Denver'), ('Los Angeles', 'New York', 'San Francisco', 'Denver'), ('Los Angeles', 'Denver'), ('Los Angeles', 'Las Vegas', 'Seattle', 'Denver'), ('Los Angeles', 'Las Vegas', 'Denver'), ('Los Angeles', 'San Francisco', 'Las Vegas', 'Denver')}
Las Vegas {('Los Angeles', 'Seattle', 'Denver', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Seattle', 'Las Vegas'), ('Los Angeles', 'Denver', 'Seattle', 'Las Vegas'), ('Los Angeles', 'Denver', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Denver', 'Las Vegas'), ('Los Angeles', 'Seattle', 'Las Vegas'), ('Los Angeles', 'New York', 'San Francisco', 'Las Vegas'), ('Los Angeles', 'Las Vegas')}
Seattle {('Los Angeles', 'Denver', 'Las Vegas', 'Seattle'), ('Los Angeles', 'Las Vegas', 'Seattle'), ('Los Angeles', 'Las Vegas'

In [18]:
"""

the output of this is turning all the "legs" into pairs of origin-destination cities, but using airport ids
for future steps like filtering the original dataframe and generating statistics for each pair
"""


# --- Gather all origin-destination pairs from 3 legs ---

# First leg pairs
first_leg_pairs_id = [(start_airport, dest) for dest in top5_first_leg]

# Second leg pairs
second_leg_pairs_id = []
for origin_airport, dest_airports in second_leg.items():
    for dest_airport in dest_airports:
        second_leg_pairs_id.append((origin_airport, dest_airport))

# Third leg pairs
third_leg_pairs_id = []
for origin_airport, dest_airports in third_leg.items():
    for dest_airport in dest_airports:
        third_leg_pairs_id.append((origin_airport, dest_airport))

# Combine all pairs
all_pairs_id = set(first_leg_pairs_id + second_leg_pairs_id + third_leg_pairs_id)

print(all_pairs_id)
print(len(all_pairs_id))


{(12889, 14893), (14771, 14908), (14747, 11884), (14679, 14831), (12892, 11292), (14747, 11292), (14771, 14679), (14869, 10397), (12889, 14679), (12478, 14843), (11292, 14107), (12478, 14492), (11292, 13930), (10299, 11630), (12478, 10721), (14771, 11292), (11292, 10397), (12892, 12478), (12889, 11292), (12478, 11697), (14747, 14107), (14107, 10397), (12889, 11298), (14771, 11618), (14492, 10397), (12478, 13204), (14057, 14893), (11292, 14747), (11292, 13487), (13204, 11618), (14869, 12266), (11292, 12889), (11292, 14869), (14771, 12478), (10299, 12523), (14747, 10299), (11298, 10397), (12892, 12889), (14747, 12889), (12892, 14747), (14492, 12953), (13204, 10397), (12889, 14107), (13303, 10397), (12889, 10397), (14679, 14893), (12892, 14771), (14057, 14831), (14747, 14057), (12478, 14771), (14771, 14747), (14107, 14908), (14771, 12889), (14747, 10713), (10721, 12953), (10721, 11278), (13303, 12953), (13930, 12953), (12889, 14747), (13930, 11278), (11298, 12266), (12478, 13303)}
62


In [22]:

"""

this filters the full dataset to only the feasible flight pairs from the 3 legs

"""

# Mask for any pair in all_pairs (airport IDs)
mask = df[['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']].apply(
    lambda row: (row['ORIGIN_AIRPORT_ID'], row['DEST_AIRPORT_ID']) in all_pairs_id, axis=1
)

# Filtered DataFrame
filtered_df = df[mask].drop_duplicates()

print(f"Number of flights after filtering: {len(filtered_df)}")
print(filtered_df.head())

Number of flights after filtering: 383598
      YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
1118  2024        1      1             1            1                   19393   
1119  2024        1      1             1            1                   19393   
1120  2024        1      1             1            1                   19393   
1121  2024        1      1             1            1                   19393   
1122  2024        1      1             1            1                   19393   

      ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  \
1118              11292                1129202       Denver, CO   
1119              11292                1129202       Denver, CO   
1120              11292                1129202       Denver, CO   
1121              11292                1129202       Denver, CO   
1122              11292                1129202       Denver, CO   

      DEST_AIRPORT_ID  ...  ACTUAL_ELAPSED_TIME DISTANCE  CARRIER_DE

In [24]:
"""
for each of the origin-city pairs, gets all of the flight stats
"""

#making set of top 20 flights

# Assuming the dataframe is already loaded
# Group by 'ORIGIN_CITY_NAME' and 'DEST_AIRPORT_ID' and count the occurrences
route_stats = filtered_df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).agg(
    origin_city=('ORIGIN_CITY_NAME', 'first'),
    destination_city=('DEST_CITY_NAME', 'first'),
    count=('ORIGIN_CITY_NAME', 'size'),
    avg_duration=('ACTUAL_ELAPSED_TIME', 'mean'),
    most_common_departure=('CRS_DEP_TIME', lambda x: x.mode()[0] if not x.mode().empty else np.nan), #arrival time will be calculated by adding elapsed time to arrival
    most_common_airline=('MKT_CARRIER_AIRLINE_ID', lambda x: x.mode()[0] if not x.mode().empty else np.nan)
).reset_index()

# Add a "made-up" number of seats available (for now I will assume random values, e.g. between 100 and 300)
route_stats['seats_available'] = np.random.randint(1, 6, size=len(route_stats))

# Merge with your top_20_routes dataframe
route_stats = route_stats.merge(delay_stats, on=['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID'], how='left')

route_stats['origin_city'] = route_stats['origin_city'].str.split(',').str[0] #extracting city from city,state
route_stats['destination_city'] = route_stats['destination_city'].str.split(',').str[0]


#calculating the most common city to use as starting point for the group of passengers
origin_counts = route_stats['ORIGIN_AIRPORT_ID'].astype(int).value_counts()
most_common_origin_id = origin_counts.idxmax()

print("Most common origin airport ID:", most_common_origin_id) # it is 12892 (freq of 5) which is LA

# Display the top 20 routes with all the requested details
print(route_stats[['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'origin_city', 'destination_city', 'avg_duration', 'most_common_departure',
                       'most_common_airline','seats_available', 'freq_delay', 'avg_delay']])


Most common origin airport ID: 11292
    ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID     origin_city destination_city  \
0               10299            11630       Anchorage        Fairbanks   
1               10299            12523       Anchorage           Juneau   
2               10721            11278          Boston       Washington   
3               10721            12953          Boston         New York   
4               11292            10397          Denver          Atlanta   
..                ...              ...             ...              ...   
57              14771            14679   San Francisco        San Diego   
58              14771            14747   San Francisco          Seattle   
59              14771            14908   San Francisco        Santa Ana   
60              14869            10397  Salt Lake City          Atlanta   
61              14869            12266  Salt Lake City          Houston   

    avg_duration  most_common_departure  most_common_airline  

In [29]:
"""
produces a dictionary for each of the origin-city pairs including all stats. also generated a randomized
number of the same flight departing at different times to simulate real life.
"""

import random
simulated_dict = {}

for _, row in route_stats.iterrows():

    # Key base like "New York-Los Angeles"
    base_key = f"{row['origin_city']}-{row['destination_city']}"

    # Random number of simulated flights (e.g. between 1 and 5)
    num_simulated_flights = random.randint(1, 5)

    for i in range(1, num_simulated_flights + 1):

        # Random departure time in HHMM (0000–2359 but realistic ranges optional)
        allowed_times = []

        for hour in range(24):          # 0 → 23
            for minute in [0, 30]:      # :00 and :30
                time_value = hour * 100 + minute
                allowed_times.append(time_value)


        random_time = random.choice(allowed_times)
        # Random seats available (exponential)
        min_seats = 1
        max_seats = 5
        scale = 0.5  # smaller scale → more skewed toward larger numbers

        # Generate one random seat number
        rand_val = np.random.exponential(scale=scale)
        # Normalize and scale to min-max range
        random_seats = int(max_seats - (rand_val / (rand_val + 1)) * (max_seats - min_seats))
        random_seats = max(min_seats, min(random_seats, max_seats))

        # Build key with -1, -2, -3 suffix
        key = f"{base_key}-{i}"

        # Store all data exactly like your structure
        simulated_dict[key] = [
            row['origin_city'],
            row['destination_city'],
            row['ORIGIN_AIRPORT_ID'],
            row['DEST_AIRPORT_ID'],
            row['avg_duration'],
            row['freq_delay'],
            row['avg_delay'],
            random_time,
            row['most_common_airline'],
            random_seats
        ]

dictionary_df = pd.DataFrame(simulated_dict, index = ['origin_city', 'destination_city','origin_airport', 'destination_airport', 'expected_duration', 'freq_delay', 'avg_delay','departure(HHMM)', 'airline', 'num_seats'])
dictionary_df



Unnamed: 0,Anchorage-Fairbanks-1,Anchorage-Juneau-1,Anchorage-Juneau-2,Anchorage-Juneau-3,Anchorage-Juneau-4,Boston-Washington-1,Boston-New York-1,Denver-Atlanta-1,Denver-Atlanta-2,Denver-Las Vegas-1,...,San Francisco-San Diego-2,San Francisco-San Diego-3,San Francisco-San Diego-4,San Francisco-Seattle-1,San Francisco-Santa Ana-1,Salt Lake City-Atlanta-1,Salt Lake City-Atlanta-2,Salt Lake City-Houston-1,Salt Lake City-Houston-2,Salt Lake City-Houston-3
origin_city,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Boston,Boston,Denver,Denver,Denver,...,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City
destination_city,Fairbanks,Juneau,Juneau,Juneau,Juneau,Washington,New York,Atlanta,Atlanta,Las Vegas,...,San Diego,San Diego,San Diego,Seattle,Santa Ana,Atlanta,Atlanta,Houston,Houston,Houston
origin_airport,10299,10299,10299,10299,10299,10721,10721,11292,11292,11292,...,14771,14771,14771,14771,14771,14869,14869,14869,14869,14869
destination_airport,11630,12523,12523,12523,12523,11278,12953,10397,10397,12889,...,14679,14679,14679,14747,14908,10397,10397,12266,12266,12266
expected_duration,59.925075,99.441055,99.441055,99.441055,99.441055,102.933923,77.132365,169.913525,169.913525,115.340555,...,95.171284,95.171284,95.171284,129.926744,92.40545,213.182857,213.182857,180.664073,180.664073,180.664073
freq_delay,0.264935,0.246305,0.246305,0.246305,0.246305,0.253081,0.219434,0.461463,0.461463,0.484563,...,0.438454,0.438454,0.438454,0.430967,0.350959,0.43115,0.43115,0.279802,0.279802,0.279802
avg_delay,29.121849,30.77,30.77,30.77,30.77,55.96072,55.845748,30.557569,30.557569,36.106195,...,39.526297,39.526297,39.526297,36.560111,39.483333,30.7,30.7,48.057878,48.057878,48.057878
departure(HHMM),830,400,330,400,1830,630,230,2100,330,1400,...,500,630,630,2100,930,1300,1430,230,2000,0
airline,19930,19930,19930,19930,19930,20409,19790,19790,19790,19393,...,19930,19930,19930,19930,19930,19790,19790,19977,19977,19977
num_seats,4,3,4,3,4,3,4,3,4,4,...,4,4,3,3,4,2,3,4,3,4


In [26]:
distinct_routes = (
    dictionary_df.loc[['origin_city', 'destination_city']]
    .T
    .drop_duplicates()
    .reset_index(drop=True)
)

distinct_routes

Unnamed: 0,origin_city,destination_city
0,Anchorage,Fairbanks
1,Anchorage,Juneau
2,Boston,Washington
3,Boston,New York
4,Denver,Atlanta
...,...,...
57,San Francisco,San Diego
58,San Francisco,Seattle
59,San Francisco,Santa Ana
60,Salt Lake City,Atlanta


In [34]:
"""

starting here, im trying to build an output that takes the feasible_routes dictionary for each destination that
has the flight stats so i can use those as options in my optimization problem. the first step here is to convert departure
and arrival times to minutes
"""
def hhmm_to_minutes(hhmm):
    hh = hhmm // 100
    mm = hhmm % 100
    return hh*60 + mm

def minutes_to_hhmm(minutes):
    minutes = int(minutes) % (24*60)
    hh = minutes // 60
    mm = minutes % 60
    return hh*100 + mm

dictionary_df = dictionary_df.T

# Convert rows from strings to proper types if needed
dictionary_df['expected_duration'] = dictionary_df['expected_duration'].astype(float)
dictionary_df['departure(HHMM)'] = dictionary_df['departure(HHMM)'].astype(int)
dictionary_df['freq_delay'] = dictionary_df['freq_delay'].astype(float)
dictionary_df['avg_delay'] = dictionary_df['avg_delay'].astype(float)
dictionary_df['origin_airport'] = dictionary_df['origin_airport'].astype(int)
dictionary_df['destination_airport'] = dictionary_df['destination_airport'].astype(int)
dictionary_df['num_seats'] = dictionary_df['num_seats'].astype(int)


dictionary_df['dep_minutes'] = dictionary_df['departure(HHMM)'].apply(hhmm_to_minutes)
dictionary_df['arr_minutes'] = dictionary_df['dep_minutes'] + dictionary_df['expected_duration']

In [35]:
""""
this code does the making of flight options that involve multiple legs, checking feasibility (ie. the min layover is 30 min)
"""

# Start city and airport
start_city = "Los Angeles"
start_airport = 12892

# Minimum layover in minutes
min_layover = 30

# Create dictionary mapping origin city to all flights leaving that city. example flights_from_city['LosAngeles'] gives all flights
#leaving from LA
flights_from_city = {}
for _, row in dictionary_df.iterrows():
    flights_from_city.setdefault(row['origin_city'], []).append(row)

# Helper function to compute total frequency and average delay of a route
def aggregate_route(route_flights):
    total_expected_duration = route_flights[-1]['arr_minutes'] - route_flights[0]['dep_minutes'] #time from dep. of flight 1 to arrival of last flight, doesn't factor in delays
    avg_freq_delay = sum(f['freq_delay'] for f in route_flights) / len(route_flights)
    avg_delay = sum(f['avg_delay'] for f in route_flights) / len(route_flights)
    return total_expected_duration, avg_freq_delay, avg_delay

# Recursive function to find all feasible routes from LA to a destination
def find_routes(current_city, destination_city, visited=set(), current_route=[]):
  """
  Goal: find all feasible routes from current_city to destination_city.

  How it works:
  1. Look up all flights leaving current_city.
  2. Avoid cycles: don’t revisit cities in the same route (visited).
  3. If this is a connecting flight, check that the layover is at least 30 minutes.
  4. If the next flight reaches destination_city, add this complete route to results.
  5. Otherwise, recurse to explore flights from next_city.

  This is effectively doing a depth-first search of all possible flight paths from LA to the destination.
  """
    if current_city not in flights_from_city:
        return []

    routes = []
    for flight in flights_from_city[current_city]:
        # Avoid cycles
        next_city = flight['destination_city']
        if next_city in visited:
            continue

        # Check layover if this is a connecting flight
        if current_route:
            prev_flight = current_route[-1]
            if flight['dep_minutes'] < prev_flight['arr_minutes'] + min_layover:
                continue

        new_route = current_route + [flight]
        if next_city == destination_city:
            routes.append(new_route)
        else:
            routes.extend(find_routes(next_city, destination_city, visited | {next_city}, new_route))
    return routes

# Example: generate routes from LA to all other cities
all_destinations = set(dictionary_df['destination_city']) - {start_city}
all_routes = {}

for dest in all_destinations:
    feasible_routes = find_routes(start_city, dest) #contains all valid paths from LA to that destination.
    summarized_routes = []
    for route in feasible_routes:
        total_duration, freq_delay, avg_delay = aggregate_route(route)
        summarized_routes.append({
            'num_legs': len(route),
            #legs is a list of [origin, destination, departure HHMM, arrival HHMM for each flight]
            'legs': [(f['origin_city'], f['destination_city'], f['departure(HHMM)'], minutes_to_hhmm(f['arr_minutes'])) for f in route],
            'total_expected_duration': total_duration,
            'avg_freq_delay': freq_delay,
            'avg_delay': avg_delay,
            'num_seats': route[0]['num_seats']  # assuming same for all legs
        })
    all_routes[dest] = summarized_routes

# Example: print routes to Seattle
import pprint
pprint.pprint(all_routes['Seattle'])

[{'avg_delay': 36.44010125613679,
  'avg_freq_delay': 0.4058059838548061,
  'legs': [('Los Angeles', 'Denver', 130, 347),
           ('Denver', 'Las Vegas', 1400, 1555),
           ('Las Vegas', 'Seattle', 1930, 2210)],
  'num_legs': 3,
  'num_seats': 4,
  'total_expected_duration': 1240.6671207992733},
 {'avg_delay': 36.44010125613679,
  'avg_freq_delay': 0.4058059838548061,
  'legs': [('Los Angeles', 'Denver', 130, 347),
           ('Denver', 'Las Vegas', 1400, 1555),
           ('Las Vegas', 'Seattle', 1730, 2010)],
  'num_legs': 3,
  'num_seats': 4,
  'total_expected_duration': 1120.6671207992733},
 {'avg_delay': 34.576346610939105,
  'avg_freq_delay': 0.39703089174894113,
  'legs': [('Los Angeles', 'Denver', 130, 347),
           ('Denver', 'Seattle', 2200, 55)],
  'num_legs': 2,
  'num_seats': 4,
  'total_expected_duration': 1405.776962252846},
 {'avg_delay': 34.576346610939105,
  'avg_freq_delay': 0.39703089174894113,
  'legs': [('Los Angeles', 'Denver', 130, 347),
           ('

In [36]:
"""
turns the previous output into a dataframe for easy viewing and manipulation
"""


# Prepare a list of rows for the DataFrame
rows = []

for dest, routes in all_routes.items():
    for route in routes:
        rows.append({
            'destination_city': dest,
            'num_legs': route['num_legs'],
            'legs': route['legs'],  # list of tuples (origin, dest, dep, arr)
            'total_expected_duration': route['total_expected_duration'],
            'avg_freq_delay': route['avg_freq_delay'],
            'avg_delay': route['avg_delay'],
            'num_seats': route['num_seats']
        })

# Create the DataFrame
routes_df = pd.DataFrame(rows)

# Optional: sort by total duration or number of legs
routes_df = routes_df.sort_values(['num_legs', 'total_expected_duration']).reset_index(drop=True)

# Show example
print(routes_df.head(10))

  destination_city  num_legs                                        legs  \
0        Las Vegas         1      [(Los Angeles, Las Vegas, 1330, 1443)]   
1        Las Vegas         1        [(Los Angeles, Las Vegas, 530, 643)]   
2        Las Vegas         1      [(Los Angeles, Las Vegas, 1500, 1613)]   
3    San Francisco         1  [(Los Angeles, San Francisco, 1500, 1627)]   
4    San Francisco         1   [(Los Angeles, San Francisco, 900, 1027)]   
5    San Francisco         1     [(Los Angeles, San Francisco, 30, 157)]   
6    San Francisco         1  [(Los Angeles, San Francisco, 1800, 1927)]   
7           Denver         1           [(Los Angeles, Denver, 130, 347)]   
8          Seattle         1        [(Los Angeles, Seattle, 1930, 2217)]   
9          Seattle         1        [(Los Angeles, Seattle, 1230, 1517)]   

   total_expected_duration  avg_freq_delay  avg_delay  num_seats  
0                73.013380        0.393589  39.975113          4  
1                73.013380   

In [39]:
# Filter routes with 2 legs
two_leg_routes = routes_df[routes_df['num_legs'] == 2]

# Show first 10 routes with 2 legs
print(two_leg_routes.head(10))

   destination_city  num_legs  \
16        San Diego         2   
17        San Diego         2   
18        Las Vegas         2   
19         Portland         2   
20        San Diego         2   
21            Boise         2   
22         Portland         2   
23          Seattle         2   
24          Seattle         2   
25          Seattle         2   

                                                 legs  \
16  [(Los Angeles, Las Vegas, 1330, 1443), (Las Ve...   
17  [(Los Angeles, Las Vegas, 1500, 1613), (Las Ve...   
18  [(Los Angeles, San Francisco, 30, 157), (San F...   
19  [(Los Angeles, Seattle, 1400, 1647), (Seattle,...   
20  [(Los Angeles, Las Vegas, 1330, 1443), (Las Ve...   
21  [(Los Angeles, Seattle, 1930, 2217), (Seattle,...   
22  [(Los Angeles, Seattle, 930, 1217), (Seattle, ...   
23  [(Los Angeles, San Francisco, 1800, 1927), (Sa...   
24  [(Los Angeles, Las Vegas, 1330, 1443), (Las Ve...   
25  [(Los Angeles, Las Vegas, 1500, 1613), (Las Ve...   

    tota

In [37]:
"""
expanded the previous dataframe to show more details of each leg of the route
"""
# Determine the maximum number of legs in any route
max_legs = routes_df['num_legs'].max()

# Prepare rows for the expanded DataFrame
expanded_rows = []

for _, row in routes_df.iterrows():
    expanded_row = {
        'destination_city': row['destination_city'],
        'num_legs': row['num_legs'],
        'total_expected_duration': row['total_expected_duration'],
        'avg_freq_delay': row['avg_freq_delay'],
        'avg_delay': row['avg_delay'],
        'num_seats': row['num_seats']
    }

    # Add each leg as separate columns
    for i, leg in enumerate(row['legs'], start=1):
        expanded_row[f'leg{i}_origin'] = leg[0]
        expanded_row[f'leg{i}_destination'] = leg[1]
        expanded_row[f'leg{i}_dep'] = leg[2]
        expanded_row[f'leg{i}_arr'] = leg[3]

    # Fill missing legs with None
    for i in range(len(row['legs']) + 1, max_legs + 1):
        expanded_row[f'leg{i}_origin'] = None
        expanded_row[f'leg{i}_destination'] = None
        expanded_row[f'leg{i}_dep'] = None
        expanded_row[f'leg{i}_arr'] = None

    expanded_rows.append(expanded_row)

# Create the expanded DataFrame
expanded_routes_df = pd.DataFrame(expanded_rows)

# Show example
print(expanded_routes_df.head(10))


  destination_city  num_legs  total_expected_duration  avg_freq_delay  \
0        Las Vegas         1                73.013380        0.393589   
1        Las Vegas         1                73.013380        0.393589   
2        Las Vegas         1                73.013380        0.393589   
3    San Francisco         1                87.731371        0.395204   
4    San Francisco         1                87.731371        0.395204   
5    San Francisco         1                87.731371        0.395204   
6    San Francisco         1                87.731371        0.395204   
7           Denver         1               137.191554        0.332466   
8          Seattle         1               167.728560        0.295663   
9          Seattle         1               167.728560        0.295663   

   avg_delay  num_seats  leg1_origin leg1_destination  leg1_dep  leg1_arr  \
0  39.975113          4  Los Angeles        Las Vegas      1330      1443   
1  39.975113          4  Los Angeles      

In [10]:
"""
making passenger df with passenger preferences
"""
def generate_passengers(n_passengers, dictionary_df):

    passengers = []

    # extract valid airports from your simulated dictionary
    valid_destinations = dictionary_df.loc['destination_city'].tolist()

    # random grouping (families)
    # groups of size 1–4
    group_sizes = []
    remaining = n_passengers

    while remaining > 0:
        size = random.randint(1, min(4, remaining))
        group_sizes.append(size)
        remaining -= size

    group_id = 1
    passenger_id = 1

    for size in group_sizes:
        # pick shared destination and latest arrival for this group
        group_destination = random.choice(valid_destinations)
        group_latest_arrival = random.choice([h*100 + m for h in range(10,48) for m in [0,30]])

        for _ in range(size):

            passenger = {}
            passenger['passenger_id'] = passenger_id
            passenger['group_id'] = group_id

            # random origin/destination
            passenger['origin_airport'] = 'Los Angeles'
            passenger['destination_airport'] = group_destination
            passenger['latest_arrival'] = group_latest_arrival

            # airline preference (20% chance of having one)
            airline_pref = random.choice(dictionary_df.loc['airline'].tolist())
            passenger['airline_pref'] = airline_pref if random.random() < 0.2 else None

            passengers.append(passenger)
            passenger_id += 1

        group_id += 1

    return pd.DataFrame(passengers)

passengers_df = generate_passengers(10, dictionary_df)
print(passengers_df)

   passenger_id  group_id origin_airport destination_airport  latest_arrival  \
0             1         1    Los Angeles             Spokane            2030   
1             2         1    Los Angeles             Spokane            2030   
2             3         2    Los Angeles      Salt Lake City            1700   
3             4         2    Los Angeles      Salt Lake City            1700   
4             5         2    Los Angeles      Salt Lake City            1700   
5             6         3    Los Angeles           Las Vegas            3200   
6             7         4    Los Angeles           Las Vegas            3730   
7             8         4    Los Angeles           Las Vegas            3730   
8             9         4    Los Angeles           Las Vegas            3730   
9            10         4    Los Angeles           Las Vegas            3730   

   airline_pref  
0           NaN  
1           NaN  
2           NaN  
3       19393.0  
4           NaN  
5          

In [16]:
import cvxpy as cp
"""
optimization problem (NOT UPDATED WITH THE NEW SET OF FLIGHTS)
"""

passengers = passengers_df['passenger_id'].tolist()
flights = dictionary_df.columns.tolist()

num_passengers = len(passengers)
num_flights = len(flights)

# Decision variable: x[p, f] = 1 if passenger p assigned to flight f
x = cp.Variable((num_passengers, num_flights), boolean=True)

# Objective: minimize expected delay = freq_delay * avg_delay
freq_delay = np.array([dictionary_df[f]['freq_delay'] for f in flights])
avg_delay = np.array([dictionary_df[f]['avg_delay'] for f in flights])
objective = cp.Minimize(cp.sum(cp.multiply(x, freq_delay * avg_delay)))

constraints = []

# --- Airline preference constraint ---
# for p_idx, p in enumerate(passengers):
#     airline_pref = passengers_df.loc[passengers_df['passenger_id']==p, 'airline_pref'].values[0]
#     if airline_pref is not None:
#         allowed_flights = [f_idx for f_idx, f in enumerate(flights) if dictionary_df[f]['airline'] == airline_pref]
#         if len(allowed_flights) == 0:
#             print(f"Passenger {p} has airline_pref {airline_pref} but no flights match!")
#         else:
#             # block all flights not in allowed_flights
#             for f_idx in range(num_flights):
#                 if f_idx not in allowed_flights:
#                     constraints.append(x[p_idx, f_idx] == 0)

# --- Ensure every passenger has at least one flight ---
for p_idx in range(num_passengers):
    constraints.append(cp.sum(x[p_idx, :]) >= 1)

# --- Seat availability constraint ---
# for f_idx, f in enumerate(flights):
#     seats = dictionary_df[f]['num_seats']
#     constraints.append(cp.sum(x[:, f_idx]) <= seats)

# --- Group assignment constraint ---
groups = passengers_df['group_id'].unique()
for g in groups:
    group_passengers = passengers_df.loc[passengers_df['group_id']==g].index.tolist()
    if len(group_passengers) > 1:
        y = cp.Variable(num_flights, boolean=True)
        for f_idx in range(num_flights):
            for p_idx in group_passengers:
                constraints.append(x[p_idx, f_idx] == y[f_idx])
        constraints.append(cp.sum(y) == 1)

# --- Multi-leg / connection buffer ---
# dep_times = np.array([dictionary_df[f]['departure(HHMM)'] for f in flights])
# arr_times = dep_times + np.array([dictionary_df[f]['expected_duration'] for f in flights])
# for p_idx in range(num_passengers):
#     for f1 in range(num_flights):
#         for f2 in range(num_flights):
#             if arr_times[f1] + 60 > dep_times[f2]:
#                 constraints.append(x[p_idx, f1] + x[p_idx, f2] <= 1)

#--- Final flight reaches destination ---
# for p_idx, p in enumerate(passengers):
#     dest = passengers_df.loc[passengers_df['passenger_id']==p, 'destination_airport'].values[0]
#     flight_matches = [f_idx for f_idx, f in enumerate(flights) if dictionary_df[f]['destination_airport']==dest]
#     if len(flight_matches) == 0:
#         print(f"No flights available to destination {dest} for passenger {p}!")
#     else:
#         constraints.append(cp.sum(x[p_idx, flight_matches]) >= 1)

# --- Solve MILP ---
prob = cp.Problem(objective, constraints)
prob.solve(solver=cp.GLPK_MI)

print("Solver status:", prob.status)
if prob.status == 'optimal':
    print("Objective value:", prob.value)
    # Extract assignments
    assignment_matrix = x.value
    assigned_pairs = []
    for p_idx, p in enumerate(passengers):
        for f_idx, f in enumerate(flights):
            if assignment_matrix[p_idx, f_idx] > 0.5:
                assigned_pairs.append((p, f))
    assignments_df = pd.DataFrame(assigned_pairs, columns=['passenger_id', 'flight'])
    print(assignments_df)
else:
    print("Problem is infeasible or unbounded. Please check constraints.")


No flights available to destination Spokane for passenger 1!
No flights available to destination Spokane for passenger 2!
No flights available to destination Salt Lake City for passenger 3!
No flights available to destination Salt Lake City for passenger 4!
No flights available to destination Salt Lake City for passenger 5!
No flights available to destination Las Vegas for passenger 6!
No flights available to destination Las Vegas for passenger 7!
No flights available to destination Las Vegas for passenger 8!
No flights available to destination Las Vegas for passenger 9!
No flights available to destination Las Vegas for passenger 10!




Solver status: optimal
Objective value: 75.78817733990147
   passenger_id              flight
0             1  Anchorage-Juneau-2
1             2  Anchorage-Juneau-2
2             3  Anchorage-Juneau-1
3             4  Anchorage-Juneau-1
4             5  Anchorage-Juneau-1
5             6  Anchorage-Juneau-2
6             7  Anchorage-Juneau-1
7             8  Anchorage-Juneau-1
8             9  Anchorage-Juneau-1
9            10  Anchorage-Juneau-1
