In [None]:
!pip install pandas
!pip install numpy
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install gdown




In [14]:

# Import portion of a package
import matplotlib.pyplot as plt  # Most common visualization package that a lot of others are based on

# Import full packages under custom name
import numpy as np  # Common package for numerical methods
import pandas as pd  # Common package for data storeage/manipulation
import seaborn as sns  # Common package for statistical visualizations

# Import portion of a package
import scipy.stats as stats
from sklearn.impute import SimpleImputer as Imputer  # Specific function from common machine learning package\

#more packages
import gdown


In [15]:


# File ID from your link (assuming this is the cleaned dataset)
file_id = "13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r"
url = f"https://drive.google.com/uc?id={file_id}"

# Download the file
gdown.download(url, "flight_data.csv", quiet=False)

# Now load it
flight_data = pd.read_csv("flight_data.csv")
print(flight_data.shape)
print(flight_data.head())

Downloading...
From (original): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r
From (redirected): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r&confirm=t&uuid=685cab3e-19c7-4722-a662-f13853aeccf7
To: /content/flight_data.csv
100%|██████████| 1.32G/1.32G [00:14<00:00, 90.4MB/s]


(7546988, 32)
   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_

In [3]:

df = flight_data.copy()
print (df.head())
print(df.columns.tolist())

   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_ELAPSED_TIME  

In [None]:
"""
minimize sum of (epsilon?) (frequency * average * passengers)
st.
    ...

optimization problem in words
- frequency = probability of flight being delayed
- average = expected delay time (in minutes)
- passengers = number of passengers taking that flight
- possible constraints for passengers:
  - arrival/depaarture time being between certain times
  - flight duration being shorter than x hours
  - groups of passengers staying together (ie families)
  - max number of extra seats on flights
  - need to get to destination by a certain time
  - airline?
  - any connecting flights must have at least a 1 hour buffer for connecting purposes


info needed about the set of flights:
- number of seats available (this is a made up number)
- frequency
- average
- arrival/departure time/day/month/year
- flight duration
- destination/origin airport/city
- airline

"""

In [4]:
#calculating average and frequency of delay

# Boolean series: True if there is a delay
def compute_delay_stats(group):
    delayed = group['DEP_DELAY_NEW'] > 0

    # Probability of delay
    prob_delay = delayed.sum() / len(group)

    # Average delay (only for delayed rows)
    if delayed.sum() > 0:
        avg_delay = group.loc[delayed, 'DEP_DELAY_NEW'].mean()
    else:
        avg_delay = 0.0

    return pd.Series({'freq_delay': prob_delay, 'avg_delay': avg_delay})

# Group by origin and destination airport to calculate delay stats
delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()

# Display
print(delay_stats.head())



   ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID  freq_delay  avg_delay
0              10135            10397    0.259587  51.750000
1              10135            10693    0.247619  45.307692
2              10135            11057    0.282161  76.173252
3              10135            11292    0.255814  51.181818
4              10135            11697    0.283951  38.652174


  delay_stats = df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).apply(compute_delay_stats).reset_index()


In [28]:
#doing a network of flights instead of top 20
start_airport = 12892  # LA

# --- First leg: LA -> top 5 destinations ---
from_LA = df[df['ORIGIN_AIRPORT_ID'] == start_airport]
top5_first_leg = from_LA['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
print("Top 5 destinations from LA:", top5_first_leg)

# --- Second leg: from each first leg destination, top 5 destinations (excluding LA) ---
second_leg = {}
second_leg_airports = set()
for origin in top5_first_leg:
    flights_from_origin = df[df['ORIGIN_AIRPORT_ID'] == origin]
    top5 = flights_from_origin[flights_from_origin['DEST_AIRPORT_ID'] != start_airport] \
            ['DEST_AIRPORT_ID'].value_counts().head(5).index.tolist()
    second_leg[origin] = top5
    second_leg_airports.update(top5)

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} -> {dests}")

# --- Third leg: from second leg destinations, top 2 destinations (excluding first leg, second leg, and return) ---
third_leg = {}
first_leg_set = set(top5_first_leg)

for origin, dests in second_leg.items():
    for dest in dests:
        flights_from_dest = df[df['ORIGIN_AIRPORT_ID'] == dest]

        # exclude first leg airports, all second leg airports, the immediate origin, AND Los Angeles
        exclude_set = first_leg_set | second_leg_airports | {dest, start_airport}

        # pick top 2 *after* removing Los Angeles
        top2 = (
            flights_from_dest[
                ~flights_from_dest['DEST_AIRPORT_ID'].isin(exclude_set)
            ]['DEST_AIRPORT_ID']
            .value_counts()
            .head(2)
            .index
            .tolist()
        )

        third_leg[dest] = top2

print("\nThird leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):")
for origin, dests in third_leg.items():
    print(f"{origin} -> {dests}")


Top 5 destinations from LA: [14771, 12889, 12478, 11292, 14747]

Second leg top 5 destinations from each first leg destination:
14771 -> [14747, 12478, 14679, 12889, 11292]
12889 -> [11292, 14107, 14679, 14747, 11298]
12478 -> [14771, 10721, 13303, 13204, 14492]
11292 -> [14107, 12889, 14869, 14747, 13930]
14747 -> [10299, 14057, 14107, 11292, 12889]

Third leg top 2 destinations from second leg (excluding first leg, second leg, and return trips):
14747 -> [11884, 10713]
12478 -> [11697, 14843]
14679 -> [14831, 14893]
12889 -> [14893, 10397]
11292 -> [13487, 10397]
14107 -> [14908, 10397]
11298 -> [12266, 10397]
14771 -> [14908, 11618]
10721 -> [11278, 12953]
13303 -> [10397, 12953]
13204 -> [10397, 11618]
14492 -> [10397, 12953]
14869 -> [10397, 12266]
13930 -> [12953, 11278]
10299 -> [11630, 12523]
14057 -> [14831, 14893]


In [29]:
airport_to_city = (
    df[['ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME']]
    .drop_duplicates()
    .assign(ORIGIN_CITY_NAME=lambda x: x['ORIGIN_CITY_NAME'].str.split(',').str[0])
    .set_index('ORIGIN_AIRPORT_ID')['ORIGIN_CITY_NAME']
    .to_dict()
)


def id_to_city(airport_id):
    return airport_to_city.get(airport_id, f"Unknown({airport_id})")

def list_to_cities(id_list):
    return [id_to_city(i) for i in id_list]

print("Top 5 destinations from LA:")
for a in top5_first_leg:
    print(a, "->", id_to_city(a))

print("\nSecond leg top 5 destinations from each first leg destination:")
for origin, dests in second_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")

print("\nThird leg top 2 destinations from second leg:")
for origin, dests in third_leg.items():
    print(f"{origin} ({id_to_city(origin)}) -> {list_to_cities(dests)}")


Top 5 destinations from LA:
14771 -> San Francisco
12889 -> Las Vegas
12478 -> New York
11292 -> Denver
14747 -> Seattle

Second leg top 5 destinations from each first leg destination:
14771 (San Francisco) -> ['Seattle', 'New York', 'San Diego', 'Las Vegas', 'Denver']
12889 (Las Vegas) -> ['Denver', 'Phoenix', 'San Diego', 'Seattle', 'Dallas/Fort Worth']
12478 (New York) -> ['San Francisco', 'Boston', 'Miami', 'Orlando', 'Raleigh/Durham']
11292 (Denver) -> ['Phoenix', 'Las Vegas', 'Salt Lake City', 'Seattle', 'Chicago']
14747 (Seattle) -> ['Anchorage', 'Portland', 'Phoenix', 'Denver', 'Las Vegas']

Third leg top 2 destinations from second leg:
14747 (Seattle) -> ['Spokane', 'Boise']
12478 (New York) -> ['Fort Lauderdale', 'San Juan']
14679 (San Diego) -> ['San Jose', 'Sacramento']
12889 (Las Vegas) -> ['Sacramento', 'Atlanta']
11292 (Denver) -> ['Minneapolis', 'Atlanta']
14107 (Phoenix) -> ['Santa Ana', 'Atlanta']
11298 (Dallas/Fort Worth) -> ['Houston', 'Atlanta']
14771 (San Francisc

In [None]:
"""
this section lists all possible destinations for the optimization problem
"""
# Collect all city names from each leg
possible_destinations = set()

# --- First leg ---
for a in top5_first_leg:
    possible_destinations.add(id_to_city(a))

# --- Second leg ---
for origin, dests in second_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# --- Third leg ---
for origin, dests in third_leg.items():
    possible_destinations.add(id_to_city(origin))
    for d in dests:
        possible_destinations.add(d)

# Convert airport IDs in second/third leg to city names where needed
possible_destinations = {id_to_city(c) if isinstance(c, int) else c
                         for c in possible_destinations}

# Turn into a sorted list
possible_destinations = sorted(possible_destinations)

print(possible_destinations)
print("Number of unique cities:", len(possible_destinations))

In [32]:

# --- Gather all origin-destination pairs from 3 legs ---
# First leg pairs
first_leg_pairs = [(id_to_city(start_airport), id_to_city(dest)) for dest in top5_first_leg]

# Second leg pairs
second_leg_pairs = []
for origin_airport, dest_airports in second_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        second_leg_pairs.append((origin_city, dest_city))

# Third leg pairs
third_leg_pairs = []
for origin_airport, dest_airports in third_leg.items():
    origin_city = id_to_city(origin_airport)
    for dest_airport in dest_airports:
        dest_city = id_to_city(dest_airport)  # convert to city
        third_leg_pairs.append((origin_city, dest_city))

# Combine all pairs
all_pairs = set(first_leg_pairs + second_leg_pairs + third_leg_pairs)

print(all_pairs)


{('Denver', 'Chicago'), ('Seattle', 'Portland'), ('Raleigh/Durham', 'New York'), ('Las Vegas', 'Phoenix'), ('Los Angeles', 'Las Vegas'), ('Seattle', 'Las Vegas'), ('Denver', 'Atlanta'), ('San Francisco', 'New York'), ('New York', 'Orlando'), ('Denver', 'Salt Lake City'), ('New York', 'Miami'), ('Las Vegas', 'Dallas/Fort Worth'), ('Portland', 'Sacramento'), ('Anchorage', 'Juneau'), ('Boston', 'New York'), ('Los Angeles', 'Seattle'), ('Dallas/Fort Worth', 'Houston'), ('Denver', 'Minneapolis'), ('Las Vegas', 'Sacramento'), ('New York', 'San Francisco'), ('Orlando', 'Newark'), ('Chicago', 'Washington'), ('San Francisco', 'Santa Ana'), ('Seattle', 'Anchorage'), ('Denver', 'Las Vegas'), ('Seattle', 'Phoenix'), ('Raleigh/Durham', 'Atlanta'), ('Seattle', 'Spokane'), ('Los Angeles', 'San Francisco'), ('Dallas/Fort Worth', 'Atlanta'), ('San Francisco', 'San Diego'), ('Los Angeles', 'New York'), ('Denver', 'Seattle'), ('San Francisco', 'Denver'), ('New York', 'Fort Lauderdale'), ('Las Vegas', 'At

In [37]:
"""
this part shows all feasible routes starting from LA to each destination in the all pairs network
"""
from collections import defaultdict, deque

# Create adjacency list from all_pairs
graph = defaultdict(list)
for origin, dest in all_pairs:
    graph[origin].append(dest)

start_city = "Los Angeles"

# Dictionary to store feasible routes: destination -> set of tuples (routes)
feasible_routes = defaultdict(set)

# BFS-like traversal for up to 3 legs (direct + 2 connections)
max_legs = 3
queue = deque()
queue.append( ([start_city], 0) )  # (current_route_list, current_depth)

while queue:
    route, depth = queue.popleft()
    current_city = route[-1]

    if depth > 0:  # exclude the starting city itself
        dest = current_city
        feasible_routes[dest].add(tuple(route))

    if depth < max_legs:
        for neighbor in graph[current_city]:
            # avoid cycles in the route
            if neighbor not in route:
                queue.append((route + [neighbor], depth + 1))

# Example: print feasible routes to Seattle
for dest, routes in feasible_routes.items():
    print(dest, routes)


Las Vegas {('Los Angeles', 'Denver', 'Seattle', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Denver', 'Las Vegas'), ('Los Angeles', 'San Francisco', 'Seattle', 'Las Vegas'), ('Los Angeles', 'Las Vegas'), ('Los Angeles', 'Seattle', 'Las Vegas'), ('Los Angeles', 'New York', 'San Francisco', 'Las Vegas'), ('Los Angeles', 'Seattle', 'Denver', 'Las Vegas'), ('Los Angeles', 'Denver', 'Las Vegas')}
Seattle {('Los Angeles', 'Las Vegas', 'Seattle'), ('Los Angeles', 'Las Vegas', 'Denver', 'Seattle'), ('Los Angeles', 'Denver', 'Las Vegas', 'Seattle'), ('Los Angeles', 'Denver', 'Seattle'), ('Los Angeles', 'New York', 'San Francisco', 'Seattle'), ('Los Angeles', 'San Francisco', 'Las Vegas', 'Seattle'), ('Los Angeles', 'San Francisco', 'Denver', 'Seattle'), ('Los Angeles', 'Seattle'), ('Los Angeles', 'San Francisco', 'Seattle')}
San Francisco {('Los Angeles', 'New York', 'San Francisco'), ('Los Angeles', 'San Francisco')}
New York {('Los Angeles', '

In [34]:
# First, map airport IDs to city names in the DataFrame
df['ORIGIN_CITY'] = df['ORIGIN_AIRPORT_ID'].map(airport_to_city)
df['DEST_CITY']   = df['DEST_AIRPORT_ID'].map(airport_to_city)

# --- Mask for first leg ---
first_leg_mask = df[['ORIGIN_CITY', 'DEST_CITY']].apply(
    lambda row: (row['ORIGIN_CITY'], row['DEST_CITY']) in first_leg_pairs, axis=1)
first_leg_df = df[first_leg_mask]

# --- Mask for second leg ---
second_leg_mask = df[['ORIGIN_CITY', 'DEST_CITY']].apply(
    lambda row: (row['ORIGIN_CITY'], row['DEST_CITY']) in second_leg_pairs, axis=1)
second_leg_df = df[second_leg_mask]

# --- Mask for third leg ---
third_leg_mask = df[['ORIGIN_CITY', 'DEST_CITY']].apply(
    lambda row: (row['ORIGIN_CITY'], row['DEST_CITY']) in third_leg_pairs, axis=1)
third_leg_df = df[third_leg_mask]

# --- Combine all legs ---
filtered_df = pd.concat([first_leg_df, second_leg_df, third_leg_df]).drop_duplicates()

print(f"Number of flights after filtering: {len(filtered_df)}")

Number of flights after filtering: 442179


In [35]:
#making set of top 20 flights

# Assuming the dataframe is already loaded
# Group by 'ORIGIN_CITY_NAME' and 'DEST_AIRPORT_ID' and count the occurrences
top_routes = filtered_df.groupby(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']).agg(
    origin_city=('ORIGIN_CITY_NAME', 'first'),
    destination_city=('DEST_CITY_NAME', 'first'),
    count=('ORIGIN_CITY_NAME', 'size'),
    avg_duration=('ACTUAL_ELAPSED_TIME', 'mean'),
    most_common_departure=('CRS_DEP_TIME', lambda x: x.mode()[0] if not x.mode().empty else np.nan), #arrival time will be calculated by adding elapsed time to arrival
    most_common_airline=('MKT_CARRIER_AIRLINE_ID', lambda x: x.mode()[0] if not x.mode().empty else np.nan)
).reset_index()

# Add a "made-up" number of seats available (for now I will assume random values, e.g. between 100 and 300)
top_routes['seats_available'] = np.random.randint(1, 6, size=len(top_routes))

# Merge with your top_20_routes dataframe
top_routes = top_routes.merge(delay_stats, on=['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID'], how='left')

top_routes['origin_city'] = top_routes['origin_city'].str.split(',').str[0] #extracting city from city,state
top_routes['destination_city'] = top_routes['destination_city'].str.split(',').str[0]


#calculating the most common city to use as starting point for the group of passengers
origin_counts = top_routes['ORIGIN_AIRPORT_ID'].astype(int).value_counts()
most_common_origin_id = origin_counts.idxmax()

print("Most common origin airport ID:", most_common_origin_id) # it is 12892 (freq of 5) which is LA

# Display the top 20 routes with all the requested details
print(top_routes[['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'origin_city', 'destination_city', 'avg_duration', 'most_common_departure',
                       'most_common_airline','seats_available', 'freq_delay', 'avg_delay']])


Most common origin airport ID: 11292
    ORIGIN_AIRPORT_ID  DEST_AIRPORT_ID     origin_city destination_city  \
0               10299            11630       Anchorage        Fairbanks   
1               10299            12523       Anchorage           Juneau   
2               10721            11278          Boston       Washington   
3               10721            12264          Boston       Washington   
4               10721            12478          Boston         New York   
..                ...              ...             ...              ...   
76              14771            14747   San Francisco          Seattle   
77              14771            14908   San Francisco        Santa Ana   
78              14869            10397  Salt Lake City          Atlanta   
79              14869            12191  Salt Lake City          Houston   
80              14869            12266  Salt Lake City          Houston   

    avg_duration  most_common_departure  most_common_airline  

In [36]:
#dictionary:
#'ORIGIN-DEST-x': [origin airport, dest airport, flight time, departure time, seats available, airline, delay freq,
#delay avg]
#-x because the departure time can be diff if the origin-dest flight has multiple

import random
simulated_dict = {}

for _, row in top_routes.iterrows():

    # Key base like "New York-Los Angeles"
    base_key = f"{row['origin_city']}-{row['destination_city']}"

    # Random number of simulated flights (e.g. between 1 and 5)
    num_simulated_flights = random.randint(1, 5)

    for i in range(1, num_simulated_flights + 1):

        # Random departure time in HHMM (0000–2359 but realistic ranges optional)
        allowed_times = []

        for hour in range(24):          # 0 → 23
            for minute in [0, 30]:      # :00 and :30
                time_value = hour * 100 + minute
                allowed_times.append(time_value)


        random_time = random.choice(allowed_times)
        # Random seats available (exponential)
        min_seats = 1
        max_seats = 5
        scale = 0.5  # smaller scale → more skewed toward larger numbers

        # Generate one random seat number
        rand_val = np.random.exponential(scale=scale)
        # Normalize and scale to min-max range
        random_seats = int(max_seats - (rand_val / (rand_val + 1)) * (max_seats - min_seats))
        random_seats = max(min_seats, min(random_seats, max_seats))

        # Build key with -1, -2, -3 suffix
        key = f"{base_key}-{i}"

        # Store all data exactly like your structure
        simulated_dict[key] = [
            row['origin_city'],
            row['destination_city'],
            row['ORIGIN_AIRPORT_ID'],
            row['DEST_AIRPORT_ID'],
            row['avg_duration'],
            row['freq_delay'],
            row['avg_delay'],
            random_time,
            row['most_common_airline'],
            random_seats
        ]

dictionary_df = pd.DataFrame(simulated_dict, index = ['origin_city', 'destination_city','origin_airport', 'destination_airport', 'expected_duration', 'freq_delay', 'avg_delay','departure(HHMM)', 'airline', 'num_seats'])
dictionary_df



Unnamed: 0,Anchorage-Fairbanks-1,Anchorage-Fairbanks-2,Anchorage-Fairbanks-3,Anchorage-Fairbanks-4,Anchorage-Fairbanks-5,Anchorage-Juneau-1,Anchorage-Juneau-2,Anchorage-Juneau-3,Anchorage-Juneau-4,Boston-Washington-1,...,Salt Lake City-Atlanta-1,Salt Lake City-Atlanta-2,Salt Lake City-Atlanta-3,Salt Lake City-Atlanta-4,Salt Lake City-Atlanta-5,Salt Lake City-Houston-1,Salt Lake City-Houston-2,Salt Lake City-Houston-3,Salt Lake City-Houston-4,Salt Lake City-Houston-5
origin_city,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Anchorage,Boston,...,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City,Salt Lake City
destination_city,Fairbanks,Fairbanks,Fairbanks,Fairbanks,Fairbanks,Juneau,Juneau,Juneau,Juneau,Washington,...,Atlanta,Atlanta,Atlanta,Atlanta,Atlanta,Houston,Houston,Houston,Houston,Houston
origin_airport,10299,10299,10299,10299,10299,10299,10299,10299,10299,10721,...,14869,14869,14869,14869,14869,14869,14869,14869,14869,14869
destination_airport,11630,11630,11630,11630,11630,12523,12523,12523,12523,12264,...,10397,10397,10397,10397,10397,12266,12266,12266,12266,12266
expected_duration,59.925075,59.925075,59.925075,59.925075,59.925075,99.441055,99.441055,99.441055,99.441055,97.072364,...,213.182857,213.182857,213.182857,213.182857,213.182857,180.664073,180.664073,180.664073,180.664073,180.664073
freq_delay,0.264935,0.264935,0.264935,0.264935,0.264935,0.246305,0.246305,0.246305,0.246305,0.309638,...,0.43115,0.43115,0.43115,0.43115,0.43115,0.279802,0.279802,0.279802,0.279802,0.279802
avg_delay,29.121849,29.121849,29.121849,29.121849,29.121849,30.77,30.77,30.77,30.77,32.284768,...,30.7,30.7,30.7,30.7,30.7,48.057878,48.057878,48.057878,48.057878,48.057878
departure(HHMM),1330,230,2230,1000,500,1130,800,830,2330,1730,...,2330,300,300,1200,2300,1030,1330,2000,700,400
airline,19930,19930,19930,19930,19930,19930,19930,19930,19930,19977,...,19790,19790,19790,19790,19790,19977,19977,19977,19977,19977
num_seats,4,4,3,3,4,4,3,3,4,4,...,2,4,4,4,2,3,4,2,4,4


In [17]:
distinct_routes = (
    dictionary_df.loc[['origin_city', 'destination_city']]
    .T
    .drop_duplicates()
    .reset_index(drop=True)
)

distinct_routes

Unnamed: 0,origin_city,destination_city
0,Anchorage,Fairbanks
1,Anchorage,Juneau
2,Boston,Washington
3,Boston,New York
4,Denver,Las Vegas
...,...,...
57,San Francisco,San Diego
58,San Francisco,Seattle
59,San Francisco,Santa Ana
60,Salt Lake City,Atlanta


In [10]:
#making passenger df with passenger preferences
def generate_passengers(n_passengers, dictionary_df):

    passengers = []

    # extract valid airports from your simulated dictionary
    valid_destinations = dictionary_df.loc['destination_city'].tolist()

    # random grouping (families)
    # groups of size 1–4
    group_sizes = []
    remaining = n_passengers

    while remaining > 0:
        size = random.randint(1, min(4, remaining))
        group_sizes.append(size)
        remaining -= size

    group_id = 1
    passenger_id = 1

    for size in group_sizes:
        # pick shared destination and latest arrival for this group
        group_destination = random.choice(valid_destinations)
        group_latest_arrival = random.choice([h*100 + m for h in range(10,48) for m in [0,30]])

        for _ in range(size):

            passenger = {}
            passenger['passenger_id'] = passenger_id
            passenger['group_id'] = group_id

            # random origin/destination
            passenger['origin_airport'] = 'Los Angeles'
            passenger['destination_airport'] = group_destination
            passenger['latest_arrival'] = group_latest_arrival

            # airline preference (20% chance of having one)
            airline_pref = random.choice(dictionary_df.loc['airline'].tolist())
            passenger['airline_pref'] = airline_pref if random.random() < 0.2 else None

            passengers.append(passenger)
            passenger_id += 1

        group_id += 1

    return pd.DataFrame(passengers)

passengers_df = generate_passengers(10, dictionary_df)
print(passengers_df)

   passenger_id  group_id origin_airport destination_airport  latest_arrival  \
0             1         1    Los Angeles             Spokane            2030   
1             2         1    Los Angeles             Spokane            2030   
2             3         2    Los Angeles      Salt Lake City            1700   
3             4         2    Los Angeles      Salt Lake City            1700   
4             5         2    Los Angeles      Salt Lake City            1700   
5             6         3    Los Angeles           Las Vegas            3200   
6             7         4    Los Angeles           Las Vegas            3730   
7             8         4    Los Angeles           Las Vegas            3730   
8             9         4    Los Angeles           Las Vegas            3730   
9            10         4    Los Angeles           Las Vegas            3730   

   airline_pref  
0           NaN  
1           NaN  
2           NaN  
3       19393.0  
4           NaN  
5          

In [21]:
# #formulating optimization problem
# import cvxpy as cp

# passengers = passengers_df['passenger_id'].tolist()
# flights = dictionary_df.columns.tolist()

# num_passengers = len(passengers)
# num_flights = len(flights)

# # decision variable: x[p, f] = 1 if passenger p is assigned to flight f
# x = cp.Variable((num_passengers, num_flights), boolean=True)

# # Objective: use freq_delay and avg_delay from dictionary_df
# freq_delay = np.array([dictionary_df[f]['freq_delay'] for f in flights])
# avg_delay = np.array([dictionary_df[f]['avg_delay'] for f in flights])

# # Element-wise multiplication, broadcast over passengers
# objective = cp.Minimize(cp.sum(cp.multiply(x, freq_delay * avg_delay)))

# # Constraints list
# constraints = []



# # # Airline preference constraint
# for p_idx, p in enumerate(passengers):
#     airline_pref = passengers_df.loc[passengers_df['passenger_id']==p, 'airline_pref'].values[0]
#     if airline_pref is not None:
#         for f_idx, f in enumerate(flights):
#             if dictionary_df[f]['airline'] != airline_pref:
#                 constraints.append(x[p_idx, f_idx] == 0)

# # Ensure every passenger takes at least 1 flight
# for p_idx, p in enumerate(passengers):
#     constraints.append(cp.sum(x[p_idx, :]) >= 1)

# # #arrive by certain time constraint
# # for p_idx, p in enumerate(passengers):
# #     latest = passengers_df.loc[passengers_df['passenger_id']==p, 'latest_arrival'].values[0]
# #     for f_idx, f in enumerate(flights):
# #         dep = dictionary_df[f]['departure(HHMM)']
# #         dur = dictionary_df[f]['expected_duration']
# #         arrival_time = dep + int(dur)  # approximate HHMM
# #         if arrival_time > latest:
# #             constraints.append(x[p_idx, f_idx] == 0)

# # #seat availability constraint
# # for f_idx, f in enumerate(flights):
# #     seats = dictionary_df[f]['num_seats']
# #     constraints.append(cp.sum(x[:, f_idx]) <= seats)

# # # Group constraint (on same flight)
# # groups = passengers_df['group_id'].unique()
# # for g in groups:
# #     group_passengers = passengers_df.loc[passengers_df['group_id']==g].index.tolist()
# #     if len(group_passengers) > 1:
# #         # auxiliary variable for this group on each flight
# #         y = cp.Variable(num_flights, boolean=True)
# #         for f_idx in range(num_flights):
# #             # enforce x[group_member, f] == y[f]
# #             for p_idx in group_passengers:
# #                 constraints.append(x[p_idx, f_idx] == y[f_idx])
# #         # ensure the group is assigned to exactly one flight
# #         constraints.append(cp.sum(y) == 1)

# # # --- Multi-leg / connection buffer ---
# # # Precompute departure and arrival times in minutes
# # dep_times = np.array([dictionary_df[f]['departure(HHMM)'] for f in flights])
# # arr_times = dep_times + np.array([dictionary_df[f]['expected_duration'] for f in flights])

# # # enforce 1-hour buffer if passenger takes consecutive flights
# # for p_idx, p in enumerate(passengers):
# #     for f1 in range(num_flights):
# #         for f2 in range(num_flights):
# #             # f1 ends at arr_times[f1], f2 starts at dep_times[f2]
# #             # if x[p,f1] == 1 and x[p,f2] == 1, then f2 must start at least 60 mins after f1
# #             # encode as: arr_times[f1] + 60 <= dep_times[f2] OR x[p,f1] + x[p,f2] <=1
# #             if arr_times[f1] + 60 > dep_times[f2]:
# #                 constraints.append(x[p_idx,f1] + x[p_idx,f2] <= 1)

# # # --- Final flight reaches destination ---
# # for p_idx, p in enumerate(passengers):
# #     dest = passengers_df.loc[passengers_df['passenger_id']==p, 'destination_airport'].values[0]
# #     # sum of x[p,f] for flights ending at destination >= 1
# #     flight_matches = [f_idx for f_idx, f in enumerate(flights) if dictionary_df[f]['destination_airport']==dest]
# #     constraints.append(cp.sum(x[p_idx, flight_matches]) >= 1)

# # Solve MILP
# prob = cp.Problem(objective, constraints)
# prob.solve(solver=cp.GLPK_MI)
# print("Status:", prob.status)
# print('The objective value is {}.'.format(objective.value))
# # Extract assignments
# # assignment_matrix = x.value
# # assigned_pairs = []
# # for p_idx, p in enumerate(passengers):
# #     for f_idx, f in enumerate(flights):
# #         if assignment_matrix[p_idx, f_idx] > 0.5:
# #             assigned_pairs.append((p, f))

# # assignments_df = pd.DataFrame(assigned_pairs, columns=['passenger_id','flight'])
# # print(assignments_df)

Status: unbounded
The objective value is None.


In [16]:
import cvxpy as cp


# passengers_df: your passenger info
# dictionary_df: your flights dictionary (columns = flights)

passengers = passengers_df['passenger_id'].tolist()
flights = dictionary_df.columns.tolist()

num_passengers = len(passengers)
num_flights = len(flights)

# Decision variable: x[p, f] = 1 if passenger p assigned to flight f
x = cp.Variable((num_passengers, num_flights), boolean=True)

# Objective: minimize expected delay = freq_delay * avg_delay
freq_delay = np.array([dictionary_df[f]['freq_delay'] for f in flights])
avg_delay = np.array([dictionary_df[f]['avg_delay'] for f in flights])
objective = cp.Minimize(cp.sum(cp.multiply(x, freq_delay * avg_delay)))

constraints = []

# --- Airline preference constraint ---
# for p_idx, p in enumerate(passengers):
#     airline_pref = passengers_df.loc[passengers_df['passenger_id']==p, 'airline_pref'].values[0]
#     if airline_pref is not None:
#         allowed_flights = [f_idx for f_idx, f in enumerate(flights) if dictionary_df[f]['airline'] == airline_pref]
#         if len(allowed_flights) == 0:
#             print(f"Passenger {p} has airline_pref {airline_pref} but no flights match!")
#         else:
#             # block all flights not in allowed_flights
#             for f_idx in range(num_flights):
#                 if f_idx not in allowed_flights:
#                     constraints.append(x[p_idx, f_idx] == 0)

# --- Ensure every passenger has at least one flight ---
for p_idx in range(num_passengers):
    constraints.append(cp.sum(x[p_idx, :]) >= 1)

# --- Seat availability constraint ---
# for f_idx, f in enumerate(flights):
#     seats = dictionary_df[f]['num_seats']
#     constraints.append(cp.sum(x[:, f_idx]) <= seats)

# --- Group assignment constraint ---
groups = passengers_df['group_id'].unique()
for g in groups:
    group_passengers = passengers_df.loc[passengers_df['group_id']==g].index.tolist()
    if len(group_passengers) > 1:
        y = cp.Variable(num_flights, boolean=True)
        for f_idx in range(num_flights):
            for p_idx in group_passengers:
                constraints.append(x[p_idx, f_idx] == y[f_idx])
        constraints.append(cp.sum(y) == 1)

# --- Multi-leg / connection buffer ---
# dep_times = np.array([dictionary_df[f]['departure(HHMM)'] for f in flights])
# arr_times = dep_times + np.array([dictionary_df[f]['expected_duration'] for f in flights])
# for p_idx in range(num_passengers):
#     for f1 in range(num_flights):
#         for f2 in range(num_flights):
#             if arr_times[f1] + 60 > dep_times[f2]:
#                 constraints.append(x[p_idx, f1] + x[p_idx, f2] <= 1)

#--- Final flight reaches destination ---
# for p_idx, p in enumerate(passengers):
#     dest = passengers_df.loc[passengers_df['passenger_id']==p, 'destination_airport'].values[0]
#     flight_matches = [f_idx for f_idx, f in enumerate(flights) if dictionary_df[f]['destination_airport']==dest]
#     if len(flight_matches) == 0:
#         print(f"No flights available to destination {dest} for passenger {p}!")
#     else:
#         constraints.append(cp.sum(x[p_idx, flight_matches]) >= 1)

# --- Solve MILP ---
prob = cp.Problem(objective, constraints)
prob.solve(solver=cp.GLPK_MI)

print("Solver status:", prob.status)
if prob.status == 'optimal':
    print("Objective value:", prob.value)
    # Extract assignments
    assignment_matrix = x.value
    assigned_pairs = []
    for p_idx, p in enumerate(passengers):
        for f_idx, f in enumerate(flights):
            if assignment_matrix[p_idx, f_idx] > 0.5:
                assigned_pairs.append((p, f))
    assignments_df = pd.DataFrame(assigned_pairs, columns=['passenger_id', 'flight'])
    print(assignments_df)
else:
    print("Problem is infeasible or unbounded. Please check constraints.")


No flights available to destination Spokane for passenger 1!
No flights available to destination Spokane for passenger 2!
No flights available to destination Salt Lake City for passenger 3!
No flights available to destination Salt Lake City for passenger 4!
No flights available to destination Salt Lake City for passenger 5!
No flights available to destination Las Vegas for passenger 6!
No flights available to destination Las Vegas for passenger 7!
No flights available to destination Las Vegas for passenger 8!
No flights available to destination Las Vegas for passenger 9!
No flights available to destination Las Vegas for passenger 10!




Solver status: optimal
Objective value: 75.78817733990147
   passenger_id              flight
0             1  Anchorage-Juneau-2
1             2  Anchorage-Juneau-2
2             3  Anchorage-Juneau-1
3             4  Anchorage-Juneau-1
4             5  Anchorage-Juneau-1
5             6  Anchorage-Juneau-2
6             7  Anchorage-Juneau-1
7             8  Anchorage-Juneau-1
8             9  Anchorage-Juneau-1
9            10  Anchorage-Juneau-1
