In [1]:
import pandas as pd
from pathlib import Path
from itertools import combinations
from tqdm import tqdm

In [2]:
airport_data_path = "/Users/danhleth/Projects/AIATFM/deepflightplan/datasets/airports/iata-icao.csv"

In [3]:
schedule_airport_df = pd.read_csv("/Users/danhleth/Projects/oag/Sch_AS_Jan2024_JobId3311468.csv")


schedule_airport_df = schedule_airport_df[['Carrier Code', "Flight No", "Dep Airport Code", \
                                   "Arr Airport Code", "Local Dep Time", "Local Arr Time", \
                                    "Elapsed Time", "GCD (km)", "Time series" ]]

name_mapping = {"Carrier Code": "Published Carrier",
                 "Flight No": "Flight Number",
                 "Dep Airport Code": "Origin",
                 "Arr Airport Code": "Destination",
                 "Local Dep Time": "Departure Time",
                 "Local Arr Time": "Arrival Time",
                 "Elapsed Time": "Elapsed Time",
                 "GCD (km)": "Distance (KM)",
                 "Flying Time": "Flying Time",
                 "Time series": "Time series"}
schedule_airport_df.rename(columns=name_mapping, inplace=True)
schedule_airport_df.columns

Index(['Published Carrier', 'Flight Number', 'Origin', 'Destination',
       'Departure Time', 'Arrival Time', 'Elapsed Time', 'Distance (KM)',
       'Time series'],
      dtype='object')

In [4]:
capacity_airport_df = pd.read_csv("/Users/danhleth/Projects/oag/CapacityReport_AS_Jan2024.csv")
capacity_airport_list = list(set(capacity_airport_df['Origin'].unique()) & set(capacity_airport_df['Destination'].unique()))

In [5]:
def filter_df_by_time_range(df, start_time="2024-01-01", end_time="2024-01-01"):
    df['Time series'] = pd.to_datetime(df['Time series'], format="%Y-%m-%d")
    mask = (df['Time series'] >= start_time) & (df['Time series'] <= end_time)
    return df.loc[mask]

# Union of two dataframes
# Convert dataframes to sets of tuples

def union_rows(df1,df2):
    rows_df1 = set(df1.itertuples(index=False, name=None))
    rows_df2 = set(df2.itertuples(index=False, name=None))

    # Perform union
    union_rows = rows_df1 | rows_df2
    union_df = pd.DataFrame(list(union_rows), columns=df1.columns)
    return union_df


def intersection_rows(df1,df2):
    rows_df1 = set(df1.itertuples(index=False, name=None))
    rows_df2 = set(df2.itertuples(index=False, name=None))

    # Perform intersection
    intersection_rows = rows_df1 & rows_df2
    intersection_df = pd.DataFrame(list(intersection_rows), columns=df1.columns)
    return intersection_df

def subtract_rows(df1,df2):
    # It identifies the rows in df1 that are not present in df2 (based on the values in all columns).
    # It returns a new DataFrame containing only the rows unique to df1.
    rows_df1 = set(df1.itertuples(index=False, name=None))
    rows_df2 = set(df2.itertuples(index=False, name=None))

    # Perform subtraction
    subtract_rows = rows_df1 - rows_df2
    subtract_df = pd.DataFrame(list(subtract_rows), columns=df1.columns)
    return subtract_df

In [6]:
intersection_columns = list(set(schedule_airport_df.columns) & set(capacity_airport_df.columns))
order_columns = ['Published Carrier', 'Flight Number', 'Origin', 'Destination',
       'Departure Time', 'Arrival Time', 'Elapsed Time', 'Distance (KM)',
       'Time series']
schedule_airport_df = schedule_airport_df[intersection_columns]
capacity_airport_df = capacity_airport_df[intersection_columns]

In [7]:
intersected_df = intersection_rows(schedule_airport_df, capacity_airport_df)
intersected_df[(intersected_df["Published Carrier"] == "TR") & (intersected_df["Flight Number"] == 304)]

Unnamed: 0,Elapsed Time,Flight Number,Arrival Time,Destination,Origin,Time series,Departure Time,Distance (KM),Published Carrier
85554,02:10,304,830,SGN,SIN,2024-01-07,720,1091,TR
87541,02:10,304,830,SGN,SIN,2024-01-15,720,1091,TR
100965,02:10,304,830,SGN,SIN,2024-01-05,720,1091,TR
108883,02:10,304,830,SGN,SIN,2024-01-13,720,1091,TR
114529,02:10,304,830,SGN,SIN,2024-01-04,720,1091,TR
123879,02:10,304,830,SGN,SIN,2024-01-24,720,1091,TR
307088,02:10,304,830,SGN,SIN,2024-01-08,720,1091,TR
312939,02:10,304,830,SGN,SIN,2024-01-21,720,1091,TR
386215,02:10,304,830,SGN,SIN,2024-01-20,720,1091,TR
421388,02:10,304,830,SGN,SIN,2024-01-23,720,1091,TR


In [8]:
airport_df_subtract_capacity_airport_df = subtract_rows(schedule_airport_df, capacity_airport_df)
capacity_airport_df_subtract_airport_df = subtract_rows(capacity_airport_df, schedule_airport_df)


In [9]:
df_results = pd.DataFrame()
airports = []
gt_entries = []
scheduled_entries = []
intersected_entries = []
airport_subtract_capacity_airport_entries = []
capacity_airport_subtract_airport_entries = []

for airport in tqdm(capacity_airport_list):
    unique_tmp_range = schedule_airport_df
    airports.append(airport)
    tmp_capacity_airport_df = capacity_airport_df[(capacity_airport_df["Origin"] == airport) | (capacity_airport_df["Destination"] == airport)]
    gt_entries.append(len(tmp_capacity_airport_df))
    tmp_schedule_airport_df = schedule_airport_df[(schedule_airport_df["Origin"] == airport) | (schedule_airport_df["Destination"] == airport)]
    scheduled_entries.append(len(tmp_schedule_airport_df))
    intersected_entries.append(len(intersection_rows(tmp_schedule_airport_df, capacity_airport_df)))
    airport_subtract_capacity_airport_entries.append(len(subtract_rows(tmp_schedule_airport_df, tmp_capacity_airport_df)))
    capacity_airport_subtract_airport_entries.append(len(subtract_rows(tmp_capacity_airport_df, tmp_schedule_airport_df)))


df = pd.DataFrame({ 
    "Airport": airports,
    "GT Entries": gt_entries,
    "Scheduled Entries": scheduled_entries,
    "Intersected Entries": intersected_entries,
    "Airport Subtract Capacity Airport Entries": airport_subtract_capacity_airport_entries,
    "Capacity Airport Subtract Airport Entries": capacity_airport_subtract_airport_entries
})
df.to_csv("analyze_method_do_download_oag.csv", index=False)

  0%|          | 0/1153 [00:00<?, ?it/s]

100%|██████████| 1153/1153 [15:30<00:00,  1.24it/s]
