In [1]:
# Some Findings
# > Sorting after creation_time and id results in the same order
# > The additional default cols in the main data are from ASMMsgProcessor
# > There is no need to find out which header category is relevant for which action (Eric has done that in preprocessing)
# > Benedikts most common process is similar but not identical to the one found in this notebook

In [2]:
# working dir
import os
import sys

cwd = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(cwd))
sys.path.append(root_dir)

print(root_dir)

c:\Users\david\Desktop\Main\03_Uni\WWI21DSA\02_Vorlesungen\06_Projektrealisierung\Projektrealisierung


In [3]:
# Imports and settings
import pandas as pd
import random

random.seed(42)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [4]:
import pandas as pd
from IPython.display import display, HTML

# Display Dataframe (with scrollbars)
def ddf(df, max_height=500, max_width=1500):
    """
    Display a pandas DataFrame with horizontal and vertical scrollbars in a Jupyter notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    max_width (int): The maximum width of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        max-width: {max_width}px;
        overflow: auto;
        display: inline-block;
        position: relative;
    }}
    .scrollable-dataframe thead th {{
        position: sticky;
        top: 0;
        background-color: white;
        z-index: 1;
    }}
    </style>
    """
    html = style + df.to_html(classes='scrollable-dataframe')
    display(HTML(html))


In [5]:
# Load data
abcd = pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv")
mnop = pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv")
zyxw = pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv")

  abcd = pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv")
  mnop = pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv")
  zyxw = pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv")


In [6]:
abcd.columns

Index(['flight_id', 'id', 'creation_time', 'airline_code', 'flight_number',
       'flight_date', 'departure_airport', 'user_name', 'action_name',
       'header_line', 'entry_details', 'extracted_data_path',
       'header_category', 'header_id', 'scheduleState', 'departureAirport',
       'departureTime', 'arrivalAirport', 'arrivalTime',
       'aircraftRegistration', 'aircraftSubtype', 'aircraftVersion',
       'serviceType', 'nonOperational', 'flifoCancel', 'sequenceNumber',
       'bestTimes', 'mvtActual', 'extracted_data'],
      dtype='object')

In [7]:
print(
    f"unique flights: {abcd['flight_id'].nunique()}",
    f"unique actions: {abcd['action_name'].nunique()}",
    sep="\n",
)

unique flights: 2751
unique actions: 49


In [8]:
most_common_benedikt = [
    "ClearFlightsAction",
    "ASMMsgProcessor",
    "UpdateFlightAction",
    "AssignLCCAction",
    "UpdateFlightAction",
    "ASMMsgProcessor",
    "UpdateFlightAction",
    "AssignLCCAction",
    "StoreRegistrationAndConfigurationAc",
    "UpdateCrewDataAction",
    "CalculateWeightAndTrimAction",
    "TransferCargoAction",
    "AssignLCCAction",
    
    "TransferCargoAction",
    "CalculateWeightAndTrimAction",
    "StoreRegistrationAndConfigurationAc",
    "AssignLCCAction",
    "UpdateFlightAction",
    "ASMMsgProcessor"
]

most_common_david = [  # 358 with this process
    'ClearFlightsAction',  # No - unnötig
    'ASMMsgProcessor',  # Yes - per default in main df enthalten
    'UpdateFlightAction',  # Yes
    'AssignLCCAction',   # Yes
    'ASMMsgProcessor', 
    'UpdateFlightAction',
    'AssignLCCAction',
    'StoreRegistrationAndConfigurationAc',  # Yes
    'UpdateCrewDataAction',  # Yes
    'CalculateWeightAndTrimAction',  # Yes
    'TransferCargoAction', # No - sieht eigentlich intressant aus (soll wohl updates von bagagge, mail, total cargo enthalten aber ist immer alles 0kg [zumindest für abcd])
    'AssignLCCAction'
]



In [44]:
from collections import Counter

process_counter = []
for dataset in [abcd, mnop, zyxw]:
    action_sequences = []
    df_prepared = dataset[dataset["header_category"] == "received"].sort_values(["creation_time", "id"]).groupby("flight_id")

    for id, df in df_prepared:
        action_sequence = df["action_name"].to_list()
        action_sequences.append(action_sequence)

    # Now count how often each process occurs
    tupled_lists = [tuple(lst) for lst in action_sequences]
    counter = Counter(tupled_lists)
    
    sorted_counter = sorted(counter, key=lambda x: counter[x], reverse=True)
    process_count = [(counter[x], x) for x in sorted_counter]

    process_counter.append(process_count)

# Use result to get the top 3 most common processes for each dataset
for i, process_count in enumerate(process_counter):
    dataset = ["abcd", "mnop", "zyxw"][i]
    print(dataset)

    for j in range(1, 4):
        print(f"Top {j}, count: {process_count[j-1][0]} process: {process_count[j-1][1]}")

    print("")

abcd
Top 1, count: 358 process: ('ClearFlightsAction', 'ASMMsgProcessor', 'UpdateFlightAction', 'AssignLCCAction', 'ASMMsgProcessor', 'UpdateFlightAction', 'AssignLCCAction', 'StoreRegistrationAndConfigurationAc', 'UpdateCrewDataAction', 'CalculateWeightAndTrimAction', 'TransferCargoAction', 'AssignLCCAction')
Top 2, count: 186 process: ('ClearFlightsAction',)
Top 3, count: 123 process: ('ASMMsgProcessor', 'UpdateFlightAction', 'AssignLCCAction', 'StoreRegistrationAndConfigurationAc', 'ResetLoadingListRecordsAction', 'UpdateLoadTableAction', 'CalculateWeightAndTrimAction', 'UpdateCrewDataAction', 'CalculateWeightAndTrimAction', 'AssignLCCAction')

mnop
Top 1, count: 900 process: ('ClearFlightsAction',)
Top 2, count: 36 process: ('ASMMsgProcessor', 'UpdateFlightAction', 'AssignLCCAction', 'StoreRegistrationAndConfigurationAc', 'UpdateCrewDataAction', 'CalculateWeightAndTrimAction', 'TransferCargoAction', 'AssignLCCAction')
Top 3, count: 13 process: ('ASMMsgProcessor', 'UpdateFlightActio

In [34]:
relevant_flight_ids = []
for i, dataset in enumerate([abcd, mnop, zyxw]):
    df_prepared = dataset[dataset["header_category"] == "received"].sort_values(["creation_time", "id"]).groupby("flight_id")
    relevant_flight_ids.append([])

    for id, df in df_prepared:
        action_sequence = df["action_name"].to_list()
        most_common_process = list(process_counter[i][0 if i != 1 else 2][1])  # Use the most common process for abcd and zyxw, the second most common for mnop
        if action_sequence == most_common_process:
            relevant_flight_ids[i].append(id)

In [35]:
relevant_flight_ids[0]  # abcd

['AB_1070_18_BOM',
 'AB_1071_18_DOH',
 'AB_2102_16_AMD',
 'AB_2102_21_AMD',
 'AB_2104_16_BLR',
 'AB_2104_17_BLR',
 'AB_2104_19_BLR',
 'AB_2104_20_BLR',
 'AB_2104_21_BLR',
 'AB_2107_19_BOM',
 'AB_2107_20_BOM',
 'AB_2107_21_BOM',
 'AB_2108_16_AMD',
 'AB_2108_17_AMD',
 'AB_2108_18_AMD',
 'AB_2108_19_AMD',
 'AB_2108_21_AMD',
 'AB_2109_16_BLR',
 'AB_2109_17_BLR',
 'AB_2109_18_BLR',
 'AB_2109_19_BLR',
 'AB_2109_20_BLR',
 'AB_2109_21_BLR',
 'AB_2110_16_BOM',
 'AB_2110_17_BOM',
 'AB_2110_19_BOM',
 'AB_2110_20_BOM',
 'AB_2110_21_BOM',
 'AB_2120_17_BOM',
 'AB_2120_18_BOM',
 'AB_2120_21_BOM',
 'AB_2124_19_BOM',
 'AB_2124_20_BOM',
 'AB_2124_21_BOM',
 'AB_2125_19_CCU',
 'AB_2125_20_CCU',
 'AB_2125_21_CCU',
 'AB_2127_17_BOM',
 'AB_2127_18_BOM',
 'AB_2127_21_BOM',
 'AB_2128_17_DEL',
 'AB_2128_18_DEL',
 'AB_2128_21_DEL',
 'AB_2134_16_BLR',
 'AB_2134_17_BLR',
 'AB_2134_18_BLR',
 'AB_2134_19_BLR',
 'AB_2134_20_BLR',
 'AB_2134_21_BLR',
 'AB_2135_16_GAU',
 'AB_2135_17_GAU',
 'AB_2135_18_GAU',
 'AB_2135_19

In [None]:
relevant_flight_ids[1] # mnop

In [None]:
relevant_flight_ids[2] # zyxw