In [None]:
# Required for importing modules from parent directory
import os
import sys

# Notebooks do not define __file__, so the kernel's current working directory
# stands in for "the directory of this file".
current_dir = os.path.abspath(os.curdir)
parent_dir = os.path.dirname(current_dir)
print(parent_dir)
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
import pandas as pd

from src.loader import TripLoader
from src.utils import *

# Lift pandas' display limits (None = unlimited) for rows, columns and cell width.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [None]:
import re


def extract_UpdateFlightAction(entry_string: str, header_category: str):
    """Parse the text of an ``UpdateFlightAction`` entry into a dictionary.

    The first line of the entry is scanned for ``key: value`` pairs, one per
    name in ``COLUMNS_UpdateFlightAction_METADATA``. The rows that follow the
    ``Legs:`` marker are split into tokens and paired with the leg column
    names that belong to ``header_category``.

    Args:
        entry_string: Raw entry text. Carriage returns are dropped before the
            text is split into lines.
        header_category: ``"received"`` or ``"saved"``; selects which list of
            leg column names is used.

    Returns:
        A dict holding every metadata key found on the first line, plus a
        ``"legs"`` key with one dict per non-blank leg row.

    Raises:
        ValueError: If ``header_category`` is not a known category, or if the
            entry contains no ``Legs:`` line.
    """
    if header_category == "received":
        leg_keys = COLUMNS_UpdateFlightAction_RECEIVED
    elif header_category == "saved":
        leg_keys = COLUMNS_UpdateFlightAction_SAVED
    else:
        # Fail up front instead of hitting an unbound `leg_keys` further down.
        raise ValueError(
            f"Unknown header_category {header_category!r}; expected 'received' or 'saved'"
        )

    lines = entry_string.replace("\r", "").split("\n")

    # Metadata lives on the first line only. A value runs up to the next
    # "<word>:" token or the end of the line.
    header_line = lines[0]
    extracted_dict = {}
    for key in COLUMNS_UpdateFlightAction_METADATA:
        # Raw f-string keeps \s and \w as regex escapes; re.escape makes the
        # key match literally even if it contains regex metacharacters.
        pattern = rf"{re.escape(key)}: (.*?)(?=\s+\w+:|$)"
        match = re.search(pattern, header_line)
        if match:
            extracted_dict[key] = match.group(1).strip()

    # Locate the marker while tolerating surrounding whitespace on that line.
    legs_marker_index = next(
        (index for index, line in enumerate(lines) if line.strip() == "Legs:"),
        None,
    )
    if legs_marker_index is None:
        raise ValueError("entry_string contains no 'Legs:' line")

    legs = []
    # +2 skips the marker and the line right after it
    # (presumably the column header row - TODO confirm against real entries).
    for line in lines[legs_marker_index + 2:]:
        if not line.strip():
            continue

        # A bare ISO-like timestamp is taken as its own token; everything else
        # is split on whitespace.
        values = re.findall(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}|\S+", line)
        # zip() stops at the shorter sequence, so surplus tokens or surplus
        # column names are silently dropped.
        legs.append(dict(zip(leg_keys, values)))

    extracted_dict["legs"] = legs
    return extracted_dict

In [None]:
# Instantiate the project loader and take its `trips_ABCD` attribute as the
# first trips dataset (presumably a pandas DataFrame — TODO confirm in src.loader).
trip_loader = TripLoader()
trips_data = trip_loader.trips_ABCD

In [None]:
# Smoke-test the parser on the first UpdateFlightAction row only: show the raw
# entry text, then the dictionary extracted from it.
is_update_flight_action = trips_data["action_name"] == "UpdateFlightAction"
filtered = trips_data[is_update_flight_action]
for idx, row in filtered.head(1).iterrows():
    entry_string = row["entry_details"]
    print(entry_string)
    extracted_dict = extract_UpdateFlightAction(entry_string, "received")
    print(extracted_dict)

In [None]:
# Second trips dataset, read from the loader's `trips_MNOP` attribute.
trips_data2 = trip_loader.trips_MNOP

In [None]:
# Third trips dataset, read from the loader's `trips_ZYXW` attribute.
trips_data3 = trip_loader.trips_ZYXW

In [None]:
# Stack the three trips datasets row-wise into a single frame.
# NOTE(review): ignore_index is not passed, so each source keeps its own index
# labels and they may repeat in the result — confirm this is intended.
trips_data_all = pd.concat([trips_data, trips_data2, trips_data3])

In [None]:
# Preview the first rows of the combined frame.
trips_data_all.head()

In [None]:
# Summary of the combined frame: columns, dtypes and non-null counts.
trips_data_all.info()

In [None]:
# Number of distinct departure airports in the combined data
# (nunique() leaves missing values out of the count by default).
trips_data_all["departure_airport"].nunique()

In [None]:
# Number of distinct flight ids in the combined data; the trailing figure is
# the author's recorded result from an earlier run.
trips_data_all["flight_id"].nunique()  # 18415 unique flight ids

In [None]:
# Rows of the second dataset whose `departureAirport` column equals "ORY".
# NOTE(review): with display.max_rows unset this renders every matching row.
trips_data2[trips_data2["departureAirport"] == "ORY"]  # results are true

In [None]:
# df1 = trips_data[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id", "scheduleState", "departureAirport", "departureTime", "arrivalAirport", "arrivalTime"]]
# df2 = trips_data2[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id", "scheduleState", "departureAirport", "departureTime", "arrivalAirport", "arrivalTime"]]
# df3 = trips_data3[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id", "scheduleState", "departureAirport", "departureTime", "arrivalAirport", "arrivalTime"]]

In [None]:
# check if there are flights from the same airline with the same airline code in all data

In [None]:
# Collapse the combined data to one row per flight_id: the flight's first
# departure airport and airline code, plus a count of its `id` values.
grouped = (
    trips_data_all.groupby("flight_id")
    .agg(
        departure_airport=("departure_airport", "first"),
        id=("id", "count"),
        airline_code=("airline_code", "first"),
    )
    .reset_index()
)
grouped.info()

In [None]:
# Preview the per-flight aggregation.
grouped.head()

In [None]:
# grouped2 = trips_data2.groupby("flight_id").agg({"departure_airport": "first", "id": "count"}).reset_index()
# grouped3 = trips_data3.groupby("flight_id").agg({"departure_airport": "first", "id": "count"}).reset_index()

In [None]:
# df1 = grouped.groupby("departure_airport").size().reset_index(name="count").sort_values("count", ascending=False)
# df2 = grouped2.groupby("departure_airport").size().reset_index(name="count").sort_values("count", ascending=False)
# df3 = grouped3.groupby("departure_airport").size().reset_index(name="count").sort_values("count", ascending=False)

# df2_1 = df2.iloc[:800000]
# df2_2 = df2.iloc[800000:]

In [None]:
# One row per departure airport: the first airline code seen for it and the
# number of flights (rows of `grouped`) departing from it, stored in `id`.
flights_by_airport = grouped.groupby("departure_airport")
df1 = flights_by_airport.agg(
    airline_code=("airline_code", "first"),
    id=("id", "count"),
).reset_index()

# Disabled alternative that keeps only the counts, sorted in descending order:
# df1 = grouped.groupby("departure_airport").size().reset_index(name="count").sort_values("count", ascending=False)

In [None]:
# Summary of the per-airport table: columns, dtypes and non-null counts.
df1.info()

In [None]:
# Preview the per-airport table.
df1.head()

In [None]:
# Sanity check: total of the per-airport flight counts held in `id`.
df1["id"].sum()  # sum is same as the number of unique flight ids, no loss of flights

In [None]:
# grouped.groupby("departure_airport").size()

In [None]:
# # new df with only groupby columns
# trips_data.groupby("departure_airport").size().reset_index()

In [None]:
# # sum of column 0
# trips_data.groupby("departure_airport").size().reset_index().sum()

In [None]:
# df1 = trips_data[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id"]]
# df2 = trips_data2[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id"]]
# df3 = trips_data3[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_line", "entry_details", "header_category", "header_id"]]


# df1 = trips_data[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_category", "header_id"]]
# df2 = trips_data2[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_category", "header_id"]]
# df3 = trips_data3[["flight_id", "id", "creation_time", "airline_code", "flight_number", "flight_date", "departure_airport", "user_name", "action_name", "header_category", "header_id"]]

# # split df2 in 2 parts
# df2_1 = df2.iloc[:800000]
# df2_2 = df2.iloc[800000:]

In [None]:
# Load the airport reference table; the path is relative, so the file is
# looked up in the kernel's current working directory.
airports = pd.read_csv("airports.csv")

In [None]:
# Preview the airport reference table.
airports.head()

In [None]:
# Left-join the airport reference data onto the per-airport counts, matching
# `departure_airport` against the reference table's "Airport Code" column.
# Airports without a match keep their row, with the reference columns empty.
df1_airports = df1.merge(
    airports,
    how="left",
    left_on="departure_airport",
    right_on="Airport Code",
)

In [None]:
# Preview the merged per-airport table.
df1_airports.head()

In [None]:
# df2_1_airports = pd.merge(df2_1, airports, left_on="departure_airport", right_on="Airport Code", how="left")
# df2_2_airports = pd.merge(df2_2, airports, left_on="departure_airport", right_on="Airport Code", how="left")
# df3_airports = pd.merge(df3, airports, left_on="departure_airport", right_on="Airport Code", how="left")

In [None]:
# df3_airports.head()

In [None]:
# df1airports["count"].sum()

In [None]:
# df2["count"].sum()

In [None]:
# df3["count"].sum()

In [None]:
# Write the merged per-airport table to an Excel file (relative path, so it
# lands in the kernel's working directory) without the index column.
df1_airports.to_excel("df_all_airports.xlsx", index=False)
# Disabled exports of the per-dataset frames:
# df2_1_airports.to_excel("df2_1_airports.xlsx", index=False)
# df2_2_airports.to_excel("df2_2_airports.xlsx", index=False)
# df3_airports.to_excel("df3_airports.xlsx", index=False)

In [None]:
# df1.to_csv('result1.csv', index=False)

# df2_1.to_excel('result21.xlsx', index=False)
# df2_2.to_excel('result22.xlsx', index=False)

In [None]:
# export to excel with pandas
# df1.to_excel('result1.xlsx', index=False)

In [None]:
# df = pd.concat([df1, df2, df3])

In [None]:
# `df` is only assigned by the concat that is commented out in the cell above,
# so `df.info()` raised NameError on a fresh kernel. Inspect the merged
# per-airport frame that the notebook actually builds and exports instead.
# NOTE(review): confirm `df1_airports` is the frame meant to be inspected here.
df1_airports.info()

In [None]:
# `df` is never assigned by a live cell (its concat is commented out), so
# `df.head()` raised NameError on a fresh kernel. Preview the merged
# per-airport frame that the notebook actually builds and exports instead.
# NOTE(review): confirm `df1_airports` is the frame meant to be previewed here.
df1_airports.head()

In [None]:
# df.to_csv('result.csv', index=False)
# df3.to_excel('result2.xlsx', index=False)

In [None]:
# `df2` is never assigned by a live cell (every assignment is commented out),
# so `df2.head()` raised NameError on Restart & Run All.
# NOTE(review): previewing `trips_data2`, the live frame the disabled `df2`
# was sliced from — confirm this is the intended replacement.
trips_data2.head()