In [None]:
# Required for importing modules from parent directory
import os
import sys

# A notebook kernel defines no __file__, so the kernel's working directory
# stands in for "the directory of this notebook". This yields the same path as
# os.path.dirname(os.path.abspath("__file__")) applied to the string literal.
current_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(current_dir)
print(parent_dir)
# Register the parent directory only once, so re-running this cell does not
# pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
from pathlib import Path

import pandas as pd

from src.loader import TripLoader

# Lift the row, column and column-width display limits so frames render in full.
for display_option in ("max_rows", "max_columns", "max_colwidth"):
    pd.set_option(f"display.{display_option}", None)

In [None]:
# Build the loader with fix_csv_errors=False and collect its three trip frames.
trip_loader = TripLoader(fix_csv_errors=False)
trips_data = [
    getattr(trip_loader, frame_name)
    for frame_name in ("trips_ABCD", "trips_MNOP", "trips_ZYXW")
]

# Stack the frames under a fresh 0..n-1 index, discard every row that has a
# missing value in any column, and only then remove the two columns that are
# not needed downstream. Because dropna() runs first, a missing value in
# "header_line" or "entry_details" also removes the row.
combined_trips = (
    pd.concat(trips_data, ignore_index=True)
    .dropna(axis="index")
    .drop(["header_line", "entry_details"], axis="columns")
)

In [None]:
# Keep one row per header_id: the first row encountered in combined_trips.
unique_df = combined_trips.drop_duplicates(subset=["header_id"], keep="first")

print("=========== Total Flights ===========")
print("Total flights: ", unique_df.shape[0])

# Preview the full (non-deduplicated) event table.
combined_trips.head()

In [None]:
# Show up to 100 rows recorded under one specific header_id.
combined_trips.loc[combined_trips["header_id"].eq("df3a1cc7d5741610")].head(100)

In [None]:
unique_user_names = unique_df["user_name"].unique()
print("=========== Service Accounts Share ===========")
print("Unique User Names: ", unique_user_names)

# Total number of events.
# NOTE(review): unique_df holds one row per header_id, so these "events" are
# counted per header_id rather than per raw log row — confirm this is intended.
total_events = len(unique_df)

# Number of events posted by service accounts: user names containing the
# literal text "service". regex=False keeps the match literal, and na=False
# counts a missing user name as "not a service account" instead of putting NaN
# into the boolean mask.
is_service_account = unique_df["user_name"].str.contains(
    "service", regex=False, na=False
)
service_account_events = int(is_service_account.sum())

# Share of events posted by service accounts, in percent. An empty dataset
# reports 0% rather than raising ZeroDivisionError.
service_account_share = (
    service_account_events / total_events * 100 if total_events else 0.0
)

print("Total Events: ", total_events)
print("Service Account Events: ", service_account_events)
print("Service Account Share: {:.2f}%".format(service_account_share))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Parse the timestamps into a new frame with assign() instead of writing a
# column into the drop_duplicates() result. The in-place write can trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
unique_df = unique_df.assign(
    creation_time=pd.to_datetime(unique_df["creation_time"])
)
print("=========== Unique Flights Over Time ===========")
print("unique days: ", unique_df["creation_time"].dt.date.nunique())

# Sort by 'creation_time' and remove duplicates based on 'header_id'.
# NOTE(review): in this notebook unique_df already holds one row per header_id
# (the first row in combined_trips order), so this de-duplication is only a
# safeguard. It cannot select the chronologically earliest event of a flight.
# If that is the intent, start from combined_trips instead — confirm.
df_sorted = unique_df.sort_values(by="creation_time").drop_duplicates(
    subset="header_id", keep="first"
)

# Aggregate data to count unique flights per calendar day.
unique_flights_per_day = (
    df_sorted.groupby(df_sorted["creation_time"].dt.date)["header_id"]
    .nunique()
    .reset_index(name="unique_flight_count")
)

# Set the plot style.
sns.set_theme(style="whitegrid")

# Draw on an explicit Axes so labels and title are bound to this figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=unique_flights_per_day,
    x="creation_time",
    y="unique_flight_count",
    marker="o",
    ax=ax,
)

# Set plot labels and title.
ax.set_xlabel("Date")
ax.set_ylabel("Unique Flight Count")
ax.set_title("Unique Flight Count Over Time")

# Show the plot.
plt.show()

In [None]:
# Count the distinct flight numbers seen per departure airport and return the
# result as a two-column frame: departure_airport, unique_flight_numbers.
airport_flight_counts = (
    combined_trips.groupby("departure_airport")["flight_number"]
    .nunique()
    .rename("unique_flight_numbers")
    .reset_index()
)

In [None]:
import folium

from src.utils import airport_coords

# Start from a map centred on Brazil.
map_center = [-14.2350, -51.9253]  # Center of Brazil
m = folium.Map(location=map_center, zoom_start=4)

# Place one marker per airport that has an entry in airport_coords; airports
# without (truthy) coordinates are skipped.
for airport, count in zip(
    airport_flight_counts["departure_airport"],
    airport_flight_counts["unique_flight_numbers"],
):
    coords = airport_coords.get(airport)
    if not coords:
        continue
    # The same caption is used for the click popup and the hover tooltip.
    folium.Marker(
        location=coords,
        popup=f"{airport}: {count} unique flight numbers",
        tooltip=f"{airport}: {count} unique flight numbers",
    ).add_to(m)

# Write the finished map to an HTML file in the working directory.
m.save("airport_flight_map.html")

In [None]:
# Collect each flight's action names in a single pass over combined_trips,
# instead of re-filtering the whole frame once per flight. sort=False keeps
# groups in order of first appearance, and rows inside a group keep their
# original order.
events_by_flight = (
    combined_trips.groupby("header_id", sort=False)["action_name"]
    .agg(list)
    .to_dict()
)

# One sequence per row of unique_df, in unique_df order. A header_id with no
# matching rows yields an empty sequence.
sequences = [
    events_by_flight.get(header_id, []) for header_id in unique_df["header_id"]
]

# Inspect the sequences
for seq in sequences[:5]:
    print(seq)

In [None]:
from collections import Counter, defaultdict

# transitions[a][b] counts how often event b directly follows event a.
transitions = defaultdict(Counter)

# Walk every pair of consecutive events; sequences with fewer than two events
# contribute nothing.
for seq in sequences:
    for current_event, next_event in zip(seq, seq[1:]):
        transitions[current_event][next_event] += 1

# Normalise each event's outgoing counts so they sum to 1. The row total is
# computed once per source event rather than once per outgoing edge.
transition_probabilities = {}
for event, next_counts in transitions.items():
    outgoing_total = sum(next_counts.values())
    transition_probabilities[event] = {
        following: occurrences / outgoing_total
        for following, occurrences in next_counts.items()
    }

# Inspect the transition probabilities
for event, probs in transition_probabilities.items():
    print(f"{event}: {probs}")

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Create a directed graph with one edge per observed transition, weighted by
# its probability.
G = nx.DiGraph()
for event, probs in transition_probabilities.items():
    for next_event, prob in probs.items():
        G.add_edge(event, next_event, weight=prob)

# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(G, seed=42)
edges = G.edges(data=True)
# Edge weights in G.edges order, used to colour the edges.
weights = [d["weight"] for (u, v, d) in edges]

plt.figure(figsize=(12, 8))
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=3000,
    node_color="lightblue",
    font_size=10,
    font_weight="bold",
    edge_color=weights,
    edge_cmap=plt.cm.Blues,
)

# Label every edge with its probability rounded to two decimals.
edge_labels = {
    edge: f"{weight:.2f}"
    for edge, weight in nx.get_edge_attributes(G, "weight").items()
}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.title("Process Overview and Transition Probabilities")
plt.show()