In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px

# Use seaborn's "darkgrid" theme for the charts drawn in this notebook.
sns.set_theme(style="darkgrid")

# Keep the active colour palette so individual charts can pick colours by index.
current_palette = sns.color_palette()

In [None]:
# Display settings: rows shown in a truncated repr, and the column limit
# before pandas starts eliding columns.
pd.set_option("display.min_rows", 20)
pd.set_option("display.max_columns", 50)

### Load in the data

In [None]:
# Read the 2022 flight data (zipped CSV) from the filesystem.
# low_memory=False lets pandas infer each column's dtype from the whole file.
FLIGHTS_2022_CSV = "../../data/eda/flights/flight_data_Y2022/flight_data_2022.csv.zip"
df_flights = pd.read_csv(FLIGHTS_2022_CSV, low_memory=False)

In [None]:
# Airline lookup table with (at least) "Code" and "Description" columns.
AIRLINES_CSV = "../../data/databases/flight_attributes/airlines.csv.zip"
airlines = pd.read_csv(AIRLINES_CSV, low_memory=False)

# Series indexed by airline code whose values are the airline descriptions;
# used with Series.map / Index.map to translate codes into names.
airline_mapper = airlines.set_index("Code").loc[:, "Description"]

### Clean the data

In [None]:
# Columns of the raw flights file that are kept for the analysis, in output order.
SELECTED_COLUMNS = [
    # Flight identity
    "FlightDate",
    "Operating_Airline ",  # trailing space is part of the raw header; renamed in tweak_df_flights
    # Origin
    "OriginAirportID",
    "Origin",
    "OriginCityName",
    "OriginState",
    # Destination
    "DestAirportID",
    "Dest",
    "DestCityName",
    "DestState",
    "DestStateName",
    # Departure
    "CRSDepTime",
    "DepTime",
    "DepDelay",
    "DepDelayMinutes",
    "DepDel15",
    "DepartureDelayGroups",
    "DepTimeBlk",
    "TaxiOut",
    "WheelsOff",
    # Arrival
    "WheelsOn",
    "TaxiIn",
    "CRSArrTime",
    "ArrTime",
    "ArrDelay",
    "ArrDelayMinutes",
    "ArrDel15",
    "ArrivalDelayGroups",
    "ArrTimeBlk",
    # Cancellation / diversion status
    "Cancelled",
    "CancellationCode",
    "Diverted",
    # Durations and distance
    "CRSElapsedTime",
    "ActualElapsedTime",
    "AirTime",
    "Flights",
    "Distance",
    "DistanceGroup",
    # Per-category delay columns
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
    # Remaining columns
    "FirstDepTime",
    "TotalAddGTime",
    "LongestAddGTime",
]

In [None]:
def airline_mapping(df_flights):
    """Return a copy of ``df_flights`` with airline codes translated to names.

    The ``Operating_Airline`` column is passed through the module-level
    ``airline_mapper`` lookup. Values that the lookup does not contain come
    back as missing values, which is how ``Series.map`` treats unmatched keys.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Parameters
    ----------
    df_flights : pandas.DataFrame
        Frame with an ``Operating_Airline`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, ``Operating_Airline`` mapped.
    """
    return df_flights.assign(
        Operating_Airline=df_flights["Operating_Airline"].map(airline_mapper)
    )

# Numeric columns whose missing values are replaced by 0 before being stored
# as 32-bit floats.
_FLOAT_COLUMNS = (
    "DepTime",
    "DepDelay",
    "DepDelayMinutes",
    "DepDel15",
    "DepartureDelayGroups",
    "TaxiOut",
    "WheelsOff",
    "WheelsOn",
    "TaxiIn",
    "ArrTime",
    "ArrDelay",
    "ArrDelayMinutes",
    "ArrDel15",
    "ArrivalDelayGroups",
    "CRSElapsedTime",
    "ActualElapsedTime",
    "AirTime",
    "Flights",
    "Distance",
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
    "FirstDepTime",
    "TotalAddGTime",
    "LongestAddGTime",
)

# Target dtypes for the columns that need no fill-in before conversion.
_COLUMN_DTYPES = {
    "Operating_Airline": "category",
    "Origin": "category",
    "Dest": "category",
    "DestState": "category",
    "OriginState": "category",
    "OriginCityName": "category",
    "DestStateName": "category",
    "DestCityName": "category",
    "DepTimeBlk": "category",
    "ArrTimeBlk": "category",
    "Cancelled": "bool",
    "Diverted": "bool",
    "OriginAirportID": "int16",
    "DestAirportID": "int16",
    "CRSDepTime": "int16",
    "CRSArrTime": "int16",
    "DistanceGroup": "int16",
}


def tweak_df_flights(df_flights):
    """Select the analysis columns and give them compact, explicit dtypes.

    Steps:

    1. keep only ``SELECTED_COLUMNS`` and strip the trailing space from the
       ``"Operating_Airline "`` header;
    2. replace missing values in the numeric columns with 0 and store them as
       ``float32``;
    3. fill missing ``CancellationCode`` values with ``"none"`` and make the
       column categorical;
    4. parse ``FlightDate`` as datetimes;
    5. cast the remaining label / flag / id columns per ``_COLUMN_DTYPES``.

    ``float32`` is used rather than ``float16`` because half precision only
    represents integers exactly up to 2048: an hhmm clock value such as 2359
    would be stored as 2360, and distances or delays above 2048 would be
    rounded as well.

    Parameters
    ----------
    df_flights : pandas.DataFrame
        Raw flights frame containing every column in ``SELECTED_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the selected columns in
        ``SELECTED_COLUMNS`` order.
    """
    # Column selection with a list already yields a new frame, and rename
    # returns another one, so no explicit copy is needed to protect the input.
    df_flights = df_flights[SELECTED_COLUMNS].rename(
        columns={"Operating_Airline ": "Operating_Airline"}
    )

    float_columns = {
        name: df_flights[name].fillna(0).astype("float32")
        for name in _FLOAT_COLUMNS
    }

    return df_flights.assign(
        **float_columns,
        CancellationCode=df_flights.CancellationCode.fillna("none").astype("category"),
        FlightDate=pd.to_datetime(df_flights.FlightDate),
    ).astype(_COLUMN_DTYPES)


# Swap the raw frame for its cleaned version, then print the resulting
# column dtypes and memory footprint.
df_flights = df_flights.pipe(tweak_df_flights)
df_flights.info()

In [None]:
# (number of rows, number of columns) of the cleaned frame.
df_flights.shape

### Airlines
Map each airline code to its airline name, then analyse the flights per airline.

In [None]:
#airlines = pd.read_csv("../../data/databases/flight_attributes/airlines.csv.zip", low_memory=False)

In [None]:

#airline_mapper = airlines.set_index("Code")["Description"]
#df_flights["Operating_Airline"] = df_flights["Operating_Airline"].map(airline_mapper)
# Airline codes ordered by how often they occur (value_counts sorts by count),
# translated to airline names through airline_mapper. Note that the result is
# an Index of names, not the counts themselves.
codes_by_frequency = df_flights["Operating_Airline"].value_counts().index
airline_counts = codes_by_frequency.map(airline_mapper)
airline_counts

In [None]:
# Preview the first rows of the cleaned frame.
df_flights.head()

In [None]:
# df_flights.info()

#### Flights by Airline in 2022

In [None]:
# Number of rows (one per flight) for every operating airline, most frequent first.
operating_airline = df_flights["Operating_Airline"]
airline_counts = operating_airline.value_counts()
airline_counts.plot.barh(figsize=(10, 10))


In [None]:
# Same chart with bars ordered by count, smallest first in the sorted series.
airline_counts.sort_values().plot.barh(figsize=(10, 10))

In [None]:
# The sorted counts as a one-column table.
airline_counts.sort_values().to_frame(name="Row Count")

In [None]:
# Interactive horizontal bar chart of the sorted flight counts.
flights_per_airline = airline_counts.sort_values(ascending=True)
px.bar(
    flights_per_airline,
    orientation="h",
    template="plotly_dark",
    title="Flights per airline",
    height=700,
)

### Cancelled flights, grouped by year

In [None]:
# The mean of the boolean "Cancelled" column is the cancelled share as a
# fraction between 0 and 1.
pct_cancelled = df_flights["Cancelled"].mean()

# The "%" format type multiplies by 100 and appends the percent sign, so the
# printed number really is a percentage (a plain "f" format would print the
# raw fraction next to a "%" sign).
print(f"{pct_cancelled:.2%} of flights are cancelled")
pct_cancelled

In [None]:
# How many flights were cancelled (True) versus not cancelled (False).
df_flights["Cancelled"].value_counts()

In [None]:
#df_flights.query("Year==2022").query("Cancelled")
# Share of cancelled flights for each flight date, as a one-column table.
cancelled_by_date = df_flights.groupby("FlightDate")["Cancelled"]
cancelled_by_date.mean().to_frame()

In [None]:
# Flights from January through June 2022. The bounds are inclusive so that
# 1 January and 30 June themselves belong to the window.
df_flights.query("'2022-01-01' <= FlightDate <= '2022-06-30'")

In [None]:
# Per-airline summary for calendar year 2022: number of departed and cancelled
# flights, their total, and the cancelled share in percent.
cancelled_flights_analysis = (
    df_flights
    # Inclusive bounds: 1 January and 31 December are part of 2022.
    .query("'2022-01-01' <= FlightDate <= '2022-12-31'")
    # observed=True counts only airlines that occur in the filtered data
    # instead of every category level of the categorical column.
    .groupby(["Operating_Airline", "Cancelled"], observed=True)
    .size()
    # An airline with no cancelled (or no departed) flights gets 0, not NaN.
    .unstack(fill_value=0)
    # Guarantee both columns exist even if one outcome never occurs.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={False: "Departed", True: "Cancelled"})
    .rename_axis(None, axis=1)
    .reset_index()
    .assign(
        Total=lambda counts: counts["Departed"] + counts["Cancelled"],
        Pct_Cancelled=lambda counts: counts["Cancelled"] / counts["Total"] * 100,
    )
)


In [None]:
# TODO: enrich with geographical information about each flight and with weather information.
cancelled_flights_analysis.head()

In [None]:
# Cancelled-flight counts per airline, ordered by count, as horizontal bars.
cancellations_by_airline = (
    cancelled_flights_analysis
    .set_index("Operating_Airline")
    .sort_values("Cancelled")["Cancelled"]
)
cancellations_by_airline.plot.barh(
    figsize=(8, 8),
    width=.8,
    edgecolor="black",
    title="Flight Cancellations by Airline in 2022",
)
plt.show()


In [None]:
# Same counts as above, drawn on an explicit Axes so each bar can be labelled
# with its value.
fig, ax = plt.subplots(figsize=(8, 8))
cancelled_counts = (
    cancelled_flights_analysis
    .set_index("Operating_Airline")
    .sort_values("Cancelled")["Cancelled"]
)
cancelled_counts.plot.barh(
    ax=ax,
    title="Total Operations Cancelled by Airline in 2022",
    width=1,
)

# containers[0] is the single bar series drawn on this Axes.
ax.bar_label(ax.containers[0], padding=5, fontsize=9, color='b')
plt.show()


In [None]:
# Cancelled share (percent) per airline, ordered by share, with value labels.
fig, ax = plt.subplots(figsize=(10, 8))
pct_cancelled_by_airline = (
    cancelled_flights_analysis
    .set_index("Operating_Airline")
    .sort_values("Pct_Cancelled")["Pct_Cancelled"]
)
pct_cancelled_by_airline.plot.barh(
    ax=ax,
    title="Percentage of Operations Cancelled by Airline in 2022",
    width=1,
    color=current_palette[6],
)

# Label every bar with its percentage, three decimals.
ax.bar_label(ax.containers[0], fmt='%.3f%%', padding=5, fontsize=9, color='b')
plt.show()


In [None]:
# Check the dtype and non-null count of the FlightDate column.
df_flights[["FlightDate"]].info()

In [None]:
# All flights dated within 2022; the bounds are inclusive so that 1 January
# and 31 December are kept.
# NOTE(review): despite its name this frame is not filtered on `Cancelled`
# (the next cell applies that filter) - confirm whether that is intended.
cancelled_flights = df_flights.query("'2022-01-01' <= FlightDate <= '2022-12-31'")
cancelled_flights

In [None]:
# Number of cancelled flights per airline over the whole of 2022. The date
# bounds are inclusive so the first and last day of the year are counted.
cancelled_flights_count = (
    df_flights
    .query("'2022-01-01' <= FlightDate <= '2022-12-31'")
    .query("Cancelled")["Operating_Airline"]
    .value_counts()
)
cancelled_flights_count

Analysis for January - June 2022

In [None]:
# Number of flights in each calendar month.
df_flights["FlightDate"].dt.month.value_counts()

In [None]:
# Number of flights in each quarter.
df_flights["FlightDate"].dt.quarter.value_counts()

In [None]:
# Number of flights on each individual date.
df_flights["FlightDate"].value_counts()

In [None]:
# Per-airline cancellation summary restricted to January - June 2022, matching
# the chart title below.
# NOTE: this reassigns `cancelled_flights_analysis`; cells above that use the
# full-year table must be re-run before they are re-plotted.
cancelled_flights_analysis = (
    df_flights
    # Inclusive bounds: 1 January and 30 June are part of the window.
    .query("'2022-01-01' <= FlightDate <= '2022-06-30'")
    # observed=True counts only airlines that occur in the filtered data
    # instead of every category level of the categorical column.
    .groupby(["Operating_Airline", "Cancelled"], observed=True)
    .size()
    # An airline with no cancelled (or no departed) flights gets 0, not NaN.
    .unstack(fill_value=0)
    # Guarantee both columns exist even if one outcome never occurs.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={False: "Departed", True: "Cancelled"})
    .rename_axis(None, axis=1)
    .reset_index()
    .assign(
        Total=lambda counts: counts["Departed"] + counts["Cancelled"],
        Pct_Cancelled=lambda counts: counts["Cancelled"] / counts["Total"] * 100,
    )
)

fig, ax = plt.subplots(figsize=(8, 8))
(
    cancelled_flights_analysis
    .set_index("Operating_Airline")
    .sort_values("Pct_Cancelled")["Pct_Cancelled"]
    .plot(
        kind="barh",
        ax=ax,  # draw on the Axes created above rather than the implicit current one
        title="Percentage of Operations Cancelled by Airline in January - June 2022",
        width=1,
        color=current_palette[2],
    )
)
# Label every bar with its percentage, three decimals.
ax.bar_label(ax.containers[0], fmt='%.3f%%', padding=5, fontsize=9, color='b')
plt.show()