In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px

#### Load the data

In [None]:
## One-time conversion: the raw CSV was re-saved as Parquet, which is smaller on disk
## and faster to read. Uncomment and run once if the Parquet file is missing.
# df = pd.read_csv("../../data/eda/flights/flight_data_082022/flight_data_082022.csv", low_memory=False)
# df.to_parquet("../../data/eda/flights/flight_data_082022/flight_data_082022.parquet",engine='fastparquet')

flights_parquet = "../../data/eda/flights/flight_data_082022/flight_data_082022.parquet"
df_flights = pd.read_parquet(flights_parquet)


#### Data cleaning


In [None]:
## Clean the column labels: strip stray leading/trailing whitespace from every
## column name (e.g. 'Operating_Airline ' -> 'Operating_Airline'), not just one
## hard-coded column. Non-string labels are left untouched.
df_flights = df_flights.rename(
    columns=lambda label: label.strip() if isinstance(label, str) else label
)
## Store the cancellation flag as a boolean so it can be used directly as a mask.
df_flights["Cancelled"] = df_flights["Cancelled"].astype("bool")

In [None]:
# Show every column name as a plain Python list (avoids the truncated Index repr).
list(df_flights.columns)

In [None]:
# Row and column counts of the flights table.
n_rows, n_cols = df_flights.shape
n_rows, n_cols

In [None]:
# Peek at the last five rows of the table.
df_flights.tail(n=5)

#### Analysis of Features
Identification of Potential Features 
- Time Series Features:
    - Year
    - Quarter
    - Month
    - DayofMonth
    - DayOfWeek
    - FlightDate
- Flight Info:
    - Airline: Marketing_Airline_Network
    - Origin: Origin, OriginCityName
    - Destination: Dest, DestCityName
- Departure / Delay Info:
    - CRSDepTime: Scheduled Departure
    - DepTime: Actual Departure
    - DepDelay: Departure Delay

In [None]:
# Full list of available columns, for reference when picking features.
df_flights.columns.tolist()

#### Delays by Airline in August 2022

In [None]:
# Lookup table with a `Code` and a `Description` column per airline, read from GitHub.
airline_csv = "https://raw.githubusercontent.com/jenbam/airlines/master/data-raw/airlines.csv"
airlines = pd.read_csv(airline_csv)
## save locally
#airlines.to_csv("airlines.csv")

# Spot-check a single code in the lookup table.
airlines[airlines["Code"] == "9E"]


In [None]:
# Number of rows (flights) per operating-airline code, most frequent first.
operating_airline = df_flights["Operating_Airline"]
airline_counts = operating_airline.value_counts()
airline_counts.head(5)

In [None]:
# Count flights per operating-airline code, then relabel the index with the
# human-readable airline description from the `airlines` lookup table.
airline_counts = df_flights["Operating_Airline"].value_counts()

# Code -> Description lookup. Duplicate codes would make the mapping raise,
# so keep only the first description for each code.
code_to_name = airlines.drop_duplicates(subset="Code").set_index("Code")["Description"]

# Codes absent from the lookup would otherwise become NaN labels; fall back to
# the raw code so every bar/row stays identifiable.
codes = airline_counts.index.to_series()
airline_counts.index = pd.Index(codes.map(code_to_name).fillna(codes), name="Airline")
airline_counts.head()

In [None]:
# Horizontal bar chart of flight counts, in the current (unsorted) index order.
airline_counts.plot.barh(figsize=(10, 10))

In [None]:
# Same chart, with the counts sorted in ascending order first.
airline_counts.sort_values().plot.barh(figsize=(10, 10))

In [None]:
# First rows of the flights whose Year is 2022.
df_flights.loc[df_flights["Year"] == 2022].head()

In [None]:
# Ascending counts as a one-column table named "Row Count".
airline_counts.sort_values().to_frame(name="Row Count")

In [None]:
# Interactive horizontal bar chart of flights per airline, sorted ascending by count.
counts_ascending = airline_counts.sort_values(ascending=True)
px.bar(
    counts_ascending,
    title="Flights per airline",
    orientation="h",
    height=700,
    template="plotly_dark",
)

In [None]:
# Make sure the cancellation flag is boolean before it is used as a filter below.
# Re-applying the cast to an already-boolean column leaves it unchanged.
is_cancelled = df_flights["Cancelled"].astype(bool)
df_flights["Cancelled"] = is_cancelled

In [None]:
# Inspect the cancellation-code column.
df_flights.loc[:, "CancellationCode"]

In [None]:
# How many flights were cancelled versus not cancelled.
cancelled_flag = df_flights["Cancelled"]
cancelled_flag.value_counts()

In [None]:
# Rows for cancelled flights only (uses the boolean `Cancelled` column as a mask).
df_flights[df_flights["Cancelled"]]