In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px

#### Convert csv to parquet format to reduce dimension

In [13]:
## This is done once to load the data
#df = pd.read_csv("../../data/eda/flights/flight_data_082022/flight_data_082022.csv", low_memory=False)
#df.to_parquet("../../data/eda/flights/flight_data_082022/flight_data_082022.parquet",engine='fastparquet')

#### Upload the data

In [14]:
df_flights = pd.read_parquet("../../data/eda/flights/flight_data_082022/flight_data_082022.parquet")
## clean the data by removing spaces
df_flights= df_flights.rename(columns={'Operating_Airline ' : 'Operating_Airline'})

In [15]:
[x for x in df_flights.columns]

['Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'FlightDate',
 'Marketing_Airline_Network',
 'Operated_or_Branded_Code_Share_Partners',
 'DOT_ID_Marketing_Airline',
 'IATA_Code_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'Originally_Scheduled_Code_Share_Airline',
 'DOT_ID_Originally_Scheduled_Code_Share_Airline',
 'IATA_Code_Originally_Scheduled_Code_Share_Airline',
 'Flight_Num_Originally_Scheduled_Code_Share_Airline',
 'Operating_Airline',
 'DOT_ID_Operating_Airline',
 'IATA_Code_Operating_Airline',
 'Tail_Number',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'Origin',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'Dest',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'CRSDepTime',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'DepartureDelayGroups',
 'D

In [16]:
df_flights.shape

(613649, 120)

In [17]:
df_flights.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
1,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
2,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
3,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
4,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,


#### Analysis of Features
Identification of Potential Features 
- Time Series Features:
    'Year',
    'Quarter',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'FlightDate',
- Flight Info:
    - Airline: Marketing_Airline_Network
    - Origin: Origin, OriginCityName
    - Destination: Dest, DestCityName
- Departure / Delay Info:
    - CRSDepTime: Scheduled Departure
    - DepTime: Actual Departure
    - DepDelay: Departure Delay

In [18]:
[c for c in df_flights.columns]

['Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'FlightDate',
 'Marketing_Airline_Network',
 'Operated_or_Branded_Code_Share_Partners',
 'DOT_ID_Marketing_Airline',
 'IATA_Code_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'Originally_Scheduled_Code_Share_Airline',
 'DOT_ID_Originally_Scheduled_Code_Share_Airline',
 'IATA_Code_Originally_Scheduled_Code_Share_Airline',
 'Flight_Num_Originally_Scheduled_Code_Share_Airline',
 'Operating_Airline',
 'DOT_ID_Operating_Airline',
 'IATA_Code_Operating_Airline',
 'Tail_Number',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'Origin',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'Dest',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'CRSDepTime',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'DepartureDelayGroups',
 'D

#### Delays by Airline in August 2022

In [19]:
airline_ = pd.read_csv("../../data/databases/flight_attributes/airlines.csv")
airline_.query("IATA_CODE == '9E'")

Unnamed: 0,IATA_CODE,AIRLINE
126,9E,Endeavor Air Inc.


In [20]:
airlines = pd.read_csv("https://raw.githubusercontent.com/jenbam/airlines/master/data-raw/airlines.csv")
## save locally
#airlines.to_csv("airlines.csv") 
airlines.query("Code == '9E'")

Unnamed: 0,Code,Description
126,9E,Endeavor Air Inc.


In [21]:
df_flights.query('Year == 2022').head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
1,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
2,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
3,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,
4,2022,3,8,17,3,2022-08-17,WN,WN,19393,WN,...,,,,,,,,,N,


In [22]:
df_flights["Operating_Airline "].value_counts()

KeyError: 'Operating_Airline '