In [None]:
import pandas as pd
import os
from glob import glob
import re
from collections import defaultdict

# Build one combined parquet file per year from the monthly raw CSV files.
# The monthly raw files must be present in csv_folder; only files whose path
# contains "Flights_<4-digit year>_" are picked up.
csv_folder = 'raw/'
output_dir = 'raw/Combined_Flights'  # folder to save yearly files
# NOTE(review): the loading cell further down reads from 'combined_flights/' --
# confirm whether the yearly files are meant to be moved there by hand.

# Sorted so the row order inside each yearly file does not depend on the
# arbitrary order in which the filesystem lists the CSVs.
csv_files = sorted(glob(os.path.join(csv_folder, '*.csv')))
# Dictionary to collect DataFrames by year
yearly_data = defaultdict(list)

# The trailing space in "Operating_Airline " matches the raw header; the
# column names are stripped later in the notebook.
used_columns = ["Year", "Month", "FlightDate", "Operating_Airline ", "Origin", "Dest", "Flights","DepDelay", "DepDelayMinutes", "ArrDelay", "ArrDelayMinutes", "Diverted"]

# Loop through files and group by year
for file in csv_files:
    match = re.search(r'Flights_(\d{4})_', file)
    if match is None:
        # Do not claim to read a file that is actually ignored.
        print(f"Skipping {file}: no 'Flights_<year>_' in its name")
        continue
    print(f"Reading {file}...")
    year = match.group(1)
    df = pd.read_csv(file, usecols=used_columns, low_memory=False)
    # Keep every row that is not flagged as diverted.
    df = df[df['Diverted'] != 1.00]
    yearly_data[year].append(df)

# to_parquet does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Combine and write yearly files
for year, dfs in yearly_data.items():
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_parquet(os.path.join(output_dir, f'Combined_Flights_{year}.parquet'), index=False)
    print(f'Written: Combined_Flights_{year}.parquet')

In [1]:
import pandas as pd
from glob import glob
import os

output_dir = 'combined_flights/'
# Sorted so the row order of df_all is the same on every run; a bare glob()
# returns files in whatever order the filesystem lists them.
parquet_files = sorted(glob(os.path.join(output_dir, '*.parquet')))
if not parquet_files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files found in {output_dir!r}")

# Read combined parquet files and create a dataframe
dfs = []
for file in parquet_files:
    print(f"Reading {file}...")
    dfs.append(pd.read_parquet(file))
# ignore_index builds the fresh 0..n-1 index directly during the concat.
df_all = pd.concat(dfs, ignore_index=True)
df_all.info()

Reading combined_flights/Combined_Flights_2018.parquet...
Reading combined_flights/Combined_Flights_2019.parquet...
Reading combined_flights/Combined_Flights_2022.parquet...
Reading combined_flights/Combined_Flights_2020.parquet...
Reading combined_flights/Combined_Flights_2021.parquet...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29125433 entries, 0 to 29125432
Data columns (total 12 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Year                int64  
 1   Month               int64  
 2   FlightDate          object 
 3   Operating_Airline   object 
 4   Origin              object 
 5   Dest                object 
 6   DepDelay            float64
 7   DepDelayMinutes     float64
 8   ArrDelay            float64
 9   ArrDelayMinutes     float64
 10  Diverted            float64
 11  Flights             float64
dtypes: float64(6), int64(2), object(4)
memory usage: 2.6+ GB


In [2]:
# Diverted flights are not direct and have no delay values, so the column is
# not needed for the analysis. errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
df_all = df_all.drop(columns="Diverted", errors="ignore")
df_all.columns

Index(['Year', 'Month', 'FlightDate', 'Operating_Airline ', 'Origin', 'Dest',
       'DepDelay', 'DepDelayMinutes', 'ArrDelay', 'ArrDelayMinutes',
       'Flights'],
      dtype='object')

In [3]:
# Fields that must be populated for a row to be kept.
columns = [
    "FlightDate",
    "Operating_Airline",
    "Origin",
    "Dest",
    "Flights",
    "DepDelay",
    "ArrDelay",
]

# Remove stray whitespace from the headers (e.g. "Operating_Airline ").
stripped_names = df_all.columns.str.strip()
df_all.columns = stripped_names

# Convert FlightDate to datetime
df_all["FlightDate"] = pd.to_datetime(df_all["FlightDate"])

# Discard rows missing any of the required fields, then report the new size.
df_all = df_all.dropna(subset=columns)
df_all.shape

(28346524, 11)

In [4]:
# Replace the airline codes with full airline names using the Airlines.csv lookup.
airline_map = pd.read_csv("combined_flights/Airlines.csv")
airline_map.columns = airline_map.columns.str.strip()

# Left join keeps every flight; the lookup's Description becomes "Airline" and
# both code columns are removed afterwards.
df_all = (
    df_all
    .merge(airline_map, how='left', left_on='Operating_Airline', right_on='Code')
    .rename(columns={"Description": "Airline"})
    .drop(columns=["Code", 'Operating_Airline'])
)
df_all.head()

Unnamed: 0,Year,Month,FlightDate,Origin,Dest,DepDelay,DepDelayMinutes,ArrDelay,ArrDelayMinutes,Flights,Airline
0,2018,8,2018-08-07,SFO,FLL,120.0,120.0,128.0,128.0,1.0,JetBlue Airways
1,2018,8,2018-08-08,SFO,FLL,38.0,38.0,54.0,54.0,1.0,JetBlue Airways
2,2018,8,2018-08-09,SFO,FLL,-1.0,0.0,-1.0,0.0,1.0,JetBlue Airways
3,2018,8,2018-08-10,SFO,FLL,157.0,157.0,146.0,146.0,1.0,JetBlue Airways
4,2018,8,2018-08-11,SFO,FLL,3.0,3.0,3.0,3.0,1.0,JetBlue Airways


In [5]:
# Number of distinct airlines; dropna=False counts a missing name as its own value.
df_all["Airline"].nunique(dropna=False)

28

In [6]:
# Busiest routes: label each flight "<Origin>-<Dest>" and count rows per label.
df_all["Route"] = df_all["Origin"] + '-' + df_all["Dest"]

route_sizes = df_all.groupby("Route").size()
df_new = route_sizes.rename('TotalFlights').reset_index()

# Show the ten routes with the most flights.
df_new.sort_values(by='TotalFlights', ascending=False).head(10)

Unnamed: 0,Route,TotalFlights
7256,SFO-LAX,54515
4145,LAX-SFO,54468
4094,LAX-LAS,43868
3967,LAS-LAX,43765
4282,LGA-ORD,42249
5580,ORD-LGA,42163
3162,HNL-OGG,40191
5346,OGG-HNL,40147
3813,JFK-LAX,39565
4092,LAX-JFK,39561


In [7]:
# Number of distinct routes in the data.
df_new["Route"].nunique()

8187

In [None]:
# Five airports are taken from the busiest routes so that the user can choose between them.
# This cell generates the graphs in JSON format; they are saved under the graphs folder.
# For each month it generates: 1. Flight traffic by airline  2. Average departure delay  3. Average arrival delay
import plotly.express as px

def write_to_file(filename, fig):
    """Serialise a figure to JSON and save it as ``graphs/<filename>``.

    Args:
        filename: Name of the JSON file to create inside the ``graphs`` folder.
        fig: Object exposing ``to_json()`` that returns the JSON text
            (the callers in this notebook pass Plotly figures).

    The ``graphs`` folder is created when missing, so the function also works
    on a fresh checkout instead of failing with FileNotFoundError.
    """
    import os  # local import keeps the helper usable independent of cell order

    os.makedirs("graphs", exist_ok=True)
    # Explicit encoding so the output does not depend on the platform default.
    with open(os.path.join("graphs", filename), "w", encoding="utf-8") as f:
        f.write(fig.to_json())


def flight_traffic(org, dest, month, daily_counts):
    """Draw a per-day flight-count bar chart, coloured by airline, and save it as JSON.

    Args:
        org: Origin airport code; used in the title and the file name.
        dest: Destination airport code; used in the title and the file name.
        month: Month number appended to the title and the file name.
        daily_counts: Frame providing the 'FlightDate', 'TotalFlights' and
            'Airline' columns that the chart plots.
    """
    chart_title = f"{org}-{dest} Daily Flight Traffic by Airline-{month}"
    fig = px.bar(
        daily_counts,
        x='FlightDate',
        y='TotalFlights',
        color='Airline',
        title=chart_title,
    )

    # Other available templates: 'ggplot2', 'simple_white', 'plotly',
    # 'plotly_white', 'plotly_dark', 'presentation', 'xgridoff', 'ygridoff',
    # 'gridon', 'none'
    layout_options = dict(
        template='seaborn',
        title_font_size=20,
        xaxis_title='Day of Month',
        yaxis_title='Number of Flights',
        legend_title='Airlines',
    )
    fig.update_layout(**layout_options)

    write_to_file(f"{org}_{dest}_{month}_traffic.json", fig)

def departure_delay(org, dest, month, dep_delay):
    """Draw the average departure delay per airline as a line chart and save it as JSON.

    Args:
        org: Origin airport code; used in the title and the file name.
        dest: Destination airport code; used in the title and the file name.
        month: Month number appended to the title and the file name.
        dep_delay: Frame providing the 'FlightDate', 'AverageDepDelay' and
            'Airline' columns that the chart plots.
    """
    chart_title = f"{org}-{dest} Average Delays per Airline on Departure-{month}"
    fig = px.line(
        dep_delay,
        x='FlightDate',
        y='AverageDepDelay',
        color='Airline',
        title=chart_title,
    )

    layout_options = dict(
        template='seaborn',
        title_font_size=20,
        xaxis_title='Day of Month',
        yaxis_title='Delay in minutes',
        legend_title='Airlines',
    )
    fig.update_layout(**layout_options)

    write_to_file(f"{org}_{dest}_{month}_departure_delay.json", fig)

def arrival_delay(org, dest, month, arr_delay):
    """Draw the average arrival delay per airline as a line chart and save it as JSON.

    Args:
        org: Origin airport code; used in the title and the file name.
        dest: Destination airport code; used in the title and the file name.
        month: Month number appended to the title and the file name.
        arr_delay: Frame providing the 'FlightDate', 'AverageArrDelay' and
            'Airline' columns that the chart plots.
    """
    chart_title = f"{org}-{dest} Average Delays per Airline on Arrival-{month}"
    fig = px.line(
        arr_delay,
        x='FlightDate',
        y='AverageArrDelay',
        color='Airline',
        title=chart_title,
    )

    layout_options = dict(
        template='seaborn',
        title_font_size=20,
        xaxis_title='Day of Month',
        yaxis_title='Delay in minutes',
        legend_title='Airlines',
    )
    fig.update_layout(**layout_options)

    write_to_file(f"{org}_{dest}_{month}_arrival_delay.json", fig)

# Airports taken from the busiest routes; graphs are generated for every
# ordered (origin, destination) pair of distinct airports, for each month.
airports = ["SFO", "LAX", "LAS", "ORD", "LGA"]

for org_airport in airports:
    for dest_airport in airports:
        if org_airport == dest_airport:
            continue

        # Select the route once. The month loop below then filters this much
        # smaller frame instead of rescanning all of df_all twelve times.
        df_route = df_all[(df_all["Origin"] == org_airport) & (df_all["Dest"] == dest_airport)]

        for month in range(1,13):
            df_filt = df_route["Month"] == month
            df_month = df_route[df_filt]

            # One grouping (day of month x airline) shared by all three graphs.
            by_day_airline = df_month.groupby([df_month["FlightDate"].dt.day, "Airline"])

            # Flight Traffic
            daily_counts = by_day_airline.size().reset_index(name="TotalFlights")
            daily_counts = daily_counts.sort_values(by=["FlightDate", "TotalFlights"], ascending=[True, False])
            flight_traffic(org_airport, dest_airport, month, daily_counts)

            # Departure Delay
            dep_delay = by_day_airline["DepDelay"].mean().reset_index(name="AverageDepDelay")
            departure_delay(org_airport, dest_airport, month, dep_delay)

            # Arrival Delay
            arr_delay = by_day_airline["ArrDelay"].mean().reset_index(name="AverageArrDelay")
            arrival_delay(org_airport, dest_airport, month, arr_delay)

In [8]:
# Top 10 busiest airports by total movements (departures + arrivals).
arrivals = df_all.groupby("Dest")["Flights"].sum().rename('Arrivals')
departures = df_all.groupby("Origin")["Flights"].sum().rename('Departures')
# An airport that appears only as an origin or only as a destination gets NaN
# in the other column after the concat; fill with 0 so astype(int) cannot fail.
airport_traffic = pd.concat([departures, arrivals], axis=1).fillna(0).astype(int)
airport_traffic["Total_Flights"] = airport_traffic["Departures"] + airport_traffic["Arrivals"]
airport_traffic = airport_traffic.nlargest(10, "Total_Flights")
airport_traffic

Unnamed: 0,Departures,Arrivals,Total_Flights
ATL,1347021,1347297,2694318
ORD,1327423,1325511,2652934
DEN,1137846,1136193,2274039
DFW,1064843,1062170,2127013
CLT,880449,879096,1759545
LAX,833505,834181,1667686
SEA,711511,710776,1422287
IAH,672533,671574,1344107
PHX,661851,661392,1323243
LAS,648466,648924,1297390


In [9]:
import plotly.express as px
from IPython.display import IFrame


# Display names shown on the y axis in place of the airport codes.
airports_dict = {
    "ATL": "Hartsfield-Jackson Int",
    "ORD": "O'Hare International",
    "DEN": "Denver Intl.",
    "DFW": "Dallas/Ft Worth Intl",
    "CLT": "Douglas Intl",
    "LAX": "Los Angeles Intl.",
    "SEA": "Seattle-Tacoma Intl",
    "IAH": "George Bush Intercont.",
    "PHX": "Sky Harbor Intl",
    "LAS": "HARRY REID INTL",
}

# Horizontal bars: one per airport (the frame's index), coloured per airport.
fig = px.bar(
    airport_traffic,
    x='Total_Flights',
    y=airport_traffic.index,
    color=airport_traffic.index,
    title='Top 10 busiest Airports',
)

airport_layout = dict(
    template='plotly',
    title_font_size=20,
    yaxis_title='Airports',
    xaxis_title='Number of Flights',
    legend_title='Airports',
)
fig.update_layout(**airport_layout)
fig.update_yaxes(labelalias=airports_dict)
fig.show()

# Save a standalone HTML copy and display it inline in the notebook.
fig.write_html("Top10_busiest_Airports.html")
IFrame(src="Top10_busiest_Airports.html", width=700, height=600)

In [10]:
# Top 10 airlines by number of flight records.
per_airline = df_all.groupby("Airline", as_index=False)["Flights"].count()
df_airlines = per_airline.nlargest(10, 'Flights').reset_index(drop=True)
df_airlines

Unnamed: 0,Airline,Flights
0,Southwest Airlines Co.,5292591
1,Delta Air Lines Inc.,3242141
2,SkyWest Airlines Inc.,3077298
3,American Airlines Inc.,3031275
4,United Air Lines Inc.,2301368
5,Republic Airlines,1239828
6,JetBlue Airways,1073460
7,Envoy Air,1033979
8,Endeavor Air Inc.,974641
9,PSA Airlines,922947


In [11]:
# Horizontal bar chart of the airlines in df_airlines, one colour per airline.
fig = px.bar(
    df_airlines,
    x='Flights',
    y="Airline",
    color="Airline",
    title='Top 10 Airlines',
)

airline_layout = dict(
    template='plotly',
    title_font_size=20,
    yaxis_title='Airlines',
    xaxis_title='Number of Flights',
    legend_title='Airlines',
)
fig.update_layout(**airline_layout)
fig.show()

# Save a standalone HTML copy and display it inline in the notebook.
fig.write_html("Top10_Airlines.html")
IFrame(src="Top10_Airlines.html", width=700, height=600)

In [12]:
import calendar 

# Count flight records per calendar month and attach the month's English name.
df_month = df_all.groupby("Month")["Flights"].count().reset_index()
df_month["Month_Name"] = [calendar.month_name[number] for number in df_month["Month"]]
df_month

Unnamed: 0,Month,Flights,Month_Name
0,1,2613247,January
1,2,2265778,February
2,3,2633624,March
3,4,2357394,April
4,5,2308172,May
5,6,2369762,June
6,7,2636175,July
7,8,2176676,August
8,9,2176366,September
9,10,2302554,October


In [None]:
# Bar chart of the number of flights in each month.
fig = px.bar(
    df_month,
    x="Month_Name",
    y='Flights',
    title='Flight Traffic per Month',
)

monthly_layout = dict(
    template='plotly',
    title_font_size=20,
    xaxis_title='Months',
    yaxis_title='Number of Flights',
    legend_title='Airlines',
)
fig.update_layout(**monthly_layout)

fig.show()

# Save a standalone HTML copy and display it inline in the notebook.
fig.write_html("Monthly_Traffic.html")
IFrame(src="Monthly_Traffic.html", width=700, height=600)