In [15]:

!pip install pandas
!pip install numpy
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install gdown




In [16]:
# Import portion of a package
import matplotlib.pyplot as plt  # Most common visualization package that a lot of others are based on

# Import full packages under custom name
import numpy as np  # Common package for numerical methods
import pandas as pd  # Common package for data storage/manipulation
import seaborn as sns  # Common package for statistical visualizations

# Import portion of a package
import scipy.stats as stats
from sklearn.impute import SimpleImputer as Imputer  # Specific class from common machine learning package

#more packages
import gdown

In [17]:

# Google Drive file ID of the flight dataset and the direct-download URL built from it
file_id = "13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r"
url = f"https://drive.google.com/uc?id={file_id}"
csv_path = "flight_data.csv"

# The CSV is large (~1.3 GB), so read the local copy when one exists and only
# fall back to downloading when it is missing. This keeps "Restart & Run All"
# from re-fetching the file on every run.
try:
    flight_data = pd.read_csv(csv_path)
except FileNotFoundError:
    gdown.download(url, csv_path, quiet=False)
    flight_data = pd.read_csv(csv_path)

print(flight_data.shape)
print(flight_data.head())

Downloading...
From (original): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r
From (redirected): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r&confirm=t&uuid=70d37155-2ae8-4dd4-a56a-da0505795cbf
To: /content/flight_data.csv
100%|██████████| 1.32G/1.32G [00:27<00:00, 47.5MB/s]


(7546988, 32)
   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_

In [18]:
# Work on a copy so the freshly loaded frame stays available unchanged.
df = flight_data.copy()

# Quick sanity check: first rows, then the full list of column names.
print(df.head())
print(list(df.columns))

   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  MKT_CARRIER_AIRLINE_ID  \
0  2024        1      1             1            1                   19393   
1  2024        1      1             1            1                   19393   
2  2024        1      1             1            1                   19393   
3  2024        1      1             1            1                   19393   
4  2024        1      1             1            1                   19393   

   ORIGIN_AIRPORT_ID  ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_NAME  DEST_AIRPORT_ID  \
0              10140                1014005  Albuquerque, NM            10423   
1              10140                1014005  Albuquerque, NM            10423   
2              10140                1014005  Albuquerque, NM            10800   
3              10140                1014005  Albuquerque, NM            10821   
4              10140                1014005  Albuquerque, NM            11259   

   ...  CANCELLATION_CODE CRS_ELAPSED_TIME  

In [19]:

# ---------------------------------------------------------
# Helpers shared by the bucket features below
# ---------------------------------------------------------
QUARTILE_LABELS = [1, 2, 3, 4]  # bottom quartile is 1, top quartile is 4


def _flights_per_day(frame, airport_col):
    """Return, for every row of ``frame``, the number of rows sharing its FL_DATE and ``airport_col``.

    Uses ``groupby(...).transform('size')`` so the result is aligned to
    ``frame``'s index and can be assigned directly as a column. Unlike a
    groupby + merge, re-running the cell simply overwrites the column instead
    of producing ``_x`` / ``_y`` duplicates.
    """
    return frame.groupby(['FL_DATE', airport_col])[airport_col].transform('size')


# ---------------------------------------------------------
# Create FL_DATE (needed for daily counts)
# ---------------------------------------------------------
# pd.to_datetime assembles dates from columns named year / month / day, so
# DAY_OF_MONTH has to be renamed to DAY first. On a re-run the column is
# already called DAY and the rename is a no-op.
df = df.rename(columns={'DAY_OF_MONTH': 'DAY'})
df['FL_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

# ---------------------------------------------------------
# 1) ORIGIN BUCKET (quartiles of flights per day per origin)
# ---------------------------------------------------------
# Note: the quartile edges are computed over the rows of df, so an
# (airport, day) pair with many rows contributes many values to the quantiles.
df['origin_flights_day'] = _flights_per_day(df, 'ORIGIN_AIRPORT_ID')
df['origin_bucket'] = pd.qcut(df['origin_flights_day'], q=4, labels=QUARTILE_LABELS)

# ---------------------------------------------------------
# 2) DESTINATION BUCKET (quartiles of flights per day per destination)
# ---------------------------------------------------------
df['dest_flights_day'] = _flights_per_day(df, 'DEST_AIRPORT_ID')
df['destination_bucket'] = pd.qcut(df['dest_flights_day'], q=4, labels=QUARTILE_LABELS)

# ---------------------------------------------------------
# 3) DISTANCE BUCKET (quartiles of distance)
# ---------------------------------------------------------
df['distance_bucket'] = pd.qcut(df['DISTANCE'], q=4, labels=QUARTILE_LABELS)

# ---------------------------------------------------------
# 4) AIRLINE BUCKET
# bottom 6 = bucket 0
# top 4    = bucket 1
# ---------------------------------------------------------
airline_counts = df['MKT_CARRIER_AIRLINE_ID'].value_counts()

bottom_6 = airline_counts.sort_values().index[:6]
top_4 = airline_counts.sort_values(ascending=False).index[:4]

# Carriers in neither group keep None. If the two groups overlap (fewer than
# ten distinct carriers), the top-4 assignment wins because it runs last.
df['airline_bucket'] = None
df.loc[df['MKT_CARRIER_AIRLINE_ID'].isin(bottom_6), 'airline_bucket'] = 0
df.loc[df['MKT_CARRIER_AIRLINE_ID'].isin(top_4), 'airline_bucket'] = 1

# ---------------------------------------------------------
# Final preview
# ---------------------------------------------------------
df[['origin_bucket', 'destination_bucket', 'distance_bucket', 'airline_bucket']].head()

Unnamed: 0,origin_bucket,destination_bucket,distance_bucket,airline_bucket
0,1,2,2,1
1,1,2,2,1
2,1,1,3,1
3,1,2,4,1
4,1,2,2,1


In [20]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,index,FL_DATE,origin_flights_day,origin_bucket,dest_flights_day,destination_bucket,distance_bucket,airline_bucket
0,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,0.0,0,2024-01-01,67,1,241,2,2,1
1,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,0.0,1,2024-01-01,67,1,241,2,2,1
2,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10800,...,0.0,0.0,2,2024-01-01,67,1,90,1,3,1
3,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10821,...,0.0,0.0,3,2024-01-01,67,1,265,2,4,1
4,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",11259,...,0.0,0.0,4,2024-01-01,67,1,214,2,2,1
