# Simple regression to predict flight delays

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

Load Flights dataset from Kaggle. Contains 300k US flights during 2013. See the features:
- dep_delay is what we want to predict
- dep_time, arr_delay, airtime can lead to data leakage so we will not regress on these

In [3]:
file_path = "flights.csv"
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, "mahoora00135/flights", file_path)

print("First 5 records:\n", df.head())


  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, "mahoora00135/flights", file_path)


First 5 records:
    id  year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0   0  2013      1    1     517.0             515        2.0     830.0   
1   1  2013      1    1     533.0             529        4.0     850.0   
2   2  2013      1    1     542.0             540        2.0     923.0   
3   3  2013      1    1     544.0             545       -1.0    1004.0   
4   4  2013      1    1     554.0             600       -6.0     812.0   

   sched_arr_time  arr_delay  ... flight  tailnum origin dest air_time  \
0             819       11.0  ...   1545   N14228    EWR  IAH    227.0   
1             830       20.0  ...   1714   N24211    LGA  IAH    227.0   
2             850       33.0  ...   1141   N619AA    JFK  MIA    160.0   
3            1022      -18.0  ...    725   N804JB    JFK  BQN    183.0   
4             837      -25.0  ...    461   N668DN    LGA  ATL    116.0   

   distance  hour  minute            time_hour                    name  
0      1400     5  

## Process X and y for regression model

- Our target prediction is dep_delay
- We don't want to regress on dep_time, arr_delay, airtime, so do not include these features in our numerical columns

In [8]:
# choose target
target = "dep_delay"

# drop rows with missing target
df2 = df[df[target].notna()].copy()

# split features/labels
X = df2.drop(columns=[target])
y = df2[target]

#  categorical + numeric columns
categorical_cols = ['carrier', 'tailnum', 'origin', 'dest', 'time_hour', 'name']
numeric_cols = ['id', 'year', 'month', 'day', 'sched_dep_time',
                'sched_arr_time', 'flight', 'distance', 'hour', 'minute']

## Build pipeline for Ridge regression

In [None]:
# build preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# full model pipeline
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("ridge", Ridge(alpha=1.0))
])

# train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 22.009984664768574
RMSE: 39.67887265195823


## Build model for standard OLS regression

In [5]:
# build preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# full model pipeline
ols_model = Pipeline([("preprocess", preprocess),
        ("model", LinearRegression())])

ols_model.fit(X_train, y_train)

pred = ols_model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 22.00489256421492
RMSE: 39.674338688852686
