In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the merged dataset (CSV in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='merged_dataset.csv')

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,Origin,Dest,CRSDepTime,DepTime,DepDelayMinutes,...,weatherCode,precipMM,visibility,pressure,cloudcover,DewPointF,WindGustKmph,tempF,WindChillF,humidity
0,2016,1,1,1,2016-01-01,SEA,JFK,745,741.0,0.0,...,113,0.0,10,1032,0,23,8,36,32,61
1,2016,1,1,2,2016-01-02,SEA,JFK,745,737.0,0.0,...,113,0.0,10,1025,0,24,9,36,32,60
2,2016,1,1,3,2016-01-03,SEA,JFK,745,743.0,0.0,...,113,0.0,10,1020,0,20,10,34,30,56
3,2016,1,1,4,2016-01-04,SEA,JFK,745,737.0,0.0,...,296,0.5,8,1012,100,31,9,34,31,89
4,2016,1,1,5,2016-01-05,SEA,JFK,710,708.0,0.0,...,296,0.6,5,1004,100,35,9,36,34,96


In [5]:
# The calendar fields are not used as features, so drop them.
# Reassigning the result (rather than using inplace=True) avoids mutating the
# loaded frame in place and keeps the statement chainable.
df = df.drop(columns=['Year', 'Quarter', 'Month', 'DayofMonth', 'FlightDate'])

In [6]:
# List the columns that remain after dropping the date fields.
df.columns

Index(['Origin', 'Dest', 'CRSDepTime', 'DepTime', 'DepDelayMinutes',
       'DepDel15', 'CRSArrTime', 'ArrTime', 'ArrDelayMinutes', 'ArrDel15',
       'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'WindChillF', 'humidity'],
      dtype='object')

In [9]:
# One-hot encode the non-numeric columns as 0/1 integer indicators
# (presumably Origin and Dest -- verify against df.dtypes).
df = pd.get_dummies(data=df, dtype=int)

In [14]:
# Separate the target column (departure delay in minutes) from the features.
# NOTE(review): X still contains DepDel15, DepTime, ArrTime, ArrDelayMinutes
# and ArrDel15, which look like post-departure information closely tied to the
# target -- confirm these are intended features and not label leakage.
X = df.drop(columns=['DepDelayMinutes'])
y = df['DepDelayMinutes']

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Keep a reference to the fitted scaler: the models below are trained on the
# scaled features, so the same fitted transform is needed to score new data.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set min/max values influence the scaling -- consider fitting on the
# training rows only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [18]:
# Binarise the target: any positive delay becomes 1, every other value is kept
# as-is (so 0 stays 0). Series.mask is vectorised, which avoids calling a
# Python lambda once per row, and it preserves the column's float dtype.
y = y.mask(y > 0, 1)

In [20]:
# Class balance of the binarised target (0 = no delay, 1 = any positive delay).
y.value_counts()

DepDelayMinutes
0.0    55772
1.0    34906
Name: count, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Dimensions of the training split's feature matrix.
X_train.shape

(72542, 49)

# XGBOOST CLASSIFIER

In [28]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Hyper-parameter candidates for the XGBoost classifier (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
)

xgb = XGBClassifier()

# Exhaustive search with 3-fold cross-validation on the training split,
# running on all available cores.
# NOTE(review): candidates are ranked by accuracy, while the held-out score
# reported below is F1 -- confirm the mismatch is intentional.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Predict on the held-out split and report its F1 score.
predictions_xgb = grid_search_xgb.predict(X_test)
test_score_xgb = f1_score(y_test, predictions_xgb)
print("F1 score on test set:", test_score_xgb)


F1 score on test set: 0.8980424760033255


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Hyper-parameter candidates for the random forest (3 x 3 x 3 x 3 = 81 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Seeding the estimator makes the grid-search ranking and the fitted model
# repeatable across runs; 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 3-fold cross-validation on the training split,
# running on all available cores and ranking candidates by accuracy.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')

grid_search_rf.fit(X_train, y_train)

# Predict on the held-out split and report its F1 score.
predictions_rf = grid_search_rf.predict(X_test)

test_score_rf = f1_score(y_test, predictions_rf)

print("F1 score on test set (RandomForest):", test_score_rf)


In [None]:
import pickle

# Persist the complete fitted GridSearchCV object for XGBoost (not just the
# best estimator) to the working directory.
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb, f)


# Same for the random-forest search.
# NOTE(review): only load these files from trusted locations -- unpickling
# can execute arbitrary code.
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(grid_search_rf, f)

