# Data cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load csv
jfk = pd.read_csv('/content/JFK_flight_data.csv')

# Drop null values
jfk = jfk.dropna()

# Relabel the month numbers so their dummy columns get readable names.
# The relabelling goes through `map`, which builds a new column, instead of
# assigning strings into the integer column with `.loc` (an incompatible-dtype
# assignment that newer pandas versions warn about). Month values that are not
# in the mapping are kept unchanged.
month_labels = {1: 'Jan', 11: 'Nov', 12: 'Dec'}
jfk['MONTH'] = jfk['MONTH'].map(lambda month: month_labels.get(month, month))

# One-hot encoding for categorical variables: each listed column is replaced
# by its indicator columns, which are appended to the right of the frame.
for column in ['OP_UNIQUE_CARRIER', 'DEST', 'Wind', 'Condition', 'MONTH']:
    one_hot = pd.get_dummies(jfk[column])
    jfk = jfk.drop(column, axis=1)
    jfk = jfk.join(one_hot)

# Drop unneeded columns
jfk = jfk.drop(columns=['TAIL_NUM', 'DEP_TIME_M'])

# Change type of Dew Point column to int
jfk['Dew Point'] = jfk['Dew Point'].astype(int)

# Train-test-validation split

In [3]:
from sklearn.model_selection import train_test_split

# Separate the DEP_DELAY target column from the remaining feature columns
target = 'DEP_DELAY'
y = jfk[target]
X = jfk.drop(columns=target)

# Train-test split: hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=604
)

# Train-validation split: hold out 20% of the training rows for validation
X_train_train, X_val, y_train_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=604
)

# AdaBoost regressor

In [7]:
# AdaBoost regressor with decision tree estimator

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

# Get decision tree regressor validation error for comparison
# These hyperparameters were chosen in tree regressor notebook
# (random_state is added so that tie-breaking between equally good splits,
# and therefore the reported error, is repeatable across runs)

tree_params = dict(
    max_depth=7,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=30,
    min_weight_fraction_leaf=0,
    splitter='best',
    random_state=604,
)

reg = DecisionTreeRegressor(**tree_params)
model = reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)
print('rmse for the Decision Tree Regressor is', np.sqrt(mse(y_val, pred)))

# Hyperparameter tuning for AdaBoost regressor with GridSearchCV

param_grid = {
    'n_estimators': [2, 5, 10, 50],
    'learning_rate': [0.1, 0.5, 1, 1.5]
}

# Fresh, unfitted tree with the same settings as the baseline above; it is
# the base estimator for the search and is reused by the next cell.
reg = DecisionTreeRegressor(**tree_params)

# NOTE: no `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors) rather than by RMSE.
# `verbose` must be an integer; 1 prints the same one-line fit summary.
# AdaBoost resamples the training data at every boosting round, so it is
# seeded (same seed as the data splits) to make the search repeatable.
grid_cv = GridSearchCV(AdaBoostRegressor(reg, random_state=604),
                       param_grid=param_grid,
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)


rmse for the Decision Tree Regressor is 38.21118872284227
Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'learning_rate': 0.1, 'n_estimators': 5}


In [8]:
# Get validation error for model with best parameters

from sklearn.metrics import mean_squared_error as mse

# Refit on the inner training split with the parameters reported by the grid
# search, then score on the held-out validation split. `reg` is the tuned
# decision tree defined in the previous cell.
# random_state is fixed (same seed as the data splits) because AdaBoost
# resamples the training data at every boosting round; without a seed the
# reported RMSE changes from run to run.
ada_reg = AdaBoostRegressor(estimator=reg,
                            learning_rate=0.1,
                            n_estimators=5,
                            random_state=604)
model = ada_reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

print('rmse is', np.sqrt(mse(y_val, pred)))

rmse is 37.73167661368316


In [13]:
# Tune hyperparameters for estimator and AdaBoostRegressor
# (`estimator__max_depth` is routed to the wrapped decision tree)

param_grid = {
    'estimator__max_depth': [1, 2, 5, 10],
    'n_estimators': [2, 5, 10, 50],
    'learning_rate': [0.01, 0.1, 0.5, 1]
}

# NOTE: no `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors) rather than by RMSE.
# `verbose` must be an integer; 1 prints the same one-line fit summary.
# The booster is seeded (same seed as the data splits) so that repeated runs
# of the search select the same parameters.
grid_cv = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(),
                                         random_state=604),
                       param_grid=param_grid,
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'estimator__max_depth': 10, 'learning_rate': 0.01, 'n_estimators': 5}


In [14]:
# Check validation set error for the parameters reported by the previous
# grid search (max_depth=10, learning_rate=0.01, n_estimators=5)

reg = DecisionTreeRegressor(max_depth=10)

# random_state is fixed (same seed as the data splits) because AdaBoost
# resamples the training data at every boosting round; without a seed the
# reported RMSE changes from run to run.
ada_reg = AdaBoostRegressor(estimator=reg,
                            learning_rate=0.01,
                            n_estimators=5,
                            random_state=604)
model = ada_reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

print('rmse is', np.sqrt(mse(y_val, pred)))

rmse is 31.790710601191382


In [15]:
# Fine-tune hyperparameters on a narrower grid around the previous best
# learning rate and number of estimators, with deeper trees

param_grid = {
    'estimator__max_depth': [10, 15, 20, 25],
    'n_estimators': [3, 4, 5, 6, 7],
    'learning_rate': [0.008, 0.009, 0.01, 0.011, 0.012]
}

# NOTE: no `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors) rather than by RMSE.
# `verbose` must be an integer; 1 prints the same one-line fit summary.
# The booster is seeded (same seed as the data splits) so that repeated runs
# of the search select the same parameters.
grid_cv = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(),
                                         random_state=604),
                       param_grid=param_grid,
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'estimator__max_depth': 20, 'learning_rate': 0.012, 'n_estimators': 7}


In [16]:
# Get validation set error for the parameters reported by the previous grid
# search (max_depth=20, learning_rate=0.012, n_estimators=7)

reg = DecisionTreeRegressor(max_depth=20)

# random_state is fixed (same seed as the data splits) because AdaBoost
# resamples the training data at every boosting round; without a seed the
# reported RMSE changes from run to run.
ada_reg = AdaBoostRegressor(estimator=reg,
                            learning_rate=0.012,
                            n_estimators=7,
                            random_state=604)
model = ada_reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

print('rmse is', np.sqrt(mse(y_val, pred)))

rmse is 28.99575832240561


In [17]:
# More fine-tuning, since the learning rate and number of estimators were at
#  the top of the range

param_grid = {
    'estimator__max_depth': [15, 20, 25],
    'n_estimators': [6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.012, 0.014, 0.016]
}

# NOTE: no `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors) rather than by RMSE.
# `verbose` must be an integer; 1 prints the same one-line fit summary.
# The booster is seeded (same seed as the data splits) so that repeated runs
# of the search select the same parameters.
grid_cv = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(),
                                         random_state=604),
                       param_grid=param_grid,
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'estimator__max_depth': 15, 'learning_rate': 0.014, 'n_estimators': 6}


In [19]:
# Get validation set error for the parameters reported by the previous grid
# search (max_depth=15, learning_rate=0.014, n_estimators=6)

reg = DecisionTreeRegressor(max_depth=15)

# random_state is fixed (same seed as the data splits) because AdaBoost
# resamples the training data at every boosting round; without a seed the
# reported RMSE changes from run to run.
ada_reg = AdaBoostRegressor(estimator=reg,
                            learning_rate=0.014,
                            n_estimators=6,
                            random_state=604)
model = ada_reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

print('rmse is', np.sqrt(mse(y_val, pred)))

rmse is 29.978466308625


In [21]:
# Get test set error for best AdaBoostRegressor model
# (max_depth=20, learning_rate=0.012, n_estimators=7). The model is refit on
# the full training set and scored once on the held-out test set.

reg = DecisionTreeRegressor(max_depth=20)

# random_state is fixed (same seed as the data splits) because AdaBoost
# resamples the training data at every boosting round; without a seed the
# reported RMSE changes from run to run.
ada_reg = AdaBoostRegressor(estimator=reg,
                            learning_rate=0.012,
                            n_estimators=7,
                            random_state=604)
model = ada_reg.fit(X_train, y_train)
pred = model.predict(X_test)

print('rmse is', np.sqrt(mse(y_test, pred)))

rmse is 30.775344826785577


# XGBoost regressor

In [8]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

# Baseline XGBoost model: only the objective is set, every other
# hyperparameter is left at the library default. It is fitted on the inner
# training split and its validation predictions are kept in `pred`.

reg = XGBRegressor(objective='reg:squarederror')
model = reg.fit(
    X_train_train,
    y_train_train,
)
pred = model.predict(X_val)

In [5]:
# Validation RMSE of the default XGBoost model fitted in the previous cell

xgb_val_rmse = np.sqrt(mse(y_val, pred))
print('rmse for XGBoost is', xgb_val_rmse)

rmse for XGBoost is 23.906570508153127


In [9]:
# Tune hyperparameters for XGBoost regressor
# Candidates are ranked by negated mean squared error over 5 folds.

param_grid = {'max_depth': [3, 6, 10],
              'learning_rate': [0.01, 0.05, 0.1],
              'n_estimators': [100, 200, 500],
              'colsample_bytree': [0.3, 0.7]}

# `verbose` must be an integer; 1 prints the same one-line fit summary.
grid_cv = GridSearchCV(XGBRegressor(),
                       param_grid=param_grid,
                       scoring='neg_mean_squared_error',
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500}


In [10]:
# Validation RMSE for the parameters reported by the first XGBoost grid search

xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 500,
    'colsample_bytree': 0.7,
}
reg = XGBRegressor(**xgb_params)
model = reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

val_rmse = np.sqrt(mse(y_val, pred))
print('rmse is', val_rmse)

rmse is 24.700572966588467


In [12]:
# Test another set of hyperparameters: larger learning rates, fewer trees,
# and a column-sampling rate of 1 (all columns) added to the grid.
# Candidates are ranked by negated mean squared error over 5 folds.

param_grid = {'max_depth': [3, 6, 10],
              'learning_rate': [0.1, 0.3, 0.5],
              'n_estimators': [100, 200],
              'colsample_bytree': [0.3, 0.7, 1]}

# `verbose` must be an integer; 1 prints the same one-line fit summary.
grid_cv = GridSearchCV(XGBRegressor(),
                       param_grid=param_grid,
                       scoring='neg_mean_squared_error',
                       cv=5,
                       verbose=1)

grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
{'colsample_bytree': 1, 'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 200}


In [13]:
# Validation RMSE for the parameters reported by the second XGBoost grid search

xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 1,
    'learning_rate': 0.3,
    'max_depth': 6,
    'n_estimators': 200,
}
reg = XGBRegressor(**xgb_params)
model = reg.fit(X_train_train, y_train_train)
pred = model.predict(X_val)

val_rmse = np.sqrt(mse(y_val, pred))
print('rmse is', val_rmse)

rmse is 23.4962306602724


In [14]:
# Test RMSE: refit on the full training set with the parameters used in the
# previous validation check, then score once on the held-out test set

xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 1,
    'learning_rate': 0.3,
    'max_depth': 6,
    'n_estimators': 200,
}
reg = XGBRegressor(**xgb_params)
model = reg.fit(X_train, y_train)
pred = model.predict(X_test)

test_rmse = np.sqrt(mse(y_test, pred))
print('rmse is', test_rmse)

rmse is 25.80222270606728
