In [1]:
# !pip install catboost

In [2]:
# !pip install xgboost

In [3]:
# !pip install lightgbm

In [4]:
# !pip install neptune-client

In [1]:
#Importing machine learning algorithms
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

#Importing other packages 
import timeit
import pandas as pd
import numpy as np
import neptune.new as neptune

#Importing packages for machine learning operations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
# Retrieving Neptune credentials
## The first line of the file is used as the project name, the second as the API token
content = "credentials.txt"
with open(content, "r") as file:
    content_list = file.readlines()
# Drop the trailing newline and any surrounding commas from each of the two entries
project_name, neptune_api_token = [
    content_list[position].strip("\n").strip(",") for position in (0, 1)
]

In [3]:
# Open a Neptune run; the `run` handle is what the later cells write their results to
run = neptune.init(project=project_name, api_token=neptune_api_token)

https://app.neptune.ai/abozejohn/catboost-tutorial/e/CAT-10


Info (NVML): Driver Not Loaded. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [4]:
# Colab-specific step: mount Google Drive at /content/drive.
# The dataset path used when reading the CSV lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#Reading dataset
# The file is read from under /content/drive, i.e. the Google Drive mount point,
# so the drive.mount(...) cell has to run before this one.
data_df = pd.read_csv("/content/drive/MyDrive/Datasets/flights_data.csv")

In [6]:
# Per-column overview of a DataFrame
def description(df):
    """Print the shape of ``df`` and return a one-row-per-column summary table.

    The returned frame has the columns ``index``, ``dtypes``, ``Name``,
    ``Missing``, ``PercMissing``, ``Uniques`` and ``Data type``:

    * ``index`` / ``Name``      -- the column label (same value in both)
    * ``dtypes`` / ``Data type`` -- the column dtype (same value in both)
    * ``Missing``               -- number of null entries
    * ``PercMissing``           -- null entries divided by the row count
    * ``Uniques``               -- ``df.nunique()`` for the column
    """
    print(f"Dataset Shape: {df.shape}")

    null_mask = df.isnull()
    null_counts = null_mask.sum().values

    overview = pd.DataFrame(df.dtypes, columns=['dtypes']).reset_index()
    return overview.assign(**{
        'Name': overview['index'],
        'Missing': null_counts,
        # count() on the boolean mask is the number of rows per column
        'PercMissing': null_counts / null_mask.count().values,
        'Uniques': df.nunique().values,
        'Data type': df.dtypes.values,
    })
print('**Variable Description of  data:**')
description(data_df)

**Variable Description of  data:**
Dataset Shape: (1454770, 32)


Unnamed: 0,index,dtypes,Name,Missing,PercMissing,Uniques,Data type
0,Unnamed: 0,int64,Unnamed: 0,0,0.0,1454770,int64
1,YEAR,int64,YEAR,0,0.0,1,int64
2,MONTH,int64,MONTH,0,0.0,12,int64
3,DAY,int64,DAY,0,0.0,31,int64
4,DAY_OF_WEEK,int64,DAY_OF_WEEK,0,0.0,7,int64
5,AIRLINE,object,AIRLINE,0,0.0,14,object
6,FLIGHT_NUMBER,int64,FLIGHT_NUMBER,0,0.0,6824,int64
7,TAIL_NUMBER,object,TAIL_NUMBER,3663,0.002518,4862,object
8,ORIGIN_AIRPORT,object,ORIGIN_AIRPORT,0,0.0,627,object
9,DESTINATION_AIRPORT,object,DESTINATION_AIRPORT,0,0.0,626,object


In [7]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not just
# the ones raised in this cell.
warnings.filterwarnings('ignore')

#Selecting features (i.e removing highly correlated features, redundant features and features with high missing values percentage)
# .copy() makes the selection an independent frame, so the column assignments
# below are plain assignments rather than writes to a slice of the loaded data.
data_df = data_df[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY",
                 "DIVERTED", "CANCELLED", "ARRIVAL_TIME"]].copy()


#Filling missing values with mean
# fillna with a Series fills each listed column with its own mean and leaves
# every other column untouched.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, and the target column is imputed as well.
data_df = data_df.fillna(
    data_df[["DEPARTURE_TIME", "AIR_TIME", "ARRIVAL_DELAY", "ARRIVAL_TIME"]].mean()
)

#Replace the string-valued columns with integer category codes (shifted to start at 1)
cat_cols = ["AIRLINE","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
for item in cat_cols:
    data_df[item] = data_df[item].astype("category").cat.codes +1

# Encoding the target: 1 when the arrival delay is above 15, otherwise 0.
# Vectorised comparison instead of a Python-level apply over every row.
# NOTE: this cell is not idempotent -- running it a second time thresholds the
# already binary target again and turns every label into 0.
data_df['ARRIVAL_DELAY'] = (data_df['ARRIVAL_DELAY'] > 15).astype('int64')

In [8]:
description(data_df)

Dataset Shape: (1454770, 13)


Unnamed: 0,index,dtypes,Name,Missing,PercMissing,Uniques,Data type
0,MONTH,int64,MONTH,0,0.0,12,int64
1,DAY,int64,DAY,0,0.0,31,int64
2,DAY_OF_WEEK,int64,DAY_OF_WEEK,0,0.0,7,int64
3,AIRLINE,int8,AIRLINE,0,0.0,14,int8
4,DESTINATION_AIRPORT,int16,DESTINATION_AIRPORT,0,0.0,626,int16
5,ORIGIN_AIRPORT,int16,ORIGIN_AIRPORT,0,0.0,627,int16
6,AIR_TIME,float64,AIR_TIME,0,0.0,656,float64
7,DEPARTURE_TIME,float64,DEPARTURE_TIME,0,0.0,1438,float64
8,DISTANCE,int64,DISTANCE,0,0.0,1345,int64
9,ARRIVAL_DELAY,int64,ARRIVAL_DELAY,0,0.0,2,int64


In [9]:
# Separate the binary target from the feature columns
y = data_df['ARRIVAL_DELAY']
X = data_df.drop(columns=['ARRIVAL_DELAY'])

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=2021
)

In [10]:
# Metric evaluation
def metrics(y_pred_test):
    """Score test-set predictions with ROC AUC and store the value on the Neptune run.

    ``y_pred_test`` is compared against the module-level ``y_test``; the result
    is written to ``run['ROC AUC score']``. Nothing is returned.
    """
    run['ROC AUC score'] = roc_auc_score(y_test, y_pred_test)

# Default settings

In [18]:
# # Function to training model, log training and prediction time and tracking performance 
# def run_model(model, description, cat_features=None):
#   #Description
#   run["Description"] = description

#   #Training session
#   start = timeit.default_timer()
#   model.fit(X_train,y_train, categorical_feature=cat_features)
#   stop = timeit.default_timer()
#   run['Training time'] = stop - start

#   #Prediction session
#   start = timeit.default_timer()
#   y_pred_test = model.predict(X_test)
#   stop = timeit.default_timer()
#   run['Prediction time'] = stop - start

#   #Performance evaluation
#   metrics(y_pred_test)

In [11]:
# Function to train a model, log training and prediction time, and track performance
def run_model(model, description, key, cat_features=None):
    """Fit ``model``, time training and prediction, and log the results to ``run``.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits and writes ``Description``, ``Training time`` and ``Prediction time``
    to the module-level Neptune ``run``; scoring is delegated to ``metrics``.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict`` (and, ideally, ``predict_proba``).
    description : str
        Free-text label stored under ``run["Description"]``.
    key : str
        Selects the extra ``fit`` arguments:
        ``'LGB'`` forwards ``categorical_feature=cat_features``;
        ``'CAT'`` forwards ``cat_features``, ``eval_set=(X_test, y_test)`` and
        ``use_best_model=True``; any other value calls ``fit`` with the
        training data only.
    cat_features : list, optional
        Categorical columns forwarded for ``'LGB'`` and ``'CAT'``. ``None``
        (the default) is treated as an empty list.

    Returns
    -------
    None
    """
    # A None sentinel replaces the former shared mutable ``[]`` default.
    cat_features = [] if cat_features is None else cat_features

    # The three model families differ only in the extra arguments given to fit().
    if key == 'LGB':
        fit_kwargs = {'categorical_feature': cat_features}
    elif key == 'CAT':
        # NOTE(review): the test split is also used as the evaluation set that
        # use_best_model selects the final iteration on, so the score reported
        # for CatBoost is not measured on data unseen during model selection.
        fit_kwargs = {
            'eval_set': (X_test, y_test),
            'cat_features': cat_features,
            'use_best_model': True,
        }
    else:
        fit_kwargs = {}

    #Description
    run["Description"] = description

    #Training session
    start = timeit.default_timer()
    model.fit(X_train, y_train, **fit_kwargs)
    stop = timeit.default_timer()
    run['Training time'] = stop - start

    #Prediction session
    # ROC AUC is a ranking metric and needs continuous scores; hard 0/1 labels
    # collapse the curve to a single threshold. Use the positive-class
    # probability when the model provides it, and fall back to predict() otherwise.
    start = timeit.default_timer()
    if hasattr(model, 'predict_proba'):
        y_pred_test = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_test = model.predict(X_test)
    stop = timeit.default_timer()
    run['Prediction time'] = stop - start

    #Performance evaluation
    metrics(y_pred_test)

## LightGBM (Default)

In [19]:
# Baseline LightGBM: constructor defaults, no columns declared as categorical
model_lgb_def = lgb.LGBMClassifier()
run_model(
    model=model_lgb_def,
    description='Default LightGBM without categorical support',
    key='LGB',
)

In [19]:
# Baseline LightGBM: constructor defaults, with the columns in cat_cols declared as categorical
model_lgb_cat_def = lgb.LGBMClassifier()
run_model(
    model=model_lgb_cat_def,
    description='Default LightGBM with categorical support',
    key='LGB',
    cat_features=cat_cols,
)

## XGBoost

In [19]:
# Baseline XGBoost: constructor defaults ('XGB' takes run_model's plain fit() path)
model_xgb_def = xgb.XGBClassifier()
run_model(
    model=model_xgb_def,
    description='Default XGBoost',
    key='XGB',
)

## Catboost

In [16]:
# Baseline CatBoost: constructor defaults, no columns declared as categorical
model_cat_def = cb.CatBoostClassifier()
run_model(
    model=model_cat_def,
    description='Default Catboost without categorical support',
    key='CAT',
)

Learning rate set to 0.175331
0:	learn: 0.5977016	test: 0.5975244	best: 0.5975244 (0)	total: 373ms	remaining: 6m 12s
1:	learn: 0.5403341	test: 0.5399247	best: 0.5399247 (1)	total: 614ms	remaining: 5m 6s
2:	learn: 0.5029722	test: 0.5024751	best: 0.5024751 (2)	total: 840ms	remaining: 4m 39s
3:	learn: 0.4787222	test: 0.4780350	best: 0.4780350 (3)	total: 1.05s	remaining: 4m 22s
4:	learn: 0.4633598	test: 0.4625808	best: 0.4625808 (4)	total: 1.28s	remaining: 4m 15s
5:	learn: 0.4531123	test: 0.4521996	best: 0.4521996 (5)	total: 1.5s	remaining: 4m 9s
6:	learn: 0.4454678	test: 0.4445158	best: 0.4445158 (6)	total: 1.74s	remaining: 4m 6s
7:	learn: 0.4402771	test: 0.4392634	best: 0.4392634 (7)	total: 1.96s	remaining: 4m 2s
8:	learn: 0.4365149	test: 0.4354659	best: 0.4354659 (8)	total: 2.19s	remaining: 4m 1s
9:	learn: 0.4336917	test: 0.4325720	best: 0.4325720 (9)	total: 2.41s	remaining: 3m 58s
10:	learn: 0.4311785	test: 0.4300417	best: 0.4300417 (10)	total: 2.63s	remaining: 3m 56s
11:	learn: 0.4294

In [18]:
# Baseline CatBoost: constructor defaults, with the columns in cat_cols declared as categorical
model_cat_cat_def = cb.CatBoostClassifier()
# Look the column positions up by name instead of hard-coding [3, 4, 5], so the
# indices stay correct if the feature selection or column order changes.
cat_features_index = [X_train.columns.get_loc(name) for name in cat_cols]
run_model(model_cat_cat_def,'Default Catboost with categorical support','CAT', cat_features_index)

Learning rate set to 0.175331
0:	learn: 0.6014739	test: 0.6012838	best: 0.6012838 (0)	total: 1.55s	remaining: 25m 53s
1:	learn: 0.5396072	test: 0.5391940	best: 0.5391940 (1)	total: 2.92s	remaining: 24m 14s
2:	learn: 0.5023324	test: 0.5017791	best: 0.5017791 (2)	total: 4.12s	remaining: 22m 48s
3:	learn: 0.4782188	test: 0.4774964	best: 0.4774964 (3)	total: 5.24s	remaining: 21m 44s
4:	learn: 0.4622310	test: 0.4613406	best: 0.4613406 (4)	total: 6.22s	remaining: 20m 37s
5:	learn: 0.4516457	test: 0.4506878	best: 0.4506878 (5)	total: 7.25s	remaining: 20m 1s
6:	learn: 0.4441746	test: 0.4431659	best: 0.4431659 (6)	total: 8.39s	remaining: 19m 50s
7:	learn: 0.4391723	test: 0.4381594	best: 0.4381594 (7)	total: 9.42s	remaining: 19m 28s
8:	learn: 0.4352393	test: 0.4341624	best: 0.4341624 (8)	total: 10.7s	remaining: 19m 34s
9:	learn: 0.4327330	test: 0.4316256	best: 0.4316256 (9)	total: 11.9s	remaining: 19m 41s
10:	learn: 0.4309553	test: 0.4297970	best: 0.4297970 (10)	total: 13.1s	remaining: 19m 36s
1

# Tuned settings

In [None]:
# Tuned parameters for LightGBM
params = dict(max_depth=7, learning_rate=0.08, num_leaves=100, n_estimators=1000)

# Without categorical features
model_lgb_tun = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metric='auc', **params
)
run_model(
    model=model_lgb_tun,
    description='Tuned LightGBM without categorical support',
    key='LGB',
)

# With categorical features
model_lgb_cat_tun = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metric='auc', **params
)
run_model(
    model=model_lgb_cat_tun,
    description='Tuned LightGBM with categorical support',
    key='LGB',
    cat_features=cat_cols,
)

# Tuned parameters for XGBoost
params = dict(max_depth=5, learning_rate=0.8, min_child_weight=6, n_estimators=1000)

# Tuned XGBoost ('XGB' takes run_model's plain fit() path)
model_xgb_tun = xgb.XGBClassifier(**params)
run_model(
    model=model_xgb_tun,
    description='Tuned XGBoost',
    key='XGB',
)

# Tuned parameters for CatBoost
params = {"depth": 10, "learning_rate": 0.5, "iterations": 1000, "l2_leaf_reg": 5}

# Tuned CatBoost with no categorical feature support
model_cat_tun = cb.CatBoostClassifier(**params)
run_model(model_cat_tun,'Tuned Catboost without categorical support', key='CAT')

# Tuned CatBoost with categorical feature support
model_cat_cat_tun = cb.CatBoostClassifier(**params)
# Look the column positions up by name instead of hard-coding [3, 4, 5], so the
# indices stay correct if the feature selection or column order changes.
cat_features_index = [X_train.columns.get_loc(name) for name in cat_cols]
# The logged description previously read 'Default ...' although this is the
# tuned model, which mislabelled the run.
run_model(model_cat_cat_tun,'Tuned Catboost with categorical support','CAT', cat_features_index)

## LightGBM

In [None]:
## Hyperparameter tuning for LightGBM algorithm
# start = timeit.default_timer()
# lgb_estimator = LGBMClassifier(boosting_type='gbdt', objective='binary', metric='auc')
# param_dist = {"max_depth": [7],
#               "learning_rate" : [0.01,0.05,0.08, 0.1],
#               "num_leaves": [100,120, 150, 200],
#               "n_estimators": [100,500,1000]
#              }

# grid_search = GridSearchCV(lgb_model, n_jobs=-1, param_grid=param_dist, cv = 5, scoring="roc_auc", verbose=5)
# grid_search.fit(X_train,y_train)
# print(grid_search.best_estimator_)

# stop = timeit.default_timer()
# print('Time: ', stop - start) 

In [17]:
# Tuned LightGBM hyperparameters.
# NOTE: the name `params` is reassigned in the XGBoost and CatBoost sections,
# so run this cell right before the tuned LightGBM cells.
params = dict(max_depth=7, learning_rate=0.08, num_leaves=100, n_estimators=1000)

In [23]:
# Tuned LightGBM, no columns declared as categorical
model_lgb_tun = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metric='auc', **params
)
run_model(
    model=model_lgb_tun,
    description='Tuned LightGBM without categorical support',
    key='LGB',
)

In [21]:
# Tuned LightGBM, with the columns in cat_cols declared as categorical
model_lgb_cat_tun = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metric='auc', **params
)
run_model(
    model=model_lgb_cat_tun,
    description='Tuned LightGBM with categorical support',
    key='LGB',
    cat_features=cat_cols,
)

## XGBoost

In [None]:
## Hyperparameter tuning for XGBoost algorithm
# from xgboost import XGBClassifier
# start = timeit.default_timer()
# xgb_model = XGBClassifier()
# param_dist = {"max_depth": [2,5,7,10],
#               "min_child_weight" : [2,6,10,],
#               "n_estimators": [100, 500, 1000],
#               "learning_rate" : [0.01,0.05,0.08, 0.1, 0.5, 0.8]}
# grid_search = GridSearchCV(xgb_model, param_grid=param_dist, cv = 3, 
#                                    verbose=10, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_estimator_)

# stop = timeit.default_timer()
# print('Time: ', stop - start)  

In [12]:
# Tuned XGBoost hyperparameters.
# NOTE: the name `params` is also assigned in the LightGBM and CatBoost sections,
# so run this cell right before the tuned XGBoost cell.
params = dict(max_depth=5, learning_rate=0.8, min_child_weight=6, n_estimators=1000)

In [13]:
# XGBoost with the tuned hyperparameters ('XGB' takes run_model's plain fit() path)
model_xgb_tun = xgb.XGBClassifier(**params)
run_model(
    model=model_xgb_tun,
    description='Tuned XGBoost',
    key='XGB',
)

## Catboost

In [None]:
# params = {'depth': [2, 6, 10],
#           'learning_rate' : [0.01, 0.05, 0.08, 0.1,0.5],
#          'l2_leaf_reg': [1, 3, 5, 7, 9],
#          'iterations': [500, 700, 900, 1000]}

# start = timeit.default_timer()
# cb_model = CatBoostClassifier()

# grid_search = GridSearchCV(cb_model, param_grid=params, cv = 3)
# grid_search.fit(X_train, y_train)

# stop = timeit.default_timer()
# print('Time: ', stop - start) 

#grid_search.best_params_

In [13]:
# Tuned parameters for CatBoost.
# NOTE: the name `params` is also assigned in the LightGBM and XGBoost sections,
# so run this cell right before the tuned CatBoost cells.
params = dict(depth=10, learning_rate=0.5, iterations=1000, l2_leaf_reg=5)

In [18]:
# Tuned CatBoost, no columns declared as categorical
model_cat_tun = cb.CatBoostClassifier(**params)
run_model(
    model=model_cat_tun,
    description='Tuned Catboost without categorical support',
    key='CAT',
)

0:	learn: 0.4843486	test: 0.4838753	best: 0.4838753 (0)	total: 487ms	remaining: 8m 6s
1:	learn: 0.4408734	test: 0.4400214	best: 0.4400214 (1)	total: 915ms	remaining: 7m 36s
2:	learn: 0.4276852	test: 0.4268249	best: 0.4268249 (2)	total: 1.35s	remaining: 7m 28s
3:	learn: 0.4228324	test: 0.4218812	best: 0.4218812 (3)	total: 1.76s	remaining: 7m 18s
4:	learn: 0.4199014	test: 0.4190478	best: 0.4190478 (4)	total: 2.19s	remaining: 7m 16s
5:	learn: 0.4179515	test: 0.4171149	best: 0.4171149 (5)	total: 2.61s	remaining: 7m 12s
6:	learn: 0.4163026	test: 0.4156137	best: 0.4156137 (6)	total: 3.02s	remaining: 7m 8s
7:	learn: 0.4142893	test: 0.4136864	best: 0.4136864 (7)	total: 3.43s	remaining: 7m 4s
8:	learn: 0.4122870	test: 0.4117707	best: 0.4117707 (8)	total: 3.82s	remaining: 7m 1s
9:	learn: 0.4102332	test: 0.4099374	best: 0.4099374 (9)	total: 4.31s	remaining: 7m 6s
10:	learn: 0.4089932	test: 0.4088305	best: 0.4088305 (10)	total: 4.74s	remaining: 7m 5s
11:	learn: 0.4069933	test: 0.4068625	best: 0.40

In [15]:
# Tuned CatBoost, with the columns in cat_cols declared as categorical
model_cat_cat_tun = cb.CatBoostClassifier(**params)
# Look the column positions up by name instead of hard-coding [3, 4, 5], so the
# indices stay correct if the feature selection or column order changes.
cat_features_index = [X_train.columns.get_loc(name) for name in cat_cols]
run_model(model_cat_cat_tun,'Tuned Catboost with categorical support','CAT', cat_features_index)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.4878328	test: 0.4873166	best: 0.4873166 (0)	total: 2.52s	remaining: 41m 59s
1:	learn: 0.4400257	test: 0.4392844	best: 0.4392844 (1)	total: 4.79s	remaining: 39m 48s
2:	learn: 0.4281106	test: 0.4271288	best: 0.4271288 (2)	total: 7.01s	remaining: 38m 51s
3:	learn: 0.4230562	test: 0.4222728	best: 0.4222728 (3)	total: 9.04s	remaining: 37m 30s
4:	learn: 0.4198726	test: 0.4192874	best: 0.4192874 (4)	total: 11s	remaining: 36m 24s
5:	learn: 0.4168943	test: 0.4165087	best: 0.4165087 (5)	total: 12.7s	remaining: 35m 6s
6:	learn: 0.4142468	test: 0.4139010	best: 0.4139010 (6)	total: 14.9s	remaining: 35m 20s
7:	learn: 0.4125308	test: 0.4120745	best: 0.4120745 (7)	total: 16.8s	remaining: 34m 38s
8:	learn: 0.4093844	test: 0.4087136	best: 0.4087136 (8)	total: 18.8s	remaining: 34m 27s
9:	learn: 0.4068153	test: 0.4061603	best: 0.4061603 (9)	total: 21.1s	remaining: 34m 52s
10:	learn: 0.4049970	test: 0.4045891	best: 0.4045891 (10)	total: 22.9s	remaining: 34m 22s
11:	learn: 0.4018077	test: 0.4007