In [None]:

import lightgbm as lg
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

import wandb
from wandb.lightgbm import wandb_callback, log_summary

## 1. Load data

In [None]:
# Columns LightGBM should treat as categorical features.
categorical_cols = ['carrier', 'origin', 'destination']

# Read the training split (first CSV column becomes the index) and drop
# 'delay_minutes', which is not used as a feature or as the label here.
train_df = (
    pd.read_csv('/Users/favea/Downloads/swiss-data/train_df.csv', index_col=0)
    .drop(columns='delay_minutes')
)

# Separate the 'delayed' label from the feature columns and cast the
# categorical features to pandas' category dtype.
y_train = train_df['delayed']
x_train = (
    train_df
    .drop(columns='delayed')
    .astype({name: 'category' for name in categorical_cols})
)

# free_raw_data=False keeps the raw frame attached to the Dataset after construction.
lgb_train = lgb.Dataset(
    data=x_train,
    label=y_train,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

In [None]:
# Columns LightGBM should treat as categorical features.
categorical_cols = ['carrier', 'origin', 'destination']

# Read the dev (validation) split the same way as the training split:
# first CSV column becomes the index, 'delay_minutes' is dropped.
dev_df = (
    pd.read_csv('/Users/favea/Downloads/swiss-data/dev_df.csv', index_col=0)
    .drop(columns='delay_minutes')
)

# Separate the 'delayed' label from the feature columns and cast the
# categorical features to pandas' category dtype.
y_dev = dev_df['delayed']
x_dev = (
    dev_df
    .drop(columns='delayed')
    .astype({name: 'category' for name in categorical_cols})
)

# reference=lgb_train ties this validation Dataset to the training Dataset.
lgb_dev = lgb.Dataset(
    data=x_dev,
    label=y_dev,
    reference=lgb_train,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

## 2. Train baseline

In [None]:
# Baseline LightGBM configuration for the binary "delayed" classifier.
# The same dict is passed to wandb.init(config=...) and to lgb.train().
params = dict(
    # Booster and objective
    boosting_type='gbdt',
    objective='binary',
    # Metrics evaluated on the validation set at every boosting round
    metric=['binary_logloss', 'auc', 'average_precision'],
    # Tree shape and step size
    num_leaves=50,
    learning_rate=0.05,
    # Column / row subsampling
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Logging and class weighting
    verbosity=1,
    scale_pos_weight=1,
)

# Start the W&B run for the baseline model, recording `params` as its config.
# The trailing semicolon suppresses the cell's output in the notebook.
wandb.init(
    project='swiss-delay-prediction',
    job_type='training-lightgbm-baseline',
    config=params,
);

In [None]:
# Train the baseline booster on the training Dataset, evaluating on the dev
# Dataset (reported under the name 'validation') after every round.
#
# Early stopping is requested through the lgb.early_stopping callback rather
# than the `early_stopping_rounds=` keyword: the keyword was removed from
# lgb.train() in LightGBM 4.0, while the callback works on both 3.x and 4.x.
# valid_sets / valid_names are passed as lists, which is the documented form
# (the original `('validation')` was a plain string, not a tuple).
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_dev],
    valid_names=['validation'],
    callbacks=[
        wandb_callback(),                        # stream per-round metrics to W&B
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
    ],
)

In [None]:
# Log a post-training summary of the fitted booster to the active W&B run.
# NOTE(review): save_model_checkpoint=True presumably also uploads the model
# file to the run — confirm against the wandb.lightgbm documentation.
log_summary(gbm, save_model_checkpoint=True)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score, precision_recall_fscore_support, roc_curve

In [None]:
# Score the dev split with the booster, limited to its best iteration.
y_pred = gbm.predict(x_dev, num_iteration=gbm.best_iteration)

# Compute the dev ROC AUC from the predicted scores and log it to W&B.
roc_ac = roc_auc_score(y_true=y_dev, y_score=y_pred)
wandb.log(dict(roc_auc=roc_ac))

In [None]:
# Log a dev-set confusion matrix to W&B; scores are turned into hard
# 0/1 labels with a 0.5 threshold.
wandb.sklearn.plot_confusion_matrix(
    y_dev,
    (y_pred >= 0.5).astype(int),
)

In [None]:
# Score the training split with the booster, limited to its best iteration.
y_train_pred = gbm.predict(x_train, num_iteration=gbm.best_iteration)

# classification_report returns a plain-text table, so print() is what renders it;
# scores are thresholded at 0.5 to obtain hard labels.
train_report = classification_report(y_train, (y_train_pred >= 0.5).astype(int))
print(train_report)

In [None]:
# Close the baseline W&B run opened by wandb.init() above, so later runs
# (e.g. the sweep trials, which call wandb.init() themselves) start fresh.
wandb.finish()

## 3. Hyperparameter sweep

In [None]:
# W&B sweep definition: Bayesian search that maximizes the
# 'avg_precision_dev' value reported by each run.
sweep_config = {
    "method": "bayes",
    "metric": {
        "name": "avg_precision_dev",
        "goal": "maximize",
    },
    "parameters": {
        # Ranges (sampled between min and max)
        "learning_rate": {"min": 0.001, "max": 1.0},
        "gamma": {"min": 0.001, "max": 1.0},
        "min_child_weight": {"min": 1, "max": 150},
        # Discrete choices
        "early_stopping_rounds": {"values": [10, 20, 30, 40]},
        # NOTE(review): LightGBM's 'rf' mode requires sub-sampling to be enabled,
        # and 'dart' does not support early stopping — confirm that every
        # combination below is valid for the installed LightGBM version.
        "boosting_type": {"values": ["gbdt", "rf", "dart"]},
        "num_leaves": {"values": [20, 40, 50]},
        "feature_fraction": {"values": [0.4, 0.9, 1]},
        "bagging_fraction": {"values": [0.5, 0.8, 1]},
        "bagging_freq": {"values": [1, 5, 10]},
        "scale_pos_weight": {"values": [1, 1.5, 2, 0.8]},
        "num_iterations": {"values": [100, 150, 200]},
        "lambda_l1": {"values": [0.0, 0.3, 0.5]},
        "lambda_l2": {"values": [0.0, 0.3, 0.5]},
    },
}


In [None]:
from sklearn import metrics
from sklearn.metrics import average_precision_score
import numpy as np

In [None]:
def binary_log_loss(y_true, y_prob, eps=1e-15):
    """Mean binary cross-entropy of predicted probabilities.

    Args:
        y_true: 0/1 labels (any array-like, e.g. a pandas Series).
        y_prob: predicted probability of the positive class, same length.
        eps: probabilities are clipped to [eps, 1 - eps] so that a prediction
            of exactly 0 or 1 cannot produce log(0) and turn the result into
            NaN/inf.

    Returns:
        The mean negative log-likelihood as a float.
    """
    labels = np.asarray(y_true, dtype=float)
    probs = np.clip(np.asarray(y_prob, dtype=float), eps, 1 - eps)
    per_row = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    return float(-per_row.sum() / len(labels))


def train():
    """Run one sweep trial: train a LightGBM model and record its metrics.

    Opens a W&B run, builds the LightGBM parameters from ``run.config`` (the
    values chosen by the sweep), trains on the module-level ``lgb_train``
    Dataset with ``lgb_dev`` as validation set, and writes train/dev metrics
    to ``run.summary``. ``avg_precision_dev`` is among the summary keys.

    Every swept hyperparameter is consumed here, including the number of
    boosting rounds, the early-stopping patience, ``gamma`` and
    ``min_child_weight``; leaving any of them unused would make the sweep
    optimize over parameters that have no effect on the model.

    Relies on these notebook globals: ``lgb_train``, ``lgb_dev``, ``x_train``,
    ``y_train``, ``x_dev``, ``y_dev``.
    """
    with wandb.init() as run:
        config = run.config

        params = {
            'boosting_type': config['boosting_type'],
            'objective': 'binary',
            'metric': ['binary_logloss', 'auc', 'average_precision'],
            'num_leaves': config['num_leaves'],
            'learning_rate': config['learning_rate'],
            'feature_fraction': config['feature_fraction'],
            'bagging_fraction': config['bagging_fraction'],
            'bagging_freq': config['bagging_freq'],
            'verbosity': 1,
            'scale_pos_weight': config['scale_pos_weight'],
            'lambda_l2': config['lambda_l2'],
            'lambda_l1': config['lambda_l1'],
            # 'gamma' is the XGBoost name for the minimum loss reduction
            # required to split; LightGBM calls it min_gain_to_split.
            'min_gain_to_split': config['gamma'],
            'min_child_weight': config['min_child_weight'],
        }

        # Train the booster. Rounds and early-stopping patience come from the
        # sweep. The lgb.early_stopping callback is used instead of the
        # `early_stopping_rounds=` keyword, which LightGBM 4.0 removed.
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=config['num_iterations'],
            valid_sets=[lgb_dev],
            valid_names=['validation'],
            callbacks=[
                wandb_callback(),
                lgb.early_stopping(stopping_rounds=config['early_stopping_rounds']),
            ],
        )

        # Log booster metrics
        run.summary["best_score"] = gbm.best_score
        run.summary["best_iteration"] = gbm.best_iteration

        # Get train and validation predictions at the best iteration
        y_dev_pred = gbm.predict(x_dev, num_iteration=gbm.best_iteration)
        y_train_pred = gbm.predict(x_train, num_iteration=gbm.best_iteration)

        # Log additional train metrics
        false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_train, y_train_pred)
        run.summary['train_avg_precision'] = average_precision_score(y_train, y_train_pred)
        # KS statistic: largest gap between the TPR and FPR curves.
        run.summary['train_ks_stat'] = max(true_positive_rate - false_positive_rate)
        run.summary['train_auc'] = metrics.auc(false_positive_rate, true_positive_rate)
        run.summary['train_log_loss'] = binary_log_loss(y_train, y_train_pred)

        # Log additional validation metrics
        run.summary['avg_precision_dev'] = average_precision_score(y_dev, y_dev_pred)
        run.summary["val_auc"] = metrics.roc_auc_score(y_dev, y_dev_pred)
        run.summary["val_acc_0.5"] = metrics.accuracy_score(y_dev, np.where(y_dev_pred >= 0.5, 1, 0))
        run.summary["val_log_loss"] = binary_log_loss(y_dev, y_dev_pred)

In [None]:
# Register the sweep defined by `sweep_config` under the W&B project and keep
# the returned sweep id for the agent that will execute the runs.
sweep_id = wandb.sweep(sweep_config, project="swiss-delay-prediction")

In [None]:
# Number of sweep runs this agent executes before it exits.
# The agent call used to hard-code count=100 and ignore this variable; the
# variable is now the single source of truth and keeps the value that was
# actually in effect, so the sweep size is unchanged. Lower it for a quick test.
count = 100
wandb.agent(sweep_id, function=train, count=count)