In [20]:
import os
import sys
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [21]:
import sklearn.exceptions
import warnings
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [22]:
from utils.files.file_helper import load_binary_file
import utils.configuration
import pandas as pd
import utils.model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [23]:
config = utils.configuration.Configuration()

In [24]:
MERGED_DATA_LOCATION = config.config['DEFAULT']['MERGED_DATA_LOCATION']
PREPROCESSED_DATA_FILE_BIN = config.config['DEFAULT']['PREPROCESSED_DATA_FILE_BIN']
RND_STATE = config.config['DEFAULT']['RND_STATE']

In [25]:
data = load_binary_file(MERGED_DATA_LOCATION, PREPROCESSED_DATA_FILE_BIN)

In [28]:
selection_data = data[data['origin'].isin(['ALB', 'MSP'])]

In [29]:
def process_data(data):
    data_df = data.copy()
    data_df = data_df.drop(['fl_date', 'quarter', 'fl_num', 'tail_num', 'origin_city_name', 'dest_city_name'], axis=1)
    data_df = pd.get_dummies(data_df, columns=['dest', 'origin', 'op_unique_carrier'])
    data_df['status'] = list(map(str, data_df['status']))
    data_df['status'] = LabelEncoder().fit_transform(data_df['status'])
    return data_df

In [30]:
selection_data = process_data(selection_data)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(selection_data.loc[:, selection_data.columns != 'status'], selection_data['status'], test_size = 0.2, random_state = RND_STATE)

## Fitting models

In [32]:
classifiers = []

### Random Forest

In [33]:
param = {'criterion':['gini', 'entropy'], 
         'max_features':[1, 2, 3, 4, 5, 6, 7, 'log2', 'auto'],
         'max_depth':[2, 4, 8, 16, 32, 64], 
         'class_weight':['balanced', None], 
         'n_estimators': [30, 40, 50, 60], 
         'bootstrap': [True, False]}

mt = utils.model_selection.ModelTester(parameters = param, 
                                       model = RandomForestClassifier(random_state=RND_STATE),
                                       rnd_state = RND_STATE)
mt.test_model(X_train, y_train)
rf_clf = mt.best_estimator()
classifiers.append({'name': 'Random Forest Classifier', 'clf': rf_clf})

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.0min finished
INFO:root:Best score cv: 0.7544719194274506


### AdaBoostClassifier

In [34]:
param = {'algorithm': ['SAMME.R', 'SAMME'], 
         'learning_rate': [0.1, 0.3, 0.6, 0.8, 1.0]}
mt = utils.model_selection.ModelTester(parameters = param, 
                                       model = AdaBoostClassifier(random_state=RND_STATE),
                                       rnd_state = RND_STATE)
mt.test_model(X_train, y_train)
adc_clf = mt.best_estimator()
classifiers.append({'name': 'AdaBoost Classifier', 'clf': adc_clf})

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.0min finished
INFO:root:Best score cv: 0.7311445036448808


### Decision Tree Classifier

In [35]:
param = {'criterion': ['gini', 'entropy'], 
         'splitter': ['best', 'random'], 
         'max_features':[1, 2, 3, 4, 5, 'log2', 'auto'], 
         'class_weight' : ['balanced'], 
         'random_state':[RND_STATE], 
         'presort':[True, False]}

mt = utils.model_selection.ModelTester(parameters = param, 
                                       model = DecisionTreeClassifier(random_state=RND_STATE),
                                       rnd_state = RND_STATE)
mt.test_model(X_train, y_train)
dtc_clf = mt.best_estimator()
classifiers.append({'name': 'Decision Tree Classifier', 'clf': dtc_clf})

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 31.6min finished
INFO:root:Best score cv: 0.7283145013438


### Gradient Boosting Classifier

In [36]:
param = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }


mt = utils.model_selection.ModelTester(parameters = param, 
                                       model = XGBClassifier(),
                                       rnd_state = RND_STATE)


mt.test_model(X_train, y_train)
xgb_clf = mt.best_estimator()
classifiers.append({'name': 'XGBClassifier', 'clf': xgb_clf})

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 186.8min finished
INFO:root:Best score cv: 0.7448909259115015
