In [5]:
import os
import sys
import logging
# Configure the root logger so records at INFO level and above are handled.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Put the parent of the current working directory on the import path (only once);
# presumably this is where the project's `utils` package lives — confirm.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [6]:
from utils.files.file_helper import load_binary_file
from utils.progress.log_progress import log_progress
import utils.configuration
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import utils.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from imblearn.under_sampling import TomekLinks
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [None]:
# Instantiate the project's configuration object; its `config` attribute is
# indexed by section and key in the next cell.
config = utils.configuration.Configuration()

In [None]:
# Location and file name of the preprocessed dataset, read from the DEFAULT section.
MERGED_DATA_LOCATION = config.config['DEFAULT']['MERGED_DATA_LOCATION']
PREPROCESSED_DATA_FILE_BIN = config.config['DEFAULT']['PREPROCESSED_DATA_FILE_BIN']
# Seed shared by the train/test split and the estimators below. scikit-learn needs
# an int (or RandomState/None) for `random_state`; the value presumably arrives as a
# string from an INI-style config — TODO confirm. int() leaves an int unchanged.
RND_STATE = int(config.config['DEFAULT']['RND_STATE'])

In [None]:
# Load the preprocessed dataset; `data` is indexed like a pandas DataFrame in the cells below.
# NOTE(review): if load_binary_file unpickles the file, only load files from a trusted source — confirm.
data = load_binary_file(MERGED_DATA_LOCATION, PREPROCESSED_DATA_FILE_BIN)

In [None]:
# Keep only the rows whose 'origin' value is ALB or MSP.
selection_data = data.loc[data['origin'].isin(['ALB', 'MSP'])]

In [None]:
def process_data(data):
    """Return a new frame prepared for modelling; the input frame is left untouched.

    Drops the columns 'fl_date', 'quarter', 'fl_num', 'tail_num',
    'origin_city_name' and 'dest_city_name', then one-hot encodes the
    'dest' and 'origin' columns.

    Args:
        data: pandas DataFrame containing all of the columns named above.

    Returns:
        A new DataFrame with the remaining columns followed by the
        'dest_*' and 'origin_*' indicator columns.

    Raises:
        KeyError: if any of the columns to drop is missing (pandas default).
    """
    # drop() already returns a new frame, so a separate copy() is unnecessary.
    data_df = data.drop(['fl_date', 'quarter', 'fl_num', 'tail_num', 'origin_city_name', 'dest_city_name'], axis=1)
    return pd.get_dummies(data_df, columns=['dest', 'origin'])

In [None]:
# NOTE(review): this rebinds `selection_data`, so the cell is not re-runnable on its
# own — a second run would ask process_data to drop columns that are already gone.
# Re-run the filtering cell above before re-running this one.
selection_data = process_data(selection_data)

In [None]:
# Hold out 20% of the rows for testing; every column except 'status' is a feature
# and 'status' is the target.
feature_mask = selection_data.columns != 'status'
X_train, X_test, y_train, y_test = train_test_split(
    selection_data.loc[:, feature_mask],
    selection_data['status'],
    test_size=0.2,
    random_state=RND_STATE,
)

## Fitting models

In [8]:
# Registry filled by the model cells below: each entry is a dict with a display
# 'name' and the selected estimator under 'clf'.
classifiers = []

### Random Forest

In [None]:
# Parameter grid handed to the project's ModelTester for the random forest.
param = {
    'criterion': ['gini', 'entropy'],
    'max_features': [1, 2, 3, 4, 5, 6, 7, 'log2', 'auto'],
    'max_depth': [2, 4, 8, 16, 32, 64],
    'class_weight': ['balanced', None],
    'n_estimators': [30, 40, 50, 60],
    'bootstrap': [True, False],
}

# Run the tester on the training split and register the estimator it reports as best.
mt = utils.model_selection.ModelTester(
    parameters=param,
    model=RandomForestClassifier(random_state=RND_STATE),
)
mt.test_model(X_train, y_train)
rf_clf = mt.best_estimator()
classifiers.append({'name': 'Random Forest Classifier', 'clf': rf_clf})

### AdaBoost Classifier

In [None]:
# Parameter grid handed to the project's ModelTester for AdaBoost.
param = {
    'algorithm': ['SAMME.R', 'SAMME'],
    'learning_rate': [0.1, 0.3, 0.6, 0.8, 1.0],
}

# Run the tester on the training split and register the estimator it reports as best.
mt = utils.model_selection.ModelTester(
    parameters=param,
    model=AdaBoostClassifier(random_state=RND_STATE),
)
mt.test_model(X_train, y_train)
adc_clf = mt.best_estimator()
classifiers.append({'name': 'AdaBoost Classifier', 'clf': adc_clf})

### Decision Tree Classifier

In [None]:
# Parameter grid handed to the project's ModelTester for the decision tree.
# 'random_state' appears both in the grid and on the base estimator below.
param = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': [1, 2, 3, 4, 5, 'log2', 'auto'],
    'class_weight': ['balanced'],
    'random_state': [RND_STATE],
    'presort': [True, False],
}

# Run the tester on the training split and register the estimator it reports as best.
mt = utils.model_selection.ModelTester(
    parameters=param,
    model=DecisionTreeClassifier(random_state=RND_STATE),
)
mt.test_model(X_train, y_train)
dtc_clf = mt.best_estimator()
classifiers.append({'name': 'Decision Tree Classifier', 'clf': dtc_clf})

### K-Neighbors Classifier

In [None]:
# Parameter grid handed to the project's ModelTester for k-nearest neighbours.
param = {
    'n_neighbors': [30, 50, 65, 70],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 15, 20],
    'p': [1, 2],
}

# Run the tester on the training split and register the estimator it reports as best.
mt = utils.model_selection.ModelTester(
    parameters=param,
    model=KNeighborsClassifier(),
)
mt.test_model(X_train, y_train)
knn_clf = mt.best_estimator()
classifiers.append({'name': 'K-Neighbors Classifier', 'clf': knn_clf})

### Gradient Boosting Classifier

In [None]:
# Parameter grid handed to the project's ModelTester for gradient boosting.
param = {'loss': ['deviance'], 'max_features':[1, 2, 3, 4, 5, 'log2', 'auto'], 'presort':[True, False],
         'n_estimators':[200, 300], 'min_samples_leaf' : [3]}

mt = utils.model_selection.ModelTester(parameters = param, model = GradientBoostingClassifier(random_state=RND_STATE))
# Use only the training split here, like every other model cell: the held-out
# test split is reserved for the final F1 comparison and should not take part
# in model selection.
mt.test_model(X_train, y_train)
gbc_clf = mt.best_estimator()
classifiers.append({'name': 'Gradient Boosting Classifier', 'clf': gbc_clf})

In [9]:
def print_importances(features, estimator):
    """Print the estimator's feature importances, largest first.

    Defined here because no `print_importances` is defined or imported in the
    visible notebook cells.

    Args:
        features: DataFrame whose columns are the features the estimator was fitted on.
        estimator: fitted estimator. Estimators without a `feature_importances_`
            attribute (e.g. KNeighborsClassifier) are reported and skipped.
    """
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is None:
        print('Feature importances are not available for this estimator.')
        return
    ranked = pd.Series(importances, index=features.columns).sort_values(ascending=False)
    print(ranked.to_string())


# Score every registered classifier on the held-out split and collect the results.
results_data = []
for clf in log_progress(classifiers, every = 1):
    print('\n' + clf['name'])
    # f1_score expects (y_true, y_pred); with average='weighted' the per-class
    # scores are weighted by the support of the first argument, so order matters.
    score = f1_score(y_test, clf['clf'].predict(X_test), average='weighted')
    print('F1 score: ', score)
    results_data.append({'Classifier': clf['name'], 'F1 Score': score})
    # X_train holds exactly the feature columns the estimators were fitted on.
    print_importances(X_train, clf['clf'])

VBox(children=(HTML(value=''), IntProgress(value=0, max=0)))