# Train Model
## i. Overview
The Model Trainer is used to build and train an ML model for our use case. The data is assumed to be already preprocessed and does not need much additional manipulation.

## ii. Special Notes
1. The file is always expected to be loaded as a dense matrix. For better processing speed, we can convert the dense matrix into a sparse matrix.
2. We need to figure out how to improve processing speed without degrading model performance.

## iii. Methodology
### Selected Algorithm

### Additional





In [1]:
#import libraries
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import imblearn as imbl
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier

# Locations of the training data; both are read with pd.read_parquet below.
# One data set is used for the multiclass target, the other for the binary target.
trainDataPath = 'data/trainData_2018-2019'
trainDataPathBinaryTarget = 'data/trainDataBinaryTarget_2018-2019'

# Columns selected from the training frames and used as model inputs
# (the x features) for every model built in this notebook.
selectedFeatures = [
    'Airline',
    'Origin',
    'Dest',
    'CRSDepTime', 
    'Distance', 
    # 'Year',  (currently excluded from the feature set)
    'Quarter', 
    'Month', 
    'DayofMonth', 
    'DayOfWeek', 
    'DepTimeBlk', 
    'ArrTimeBlk', 
    'DistanceGroup'
    ]

In [2]:
#load data and set up for training
# Training frame used for the multiclass models (target column
# 'BinArrDelayMinutes' is read from it in the model cells).
trainData = pd.read_parquet(trainDataPath, engine='fastparquet')
print(f'Train data shape: {trainData.shape}')

# Training frame used for the binary models (target column
# 'BinaryArrDelayMinutes' is read from it in the model cells).
trainDataBinaryTarget = pd.read_parquet(trainDataPathBinaryTarget, engine='fastparquet')
print(f'Binary target train data shape: {trainDataBinaryTarget.shape}')

Train data shape: (5697559, 16)
Binary target train data shape: (13503883, 15)


In [3]:
#define any relevant functions

# SaveModel
#
# function that can be used to save the model that we just built
# after fitting the model, save it to filename
import pickle
def SaveModel(model, filename):
    """Pickle ``model`` to ``filename``.

    Args:
        model: Object to serialize (in this notebook, a fitted estimator).
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    # Use a context manager so the file handle is flushed and closed
    # deterministically, even if pickling raises.
    with open(filename, 'wb') as modelFile:
        pickle.dump(model, modelFile)

def CreateModelUsingCrossVal(trainx, trainy, mode):
    """Fit, save, and cross-validate a multiclass classifier.

    The estimator selected by ``mode`` is fitted on the full training data
    and pickled to ``models/multiclass_model_<mode>.pkl``. The same estimator
    configuration is then evaluated with 5-fold cross-validation, and the
    mean accuracy and macro-averaged precision, recall and F1 are printed.

    Args:
        trainx: Feature matrix passed to ``fit`` and ``cross_validate``.
        trainy: Target labels corresponding to the rows of ``trainx``.
        mode: Model selector. One of ``'logistic_regression'``/``'lr'``,
            ``'naive_bayes'``/``'nb'``, ``'random_forest'``/``'rf'``,
            ``'balanced_random_forest'``/``'brf'`` or ``'adaboost'``/``'ab'``
            (the last one builds a ``RUSBoostClassifier``). The string is
            used verbatim in the saved file name.

    Raises:
        ValueError: If ``mode`` is not one of the supported selectors.
    """
    if mode in ('logistic_regression', 'lr'):
        model = skl.linear_model.LogisticRegression(max_iter=1000)
    elif mode in ('naive_bayes', 'nb'):
        model = skl.naive_bayes.GaussianNB()
    elif mode in ('random_forest', 'rf'):
        model = skl.ensemble.RandomForestClassifier(max_depth=2)
    elif mode in ('balanced_random_forest', 'brf'):
        model = BalancedRandomForestClassifier(max_depth=2)
    elif mode in ('adaboost', 'ab'):
        model = RUSBoostClassifier()
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``model`` variable further down.
        raise ValueError(f"Unsupported model mode: {mode!r}")

    # Persist a model trained on all of the supplied data.
    model.fit(trainx, trainy)
    SaveModel(model, "models/multiclass_model_" + mode + ".pkl")

    desiredScores = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # cross_validate fits clones of the estimator on each fold, so the model
    # saved above is not modified by the evaluation.
    results = cross_validate(model, trainx, trainy, cv=5, scoring=desiredScores, return_train_score=False)

    print(f"Model Type: {mode}")
    print("Scores from cross-validation are:")
    print(f'Accuracy: {results["test_accuracy"].mean()}')
    print(f'Precision: {results["test_precision_macro"].mean()}')
    print(f'Recall: {results["test_recall_macro"].mean()}')
    print(f'F-1 Score: {results["test_f1_macro"].mean()}')

def BinaryTargetCreateModelUsingCrossVal(trainx, trainy, mode):
    """Fit, save, and cross-validate a binary classifier.

    The estimator selected by ``mode`` is fitted once on the full training
    data and pickled to ``models/binary_model_<mode>.pkl``. The same
    estimator configuration is then evaluated with 5-fold cross-validation,
    and the mean accuracy, precision, recall, F1 and ROC AUC are printed.

    Args:
        trainx: Feature matrix passed to ``fit`` and ``cross_validate``.
        trainy: Binary target labels corresponding to the rows of ``trainx``.
        mode: Model selector. One of ``'logistic_regression'``/``'lr'``,
            ``'naive_bayes'``/``'nb'``, ``'random_forest'``/``'rf'``,
            ``'balanced_random_forest'``/``'brf'`` or ``'adaboost'``/``'ab'``
            (the last one builds a ``RUSBoostClassifier``). The string is
            used verbatim in the saved file name.

    Raises:
        ValueError: If ``mode`` is not one of the supported selectors.
    """
    if mode in ('logistic_regression', 'lr'):
        model = skl.linear_model.LogisticRegression(max_iter=1000)
    elif mode in ('naive_bayes', 'nb'):
        model = skl.naive_bayes.GaussianNB()
    elif mode in ('random_forest', 'rf'):
        model = skl.ensemble.RandomForestClassifier(max_depth=2)
    elif mode in ('balanced_random_forest', 'brf'):
        model = BalancedRandomForestClassifier(max_depth=2)
    elif mode in ('adaboost', 'ab'):
        model = RUSBoostClassifier()
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``model`` variable further down.
        raise ValueError(f"Unsupported model mode: {mode!r}")

    # Fit exactly once; a second fit on the same data only repeats the
    # training cost before the model is saved.
    model.fit(trainx, trainy)
    SaveModel(model, "models/binary_model_" + mode + ".pkl")

    desiredScores = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # cross_validate fits clones of the estimator on each fold, so the model
    # saved above is not modified by the evaluation.
    results = cross_validate(model, trainx, trainy, cv=5, scoring=desiredScores, return_train_score=False)

    print(f"Model Type: {mode}")
    print("Scores from cross-validation are:")
    print(f'Accuracy: {results["test_accuracy"].mean()}')
    print(f'Precision: {results["test_precision"].mean()}')
    print(f'Recall: {results["test_recall"].mean()}')
    print(f'F-1 Score: {results["test_f1"].mean()}')
    print(f'AUC: {results["test_roc_auc"].mean()}')

In [4]:
#build model 1
# Naive Bayes, trained and evaluated on the full training frames.

# Multiclass run: features plus the 'BinArrDelayMinutes' target, flattened to 1-D.
xFeatures = trainData[selectedFeatures]
yTarget = np.ravel(trainData['BinArrDelayMinutes'])

print('MultiClass Evaluation')
CreateModelUsingCrossVal(xFeatures, yTarget, "naive_bayes")
print()

# Binary run: xFeatures / yTarget are rebound to the binary-target frame.
xFeatures = trainDataBinaryTarget[selectedFeatures]
yTarget = np.ravel(trainDataBinaryTarget['BinaryArrDelayMinutes'])
print('Binary Class Evaluation')
BinaryTargetCreateModelUsingCrossVal(xFeatures, yTarget, "naive_bayes")

MultiClass Evaluation
Model Type: naive_bayes
Scores from cross-validation are:
Accuracy: 0.2160500671002426
Precision: 0.19806963728234078
Recall: 0.21409934679506315
F-1 Score: 0.16378578619225465

Binary Class Evaluation
Model Type: naive_bayes
Scores from cross-validation are:
Accuracy: 0.6094819537499819
Precision: 0.4200300746197776
Recall: 0.2899803865846082
F-1 Score: 0.34254612618376
AUC: 0.5773607469326354


In [5]:
#build model 2
# Logistic regression, trained and evaluated on a 100000-row random sample
# of each training frame rather than on the full data.
# NOTE(review): sample() is called without random_state, so each run draws a
# different subset and the printed scores are not exactly reproducible.
smallerSampleDataframe = trainData.sample(n=100000)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinArrDelayMinutes'])

print('MultiClass Evaluation')
CreateModelUsingCrossVal(xFeatures, yTarget, "logistic_regression")
print()

# Binary run: a fresh sample is drawn from the binary-target frame.
smallerSampleDataframe = trainDataBinaryTarget.sample(n=100000)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinaryArrDelayMinutes'])
print('Binary Class Evaluation')
BinaryTargetCreateModelUsingCrossVal(xFeatures, yTarget, "logistic_regression")


MultiClass Evaluation


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Type: logistic_regression
Scores from cross-validation are:
Accuracy: 0.24562
Precision: 0.2336213556411856
Recall: 0.2404143292970174
F-1 Score: 0.1986082526964126

Binary Class Evaluation
Model Type: logistic_regression
Scores from cross-validation are:
Accuracy: 0.6466700000000001
Precision: 0.4446666666666667
Recall: 0.0014157247448902282
F-1 Score: 0.0028210946277759854
AUC: 0.5796053782315183


In [6]:
#build model 3
# Random forest, trained and evaluated on a 100000-row random sample
# of each training frame rather than on the full data.
# NOTE(review): sample() is called without random_state, so each run draws a
# different subset and the printed scores are not exactly reproducible.
smallerSampleDataframe = trainData.sample(n=100000)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinArrDelayMinutes'])

print('MultiClass Evaluation')
CreateModelUsingCrossVal(xFeatures, yTarget, "random_forest")
print()

# Binary run: a fresh sample is drawn from the binary-target frame.
smallerSampleDataframe = trainDataBinaryTarget.sample(n=100000)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinaryArrDelayMinutes'])
print('Binary Class Evaluation')
BinaryTargetCreateModelUsingCrossVal(xFeatures, yTarget, "random_forest")

MultiClass Evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Type: random_forest
Scores from cross-validation are:
Accuracy: 0.24991000000000002
Precision: 0.1476562246911561
Recall: 0.2442139985604649
F-1 Score: 0.16829609209091845

Binary Class Evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Type: random_forest
Scores from cross-validation are:
Accuracy: 0.64931
Precision: 0.0
Recall: 0.0
F-1 Score: 0.0
AUC: 0.5840577296230373


  _warn_prf(average, modifier, msg_start, len(result))
