# Train Model
## i. Overview
The Model Trainer is used to generate and train an ML model for our use case. The data is assumed to be already preprocessed and does not need much additional manipulation.

## ii. Special Notes
1. The file is always expected to be loaded as a dense matrix. For better processing speed, we can convert the dense matrix into a sparse matrix.
2. We need to figure out how to improve processing speed while maintaining model performance.

## iii. Methodology
### Selected Algorithm

### Additional





In [10]:
#import libraries
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import imblearn as imbl
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier

# Relative path of the training data; it is read with pd.read_parquet below.
trainDataPath = 'data/trainData_2018-2019'

# Predictor columns handed to every model. The order here is the column
# order of the feature frame, so keep it stable.
selectedFeatures = [
    'Airline', 'Origin', 'Dest',
    'CRSDepTime', 'Distance',
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTimeBlk', 'ArrTimeBlk', 'DistanceGroup',
]

In [11]:
# Read the training set from trainDataPath with the fastparquet engine,
# then report its (rows, columns) shape as a quick sanity check.
trainData = pd.read_parquet(
    path=trainDataPath,
    engine='fastparquet',
)
print(f'Train data shape: {trainData.shape}')

Train data shape: (5697559, 15)


In [12]:
#define any relevant functions

def CreateModelUsingCrossVal(trainx, trainy, mode, cv=5, random_state=None):
    """Cross-validate one classifier type and print its mean test scores.

    Parameters
    ----------
    trainx
        Feature matrix, passed to ``cross_validate`` as ``X``.
    trainy
        Target labels, passed to ``cross_validate`` as ``y``.
    mode : str
        Which classifier to build (long or short name):
        ``'logistic_regression'``/``'lr'``, ``'naive_bayes'``/``'nb'``,
        ``'random_forest'``/``'rf'``, ``'balanced_random_forest'``/``'brf'``,
        ``'adaboost'``/``'ab'`` (this one builds a ``RUSBoostClassifier``).
    cv : int, default 5
        Forwarded to ``cross_validate`` as its ``cv`` argument.
    random_state : optional
        Seed forwarded to every estimator that accepts one (all except
        ``GaussianNB``). The default ``None`` leaves the estimators unseeded.

    Returns
    -------
    None
        The mean accuracy, macro precision, macro recall and macro F1 over
        the folds are printed.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the names listed above.
    """
    # Builders are lambdas so only the requested estimator is constructed.
    modelBuilders = {
        ('logistic_regression', 'lr'):
            lambda: skl.linear_model.LogisticRegression(max_iter=1000, random_state=random_state),
        ('naive_bayes', 'nb'):
            lambda: skl.naive_bayes.GaussianNB(),
        ('random_forest', 'rf'):
            lambda: skl.ensemble.RandomForestClassifier(max_depth=2, random_state=random_state),
        ('balanced_random_forest', 'brf'):
            lambda: BalancedRandomForestClassifier(max_depth=2, random_state=random_state),
        ('adaboost', 'ab'):
            lambda: RUSBoostClassifier(random_state=random_state),
    }

    model = None
    for modeNames, buildModel in modelBuilders.items():
        if mode in modeNames:
            model = buildModel()
            break

    # Fail loudly on a typo instead of hitting an unbound `model` further down.
    if model is None:
        accepted = sorted(name for names in modelBuilders for name in names)
        raise ValueError(f"Unknown mode {mode!r}; expected one of {accepted}")

    desiredScores = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # cross_validate clones the estimator and fits the clone on each training
    # split, so fitting on the full data beforehand would only cost time.
    results = cross_validate(
        model,
        trainx,
        trainy,
        cv=cv,
        scoring=desiredScores,
        return_train_score=False,
    )

    print(f"Model Type: {mode}")
    print("Scores from cross-validation are:")
    print(f'Accuracy: {results["test_accuracy"].mean()}')
    print(f'Precision: {results["test_precision_macro"].mean()}')
    print(f'Recall: {results["test_recall_macro"].mean()}')
    print(f'F-1 Score: {results["test_f1_macro"].mean()}')

In [13]:
#build model 1
# Naive Bayes on every row of trainData: the target is flattened to a 1-D
# array and the predictors are the selectedFeatures columns.
yTarget = np.ravel(trainData.loc[:, 'BinArrDelayMinutes'])
xFeatures = trainData.loc[:, selectedFeatures]

CreateModelUsingCrossVal(trainx=xFeatures, trainy=yTarget, mode="naive_bayes")

Model Type: naive_bayes
Scores from cross-validation are:
Accuracy: 0.19063655341326452
Precision: 0.17133174718571303
Recall: 0.18882033687287042
F-1 Score: 0.1395492648544096


In [19]:
#build model 2
# Logistic regression is evaluated on a 100,000-row subsample of trainData.
# The sample is seeded so the same rows are drawn after a kernel restart.
smallerSampleDataframe = trainData.sample(n=100000, random_state=42)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinArrDelayMinutes'])

CreateModelUsingCrossVal(xFeatures, yTarget, "logistic_regression")

Model Type: logistic_regression
Scores from cross-validation are:
Accuracy: 0.24611
Precision: 0.23725487225003478
Recall: 0.2398399965445192
F-1 Score: 0.19757051547384644


In [20]:
#build model 3
# Random forest is evaluated on a 100,000-row subsample of trainData.
# The sample is seeded so the same rows are drawn after a kernel restart.
smallerSampleDataframe = trainData.sample(n=100000, random_state=42)
xFeatures = smallerSampleDataframe[selectedFeatures]
yTarget = np.ravel(smallerSampleDataframe['BinArrDelayMinutes'])

CreateModelUsingCrossVal(xFeatures, yTarget, "random_forest")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Type: random_forest
Scores from cross-validation are:
Accuracy: 0.24606
Precision: 0.14508758345317935
Recall: 0.2411481289862746
F-1 Score: 0.17076651155913175


  _warn_prf(average, modifier, msg_start, len(result))
