This notebook prototypes model creation using the output of the Sampling.ipynb notebook.

In [10]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import imblearn as imbl
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier

%matplotlib inline

# Location of the Parquet dataset that is loaded with pd.read_parquet further down.
# The path is relative, so it is resolved against the kernel's working directory.
# Presumably this is the file written by Sampling.ipynb — TODO confirm.
dataPath = "Binned_Combined_Flights_2018-2019"

In [3]:
# Load the full Parquet dataset at dataPath into a single DataFrame; every later
# cell reads from this `data` variable.
data = pd.read_parquet(dataPath)

(5698059, 62)


In [11]:
# Sanity check of the loaded frame: print its (rows, columns) shape,
# then every column name on its own line.
print(data.shape)
for col in data.columns:
    print(col)

(5698059, 62)
FlightDate
Airline
Origin
Dest
Cancelled
Diverted
CRSDepTime
DepTime
DepDelayMinutes
DepDelay
ArrTime
ArrDelayMinutes
AirTime
CRSElapsedTime
ActualElapsedTime
Distance
Year
Quarter
Month
DayofMonth
DayOfWeek
Marketing_Airline_Network
Operated_or_Branded_Code_Share_Partners
DOT_ID_Marketing_Airline
IATA_Code_Marketing_Airline
Flight_Number_Marketing_Airline
Operating_Airline
DOT_ID_Operating_Airline
IATA_Code_Operating_Airline
Tail_Number
Flight_Number_Operating_Airline
OriginAirportID
OriginAirportSeqID
OriginCityMarketID
OriginCityName
OriginState
OriginStateFips
OriginStateName
OriginWac
DestAirportID
DestAirportSeqID
DestCityMarketID
DestCityName
DestState
DestStateFips
DestStateName
DestWac
DepDel15
DepartureDelayGroups
DepTimeBlk
TaxiOut
WheelsOff
WheelsOn
TaxiIn
CRSArrTime
ArrDelay
ArrDel15
ArrivalDelayGroups
ArrTimeBlk
DistanceGroup
DivAirportLandings
BinArrDelayMinutes


In [36]:
# Build a classifier selected by name and report its cross-validated scores.
def CreateModelUsingCrossVal(trainx, trainy, mode, cv=5, random_state=None):
    """Cross-validate a classifier chosen by name and print its mean scores.

    Parameters
    ----------
    trainx : array-like
        Feature matrix passed straight to ``cross_validate``.
    trainy : array-like
        Target labels passed straight to ``cross_validate``.
    mode : str
        Name (or short alias) of the classifier to evaluate:

        * ``'logistic_regression'`` / ``'lr'`` -- LogisticRegression(max_iter=1000)
        * ``'naive_bayes'`` / ``'nb'`` -- GaussianNB()
        * ``'random_forest'`` / ``'rf'`` -- RandomForestClassifier(max_depth=2)
        * ``'balanced_random_forest'`` / ``'brf'`` -- BalancedRandomForestClassifier(max_depth=2)
        * ``'adaboost'`` / ``'ab'`` / ``'rusboost'`` -- RUSBoostClassifier()
    cv : int, default 5
        ``cv`` argument forwarded to ``cross_validate``.
    random_state : int or None, default None
        Seed forwarded to every estimator that accepts one (GaussianNB does
        not). ``None`` keeps each estimator's default, unseeded behaviour.

    Returns
    -------
    None
        The mean accuracy, macro precision, macro recall and macro F1 over
        the test folds are printed rather than returned.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the names listed above.
    """
    # Short spellings accepted for each canonical model name. The historical
    # 'adaboost'/'ab' names are kept even though the estimator they select is
    # a RUSBoostClassifier; 'rusboost' is offered as the accurate spelling.
    aliases = {
        'lr': 'logistic_regression',
        'nb': 'naive_bayes',
        'rf': 'random_forest',
        'brf': 'balanced_random_forest',
        'ab': 'adaboost',
        'rusboost': 'adaboost',
    }

    # Factories are lazy so that validating `mode` never constructs an estimator.
    builders = {
        'logistic_regression': lambda: skl.linear_model.LogisticRegression(
            max_iter=1000, random_state=random_state),
        'naive_bayes': lambda: skl.naive_bayes.GaussianNB(),
        'random_forest': lambda: skl.ensemble.RandomForestClassifier(
            max_depth=2, random_state=random_state),
        'balanced_random_forest': lambda: BalancedRandomForestClassifier(
            max_depth=2, random_state=random_state),
        'adaboost': lambda: RUSBoostClassifier(random_state=random_state),
    }

    canonical = aliases.get(mode, mode)
    if canonical not in builders:
        valid = sorted(set(builders) | set(aliases))
        raise ValueError(f"Unknown mode {mode!r}; expected one of {valid}")

    model = builders[canonical]()

    desiredScores = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # cross_validate fits its own clone of the estimator on every fold, so the
    # model is deliberately NOT fitted on the full data first: that extra pass
    # would cost a complete training run and its result would be discarded.
    results = cross_validate(model, trainx, trainy, cv=cv, scoring=desiredScores,
                             return_train_score=False)

    print(f"Model Type: {mode}")
    print("Scores from cross-validation are:")
    print(f'Accuracy: {results["test_accuracy"].mean()}')
    print(f'Precision: {results["test_precision_macro"].mean()}')
    print(f'Recall: {results["test_recall_macro"].mean()}')
    print(f'F-1 Score: {results["test_f1_macro"].mean()}')

In [37]:
# Columns fed to the model as inputs.
selectedFeatures = [
    "CRSDepTime",
    "Distance",
    "Year",
    "Quarter",
    "DistanceGroup",
]

# Target column flattened to a 1-D array, and the matching feature frame.
yTarget = np.ravel(data['BinArrDelayMinutes'])
xFeatures = data[selectedFeatures]

# Evaluate the Gaussian naive Bayes option on the selected features.
CreateModelUsingCrossVal(trainx=xFeatures, trainy=yTarget, mode="naive_bayes")


Model Type: naive_bayes
Scores from cross-validation are:
Accuracy: 0.14822995493939878
Precision: 0.14435980618364183
Recall: 0.14618760747951648
F-1 Score: 0.10678071988942346
