In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read 'Jan_2019_ontime.csv' (path is relative to the working directory)
# into a DataFrame and preview the first rows.
jan_2019 = pd.read_csv('Jan_2019_ontime.csv')
jan_2019.head()

In [None]:
# Show the dtype pandas inferred for each column.
jan_2019.dtypes

Matplotlib

In [None]:
# Number of rows per OP_CARRIER value.
jan_2019["OP_CARRIER"].value_counts()

In [None]:
# Per-carrier mean of every numeric column (despite the variable name, the
# grouping key is the carrier, not the airport). DEP_DEL15 is filtered on
# 0.0 / 1.0 later in this notebook, so its mean is read here as the share of
# delayed departures.
# numeric_only=True is required: non-numeric columns cannot be averaged, and
# pandas >= 2.0 raises TypeError instead of silently dropping them.
grouped_airport = (
    jan_2019
    .groupby("OP_CARRIER")
    .mean(numeric_only=True)
    .reset_index()
)
# Five carriers with the lowest mean DEP_DEL15.
grouped_airport_best = grouped_airport.nsmallest(5, "DEP_DEL15")
grouped_airport_best

In [None]:
# Five carriers with the highest mean DEP_DEL15.
grouped_airport_worst = grouped_airport.nlargest(n=5, columns="DEP_DEL15")
grouped_airport_worst

In [None]:
# Bar chart of the five carriers with the lowest mean DEP_DEL15.
plt.figure(figsize=(15, 5))

x_axis = grouped_airport_best["OP_CARRIER"]
y_axis = 100 * grouped_airport_best["DEP_DEL15"]  # mean of the 0/1 flag, as a percentage
plt.bar(x_axis, y_axis, align="center", color='skyblue')

plt.title("Airlines with the least chance of a delay in January")
plt.xlabel("Airline")
plt.ylabel("Flights delayed (%)")
# Pad the x-range slightly beyond the first and last bar.
plt.xlim(-0.75, len(x_axis) - 0.25)

In [None]:
# Bar chart of the five carriers with the highest mean DEP_DEL15.
plt.figure(figsize=(15, 5))

x_axis = grouped_airport_worst["OP_CARRIER"]
y_axis = 100 * grouped_airport_worst["DEP_DEL15"]  # mean of the 0/1 flag, as a percentage
plt.bar(x_axis, y_axis, align="center", alpha=0.25, color='r')

plt.title("Airlines with the highest chance of a delay in January")
plt.xlabel("Airline")
plt.ylabel("Flights delayed (%)")
# Pad the x-range slightly beyond the first and last bar.
plt.xlim(-0.75, len(x_axis) - 0.25)

In [None]:
# Class balance of the prediction target: row count per DEP_DEL15 value
# (value_counts omits missing values by default).
jan_2019['DEP_DEL15'].value_counts()

In [None]:
# Row count per CANCELLED value.
jan_2019['CANCELLED'].value_counts()

In [None]:
# Row count per DIVERTED value.
jan_2019['DIVERTED'].value_counts()

In [None]:
# Row count per ARR_DEL15 value.
jan_2019['ARR_DEL15'].value_counts()

In [None]:
# Row count per ORIGIN value, most frequent first.
jan_2019['ORIGIN'].value_counts()

In [None]:
# Summary statistics for the ORIGIN column.
jan_2019['ORIGIN'].describe()

In [None]:
# Row count per DEST value, most frequent first.
jan_2019['DEST'].value_counts()

In [None]:
# Rows whose DEP_DEL15 flag equals 0.0; rows with a missing flag are excluded
# because the comparison is False for NaN.
jan_2019_Ontime = jan_2019.loc[jan_2019['DEP_DEL15'].eq(0.0)]
jan_2019_Ontime.head()

In [None]:
# Sanity check: the subset should contain only the 0.0 class.
jan_2019_Ontime['DEP_DEL15'].value_counts()

In [None]:
# Down-sample the DEP_DEL15 == 0.0 rows so that as many remain as there are
# DEP_DEL15 == 1.0 rows.
# The row count to discard is derived from the data instead of the previous
# hard-coded 368703 (presumably that same difference for this particular CSV;
# verify), so the cell stays correct if the input file changes.
n_delayed = int(jan_2019['DEP_DEL15'].eq(1.0).sum())
remove_n = max(len(jan_2019_Ontime) - n_delayed, 0)

# A seeded generator makes the sampled subset identical on every
# Restart & Run All.
drop_indices = np.random.default_rng(42).choice(
    jan_2019_Ontime.index, remove_n, replace=False
)
jan_2019_Ontime_R = jan_2019_Ontime.drop(drop_indices)
jan_2019_Ontime_R.head()

In [None]:
# Row count of the down-sampled 0.0 class.
jan_2019_Ontime_R['DEP_DEL15'].value_counts()

In [None]:
# Rows whose DEP_DEL15 flag equals 1.0.
jan_2019_Delayed = jan_2019.loc[jan_2019['DEP_DEL15'].eq(1.0)]
jan_2019_Delayed.head()

In [None]:
# Sanity check: the subset should contain only the 1.0 class.
jan_2019_Delayed['DEP_DEL15'].value_counts()

In [None]:
# Stack the down-sampled 0.0 rows on top of the 1.0 rows. Original index
# labels are kept and the rows are not shuffled here.
jan_2019_Balanced = pd.concat(
    [jan_2019_Ontime_R, jan_2019_Delayed],
    axis=0,
)
jan_2019_Balanced.head()

In [None]:
# Confirm the class balance of the combined frame.
jan_2019_Balanced['DEP_DEL15'].value_counts()

In [None]:
# Column dtypes again (same output as the earlier dtypes cell), shown here
# as a reference for choosing which columns to drop.
jan_2019.dtypes

In [None]:
# Inspect the 'Unnamed: 21' column before dropping it.
# NOTE(review): presumably an empty column produced by a trailing delimiter
# in the CSV - confirm from the describe() output.
jan_2019_Balanced['Unnamed: 21'].describe()

In [None]:
# Remove the columns that are not used as model features.
jan_2019_Balanced_Dropped = jan_2019_Balanced.drop(
    columns=[
        'OP_UNIQUE_CARRIER',
        'TAIL_NUM',
        'OP_CARRIER_FL_NUM',
        'ORIGIN_AIRPORT_SEQ_ID',
        'DEST_AIRPORT_SEQ_ID',
        'DEP_TIME_BLK',
        'Unnamed: 21',
    ]
)

jan_2019_Balanced_Dropped.head()

In [None]:
# Features: every remaining column except the target. Target: DEP_DEL15.
# NOTE(review): ARR_DEL15 is still part of X at this point; confirm that it
# (and any other post-departure column) is meant to be available to the model.
X = jan_2019_Balanced_Dropped.drop(columns=["DEP_DEL15"])
y = jan_2019_Balanced_Dropped['DEP_DEL15']
X.head()

In [None]:
# All feature column names, in frame order.
names = list(X.columns)
names

In [None]:
# Names of the object-dtype columns; these feed the categorical pipeline.
cat = X.select_dtypes(include=["object"]).columns.tolist()
cat

In [None]:
# Names of every non-object column; these feed the numeric pipeline.
num = X.select_dtypes(exclude=["object"]).columns.tolist()
num

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectFromModel

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not seen during fit.
# The encoder's `sparse=False` argument was removed in scikit-learn 1.4
# (renamed `sparse_output` in 1.2), so it is no longer passed here; dense
# output is enforced on the ColumnTransformer below instead, which works on
# both old and new releases.
cat_pip = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ("one_hot", OneHotEncoder(handle_unknown='ignore'))
])
# Numeric columns: median-impute, then rescale with MinMaxScaler.
num_pip = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])

# Route each column list to its pipeline. sparse_threshold=0 makes the
# combined output always a dense array, matching what the previous
# `sparse=False` encoder produced.
data_processing = ColumnTransformer(
    [
        ('cat', cat_pip, cat),
        ('num', num_pip, num)
    ],
    sparse_threshold=0,
)


RandomForestClassifier - Default

In [None]:
# Random forest with library-default hyper-parameters. random_state is fixed
# (same seed as the train/test split) so the reported accuracy is repeatable
# across Restart & Run All; verbose=1 prints training progress.
rf_1 = RandomForestClassifier(verbose=1, random_state=42)

In [None]:
# End-to-end estimator: shared preprocessing followed by the default forest.
model_rf = Pipeline(
    steps=[
        ('processing', data_processing),
        ('classify', rf_1),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
model_rf.fit(X_train, y_train)

In [None]:
# Report the pipeline's score() on the training and held-out splits; a large
# gap between the two numbers indicates over-fitting.
print('Train Acc: %.3f' % model_rf.score(X_train, y_train))
print('Test Acc: %.3f' % model_rf.score(X_test, y_test))

Saving the model

In [None]:
from joblib import dump, load

In [None]:
# Persist the fitted default random-forest pipeline. The previous code dumped
# `model`, which is only defined in the gradient-boosting section further
# down (NameError on a fresh kernel, or the wrong estimator on a stale one).
dump(model_rf, 'rf.pkl')

In [None]:
# Reload the pipeline from disk to confirm the saved file is usable.
# joblib files are pickle-based: only load files from a trusted source.
model_rf_load = load("rf.pkl")

Checking random forest classifier prediction

In [None]:
# Spot-check: compare the reloaded pipeline's predictions with the true
# labels on the first five held-out rows.
encoded_predictions = model_rf_load.predict(X_test.iloc[:5])
print(f"Predicted classes: {encoded_predictions}")
print(f"Actual classes: {list(y_test.iloc[:5])}")

RandomForestClassifier - Params

In [None]:
# Tuned forest: 200 trees capped at depth 7, trained on 2 worker processes.
# random_state is fixed (same seed as the train/test split) so the reported
# accuracy is repeatable; verbose=2 prints per-tree progress.
rf_2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    verbose=2,
    n_jobs=2,
    random_state=42,
)

In [None]:
# This cell used to re-assign rf_2 = RandomForestClassifier(), which threw
# away the hyper-parameters chosen in the previous cell and made the "Params"
# experiment a duplicate of the default one. It now only displays the
# configuration that the pipeline below will use.
rf_2.get_params()

In [None]:
# End-to-end estimator: shared preprocessing followed by the rf_2 forest.
model_rf_2 = Pipeline(
    steps=[
        ('processing', data_processing),
        ('classify', rf_2),
    ]
)

In [None]:
# Fit the preprocessing steps and the rf_2 classifier on the training split.
model_rf_2.fit(X_train, y_train)

In [None]:
# Report the second pipeline's score() on the training and held-out splits.
print('Train Acc: %.3f' % model_rf_2.score(X_train, y_train))
print('Test Acc: %.3f' % model_rf_2.score(X_test, y_test))

In [None]:
# Persist the fitted second random-forest pipeline. The previous code dumped
# `model`, which is only defined in the gradient-boosting section further
# down, so 'rf_2.pkl' never contained this estimator.
dump(model_rf_2, 'rf_2.pkl')

In [None]:
# Reload the pipeline from disk to confirm the saved file is usable.
# joblib files are pickle-based: only load files from a trusted source.
model_rf_2_load = load("rf_2.pkl")

Checking random forest classifier (2nd model) prediction

In [None]:
# Spot-check: compare the reloaded pipeline's predictions with the true
# labels on the first five held-out rows.
encoded_predictions = model_rf_2_load.predict(X_test.iloc[:5])
print(f"Predicted classes: {encoded_predictions}")
print(f"Actual classes: {list(y_test.iloc[:5])}")

GradientBoostingClassifier

In [None]:
# Gradient boosting with 50 depth-3 trees. random_state is fixed (same seed
# as the train/test split) so repeated runs give the same model; verbose=1
# prints training progress.
gbc = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=3,
    verbose=1,
    random_state=42,
)

In [None]:
# Gradient-boosting pipeline: preprocess, keep only the features a boosting
# model rates as important, then classify on the reduced feature set.
# The same `gbc` object is passed to both steps.
# NOTE(review): SelectFromModel is documented to fit a clone of its estimator
# when prefit=False, so the selector and the classifier should not share
# fitted state - confirm against the installed scikit-learn version.
model = Pipeline(
    steps=[
        ('processing', data_processing),
        ('feature_selection', SelectFromModel(gbc)),
        ('classify', gbc),
    ]
)

In [None]:
# Fit preprocessing, feature selection and the classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Report the gradient-boosting pipeline's score() on the training and
# held-out splits.
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Saving and loading the model

In [None]:
# Persist the fitted gradient-boosting pipeline to 'gbc.pkl'.
dump(model, 'gbc.pkl')

In [None]:
# Reload the pipeline from disk to confirm the saved file is usable.
# joblib files are pickle-based: only load files from a trusted source.
model_load = load("gbc.pkl")

Checking gradient boosting classifier prediction

In [None]:
# Spot-check: compare the reloaded pipeline's predictions with the true
# labels on the first five held-out rows.
encoded_predictions = model_load.predict(X_test.iloc[:5])
print(f"Predicted classes: {encoded_predictions}")
print(f"Actual classes: {list(y_test.iloc[:5])}")