In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

## Loading the data

In [2]:
df1 = pd.read_csv("DatasetCleanFinal.csv", sep=",")

In [3]:
df1.head()

Unnamed: 0,ActualLandingTime,Airline,AircraftType,RunwayNumber,StandNumber,Date,AWNDmean,PRCPmean,TAVGmean,WDF5mean,...,IceRoad,WetDay,ShipmentWeightCat,NbPlanesLast10Mn,Hour,WeekDay,ParkingArea,StandRank,TaxiTime,LogTaxiTime
0,2018-07-30 04:25:00,CAR1,B737/9-WL,2.0,4,2018-07-30 00:00:00,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,4,0,16223.1,1,4.0,1.609438
1,2018-07-30 08:14:00,CAR1,B757/2-WL,2.0,7,2018-07-30 00:00:00,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,20919.9,1,4.0,1.609438
2,2018-07-30 08:26:00,CAR1,A321/2,1.0,8,2018-07-30 00:00:00,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,16337.8,1,3.0,1.386294
3,2018-07-30 08:42:00,CAR1,B737/9-WL,2.0,9,2018-07-30 00:00:00,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,16223.1,1,4.0,1.609438
4,2018-07-30 08:52:00,CAR1,A321/2,2.0,11,2018-07-30 00:00:00,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,16337.8,1,5.0,1.791759


In [4]:
df1.shape

(129018, 26)

In [5]:
df1.isnull().sum()

ActualLandingTime    0
Airline              0
AircraftType         0
RunwayNumber         0
StandNumber          0
Date                 0
AWNDmean             0
PRCPmean             0
TAVGmean             0
WDF5mean             0
WSF2max              0
WSF5max              0
WT01max              0
WT02max              0
WT03max              0
WT08max              0
IceRoad              0
WetDay               0
ShipmentWeightCat    0
NbPlanesLast10Mn     0
Hour                 0
WeekDay              0
ParkingArea          0
StandRank            0
TaxiTime             0
LogTaxiTime          0
dtype: int64

In [6]:
df1.drop(['Date','ActualLandingTime', 'AircraftType'], axis=1, inplace=True)

In [7]:
df1.shape

(129018, 23)

## One Hot encoding

In [8]:
df1['Airline'] = pd.Categorical(df1['Airline'])
df1['ShipmentWeightCat'] = pd.Categorical(df1['ShipmentWeightCat'])

In [9]:
dfDummiesAirline = pd.get_dummies(df1['Airline'], prefix = 'Airline')
dfDummiesShipmentWeightCat = pd.get_dummies(df1['ShipmentWeightCat'], prefix = 'ShipmentWeightCat')

In [10]:
df2 = pd.concat([df1, dfDummiesAirline, ], axis=1)

In [11]:
df2.shape

(129018, 27)

In [12]:
df2.head()

Unnamed: 0,Airline,RunwayNumber,StandNumber,AWNDmean,PRCPmean,TAVGmean,WDF5mean,WSF2max,WSF5max,WT01max,...,Hour,WeekDay,ParkingArea,StandRank,TaxiTime,LogTaxiTime,Airline_CAR1,Airline_CAR2,Airline_CAR3,Airline_CAR4
0,CAR1,2.0,4,4.173333,0.006667,79.0,153.333333,15.0,19.0,1.0,...,4,0,16223.1,1,4.0,1.609438,1,0,0,0
1,CAR1,2.0,7,4.173333,0.006667,79.0,153.333333,15.0,19.0,1.0,...,8,0,20919.9,1,4.0,1.609438,1,0,0,0
2,CAR1,1.0,8,4.173333,0.006667,79.0,153.333333,15.0,19.0,1.0,...,8,0,16337.8,1,3.0,1.386294,1,0,0,0
3,CAR1,2.0,9,4.173333,0.006667,79.0,153.333333,15.0,19.0,1.0,...,8,0,16223.1,1,4.0,1.609438,1,0,0,0
4,CAR1,2.0,11,4.173333,0.006667,79.0,153.333333,15.0,19.0,1.0,...,8,0,16337.8,1,5.0,1.791759,1,0,0,0


In [13]:
X = df2.drop(['TaxiTime', 'LogTaxiTime', 'Airline', 'ShipmentWeightCat'], axis=1)
y = df2['LogTaxiTime']

In [14]:
print(X.shape)
print(y.shape)

(129018, 23)
(129018,)


## Generate training & testing sets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False,random_state=42)

In [16]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(86442, 23) (86442,)
(42576, 23) (42576,)


## Random Forest Regressor

In [17]:
clf = RandomForestRegressor(n_estimators = 100, max_depth = 10, n_jobs =-1)

In [18]:
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
y_test.head(10)

86442    1.609438
86443    1.791759
86444    2.079442
86445    2.302585
86446    2.302585
86447    1.945910
86448    2.890372
86449    2.944439
86450    1.791759
86451    2.484907
Name: LogTaxiTime, dtype: float64

In [20]:
y_pred = np.exp(clf.predict(X_test))-1

In [21]:
y_pred[:10]

array([ 5.70140098,  5.69078432,  6.20648505,  6.84460171,  6.84460171,
        5.80865767, 10.21583056,  9.39369792,  5.744754  ,  9.37655751])

In [22]:
np.sqrt(mean_squared_error(np.exp(y_test)-1, y_pred))

4.04681871146359

*Compute accuracy +/- 3 and 5 minutes*

In [23]:
accuracy_dataframe = pd.DataFrame(data={'y_pred':y_pred, 'y_test':np.exp(y_test)-1})

In [24]:
accuracy_dataframe

Unnamed: 0,y_pred,y_test
86442,5.701401,4.0
86443,5.690784,5.0
86444,6.206485,7.0
86445,6.844602,9.0
86446,6.844602,9.0
86447,5.808658,6.0
86448,10.215831,17.0
86449,9.393698,18.0
86450,5.744754,5.0
86451,9.376558,11.0


In [25]:
accuracy_dataframe['3MinutesRange'] = abs(accuracy_dataframe['y_pred'] - accuracy_dataframe['y_test']) <= 3
accuracy_dataframe['5MinutesRange'] = abs(accuracy_dataframe['y_pred'] - accuracy_dataframe['y_test']) <= 5

In [26]:
accuracy_dataframe.head()

Unnamed: 0,y_pred,y_test,3MinutesRange,5MinutesRange
86442,5.701401,4.0,True,True
86443,5.690784,5.0,True,True
86444,6.206485,7.0,True,True
86445,6.844602,9.0,True,True
86446,6.844602,9.0,True,True


In [27]:
print(accuracy_dataframe['3MinutesRange'].value_counts(normalize=True) * 100)

print(accuracy_dataframe['5MinutesRange'].value_counts(normalize=True) * 100)

True     81.285231
False    18.714769
Name: 3MinutesRange, dtype: float64
True     92.380684
False     7.619316
Name: 5MinutesRange, dtype: float64
