# Random Forest Regressor Model

In [16]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error

## Loading the data

In [17]:
#To remove
df1 = pd.read_csv("TestsetCleanFinal.csv", sep=",")

In [None]:
df1 = pd.read_csv("../../data/TestsetCleanFinal.csv", sep=",")

In [18]:
df1.head()

Unnamed: 0,ActualLandingTime,Airline,AircraftType,RunwayNumber,StandNumber,Date,AWNDmean,PRCPmean,TAVGmean,WDF5mean,...,IceRoad,WetDay,ShipmentWeightCat,NbPlanesLast10Mn,Hour,WeekDay,ParkingArea,StandRank,TaxiTime,LogTaxiTime
0,2019-01-17 03:18:00,CAR1,A321/2,9.0,128,2019-01-17 00:00:00,2.61,0.043,42.0,223.333333,...,1.0,1.0,M,0.0,3,3,16337.8,1,5.0,1.791759
1,2019-01-17 03:34:00,CAR1,CS100,9.0,37,2019-01-17 00:00:00,2.61,0.043,42.0,223.333333,...,1.0,1.0,L,0.0,3,3,13110.0,0,8.0,2.197225
2,2019-01-17 03:36:00,CAR1,A321/2,9.0,9,2019-01-17 00:00:00,2.61,0.043,42.0,223.333333,...,1.0,1.0,M,1.0,3,3,16337.8,1,7.0,2.079442
3,2019-01-17 03:43:00,CAR1,CS100,0.0,28,2019-01-17 00:00:00,2.61,0.043,42.0,223.333333,...,1.0,1.0,L,2.0,3,3,13110.0,0,5.0,1.791759
4,2019-01-17 03:44:00,CAR1,MD88,9.0,32,2019-01-17 00:00:00,2.61,0.043,42.0,223.333333,...,1.0,1.0,L,2.0,3,3,15739.7,1,5.0,1.791759


We drop the AircraftType. We already put this information in the parking area column.

In [19]:
df1.drop(['AircraftType'], axis=1, inplace=True)

We drop useless columns

In [20]:
df1.drop(['ActualLandingTime', 'Date', 'StandNumber'], axis=1, inplace=True)

In [21]:
df1.shape

(8904, 22)

## One-Hot Encoding

As the Random Forest Regressor of scikit learn does not support categorical variables, we use One-Hot encoding.

In [22]:
df1.dtypes

Airline               object
RunwayNumber         float64
AWNDmean             float64
PRCPmean             float64
TAVGmean             float64
WDF5mean             float64
WSF2max              float64
WSF5max              float64
WT01max              float64
WT02max              float64
WT03max              float64
WT08max              float64
IceRoad              float64
WetDay               float64
ShipmentWeightCat     object
NbPlanesLast10Mn     float64
Hour                   int64
WeekDay                int64
ParkingArea          float64
StandRank              int64
TaxiTime             float64
LogTaxiTime          float64
dtype: object

In [23]:
df1['RunwayNumber'] = pd.Categorical(df1['RunwayNumber'])
df1['Airline'] = pd.Categorical(df1['Airline'])
df1['ShipmentWeightCat'] = pd.Categorical(df1['ShipmentWeightCat'])

In [24]:
dfDummiesRunwayNumber = pd.get_dummies(df1['RunwayNumber'], prefix = 'RunwayNumber')
dfDummiesAirline = pd.get_dummies(df1['Airline'], prefix = 'Airline')
dfDummiesShipmentWeightCat = pd.get_dummies(df1['ShipmentWeightCat'], prefix = 'ShipmentWeightCat')

In [25]:
df2 = pd.concat([df1, dfDummiesRunwayNumber, dfDummiesAirline, dfDummiesShipmentWeightCat], axis=1)

In [26]:
df2.shape

(8904, 38)

One column is missing because an Airline Company disapeared in the testing set (the CAR2 one). We need to add a column.

In [27]:
df2['Airline_CAR2']=0

In [28]:
df2.head()

Unnamed: 0,Airline,RunwayNumber,AWNDmean,PRCPmean,TAVGmean,WDF5mean,WSF2max,WSF5max,WT01max,WT02max,...,RunwayNumber_7.0,RunwayNumber_8.0,RunwayNumber_9.0,Airline_CAR1,Airline_CAR3,Airline_CAR4,ShipmentWeightCat_L,ShipmentWeightCat_M,ShipmentWeightCat_S,Airline_CAR2
0,CAR1,9.0,2.61,0.043,42.0,223.333333,8.9,13.0,1.0,0.0,...,0,0,1,1,0,0,0,1,0,0
1,CAR1,9.0,2.61,0.043,42.0,223.333333,8.9,13.0,1.0,0.0,...,0,0,1,1,0,0,1,0,0,0
2,CAR1,9.0,2.61,0.043,42.0,223.333333,8.9,13.0,1.0,0.0,...,0,0,1,1,0,0,0,1,0,0
3,CAR1,0.0,2.61,0.043,42.0,223.333333,8.9,13.0,1.0,0.0,...,0,0,0,1,0,0,1,0,0,0
4,CAR1,9.0,2.61,0.043,42.0,223.333333,8.9,13.0,1.0,0.0,...,0,0,1,1,0,0,1,0,0,0


## Creat input matrix and target columns

In [29]:
X = df2.drop(['TaxiTime', 'LogTaxiTime', 'RunwayNumber', 'Airline', 'ShipmentWeightCat'], axis=1)
y = df2['LogTaxiTime']

In [30]:
print(X.shape)
print(y.shape)

(8904, 34)
(8904,)


## Random Forest Regressor

Load the model

In [31]:
loaded_rf_model = pickle.load(open('finalized_random_forest_model.sav', 'rb'))

In [32]:
y_pred = np.exp(loaded_rf_model.predict(X))-1

In [33]:
y_pred[:10]

array([6.35591228, 6.3556139 , 6.47383181, 6.20197799, 6.42763827,
       6.01294258, 6.485661  , 6.69152525, 6.47098207, 6.47394647])

#### Compute RMSE score

In [34]:
np.sqrt(mean_squared_error(np.exp(y)-1, y_pred))

3.90080957127268

#### Compute accuracy +/- 3 and 5 minutes

In [35]:
accuracy_dataframe = pd.DataFrame(data={'y_pred':y_pred, 'y':np.exp(y)-1})

In [36]:
accuracy_dataframe['3MinutesRange'] = abs(accuracy_dataframe['y_pred'] - accuracy_dataframe['y']) <= 3
accuracy_dataframe['5MinutesRange'] = abs(accuracy_dataframe['y_pred'] - accuracy_dataframe['y']) <= 5

In [37]:
print(accuracy_dataframe['3MinutesRange'].value_counts(normalize=True) * 100)

print(accuracy_dataframe['5MinutesRange'].value_counts(normalize=True) * 100)

True     81.469003
False    18.530997
Name: 3MinutesRange, dtype: float64
True     93.025606
False     6.974394
Name: 5MinutesRange, dtype: float64


#### Compute 10% and 90% error

In [45]:
Error = (np.exp(y)-1 - y_pred)**2
print(np.sqrt(Error.quantile(0.1)))
print(np.sqrt(Error.quantile(0.9)))

4.123139054485239