In [1]:
import xgboost as xgb

# Restore the previously saved booster from disk so it can be used for inference.
MODEL_PATH = "my_xgb_model.model"
loaded_model = xgb.Booster()
loaded_model.load_model(MODEL_PATH)

In [2]:
import pandas as pd

In [3]:
# Read the flights test table from a CSV located in the working directory.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the column
# listing further down), which looks like a row index saved by an earlier export — confirm.
TEST_CSV_PATH = 'df_flights_test.csv'
df_flights_test = pd.read_csv(TEST_CSV_PATH)

In [4]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df_flights_test.shape

(300000, 21)

In [5]:
# Row count per distinct `mkt_unique_carrier` value, before any encoding.
df_flights_test['mkt_unique_carrier'].value_counts()

AA    78777
DL    63728
UA    58920
WN    49622
AS    16643
B6    11175
NK     8006
F9     5800
G4     3766
HA     3563
Name: mkt_unique_carrier, dtype: int64

In [6]:
# Replace the carrier strings with integer labels. sort=True numbers the distinct
# values in sorted order, so the mapping depends only on which values occur here.
# NOTE(review): the labels are derived from this file alone; they line up with the
# encoding used when the model was trained only if the same carriers were present — confirm.
carrier_codes = pd.factorize(df_flights_test['mkt_unique_carrier'], sort=True)[0]
df_flights_test['mkt_unique_carrier'] = carrier_codes

In [7]:
# Row count per integer label after encoding; the totals should match the
# per-carrier counts seen before the conversion.
df_flights_test['mkt_unique_carrier'].value_counts()

0    78777
3    63728
8    58920
9    49622
1    16643
2    11175
7     8006
4     5800
5     3766
6     3563
Name: mkt_unique_carrier, dtype: int64

In [8]:
# Peek at the first two rows to confirm the carrier column now holds integers.
df_flights_test.head(2)

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,0,2020-01-01,9,WN,WN,5888,WN,N951WN,5888,13891,...,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,1,2020-01-01,9,WN,WN,6276,WN,N467WN,6276,13891,...,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363


In [9]:
# Parse the flight date once, store it back as a datetime column, and derive the
# day of the week from it (pandas numbers Monday as 0 through Sunday as 6).
flight_dates = pd.to_datetime(df_flights_test['fl_date'])
df_flights_test['fl_date'] = flight_dates
df_flights_test['weekday'] = flight_dates.dt.weekday
df_flights_test.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,weekday
0,0,2020-01-01,9,WN,WN,5888,WN,N951WN,5888,13891,...,14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363,2
1,1,2020-01-01,9,WN,WN,6276,WN,N467WN,6276,13891,...,14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363,2
2,2,2020-01-01,9,WN,WN,4598,WN,N7885A,4598,13891,...,14831,SJC,"San Jose, CA",2020,2130,N,70,1,333,2
3,3,2020-01-01,9,WN,WN,4761,WN,N551WN,4761,13891,...,14831,SJC,"San Jose, CA",1340,1455,N,75,1,333,2
4,4,2020-01-01,9,WN,WN,5162,WN,N968WN,5162,13891,...,14831,SJC,"San Jose, CA",915,1035,N,80,1,333,2


In [10]:
# List every column name, including the newly added 'weekday'.
df_flights_test.columns

Index(['Unnamed: 0', 'fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'weekday'],
      dtype='object')

In [11]:
# Check column dtypes: anything still typed `object` is non-numeric and is not
# among the columns selected for the model below.
df_flights_test.dtypes

Unnamed: 0                     int64
fl_date               datetime64[ns]
mkt_unique_carrier             int64
branded_code_share            object
mkt_carrier                   object
mkt_carrier_fl_num             int64
op_unique_carrier             object
tail_num                      object
op_carrier_fl_num              int64
origin_airport_id              int64
origin                        object
origin_city_name              object
dest_airport_id                int64
dest                          object
dest_city_name                object
crs_dep_time                   int64
crs_arr_time                   int64
dup                           object
crs_elapsed_time               int64
flights                        int64
distance                       int64
weekday                        int64
dtype: object

In [12]:
# Columns handed to the model, in this order. 'weekday' is left out of the selection.
# NOTE(review): presumably this must mirror the feature set (and order) the booster
# was trained on — confirm.
feature_columns = [
    'mkt_unique_carrier',
    'mkt_carrier_fl_num',
    'origin_airport_id',
    'dest_airport_id',
    'crs_dep_time',
    'crs_arr_time',
    'crs_elapsed_time',
    'distance',
]
df_data = df_flights_test[feature_columns]

In [13]:
# Confirm exactly which columns ended up in the model input frame.
df_data.columns

Index(['mkt_unique_carrier', 'mkt_carrier_fl_num', 'origin_airport_id',
       'dest_airport_id', 'crs_dep_time', 'crs_arr_time', 'crs_elapsed_time',
       'distance'],
      dtype='object')

In [14]:
# Wrap the feature frame in a DMatrix, the input container that Booster.predict expects.
data_dmatrix = xgb.DMatrix(df_data)

In [15]:
# Display the object to confirm the DMatrix was constructed (only its repr is shown).
data_dmatrix

<xgboost.core.DMatrix at 0x1a837dc4370>

In [16]:
# Score every row of the DMatrix with the loaded booster. The result holds one
# prediction per input row and is displayed below as a float32 NumPy array.
predicts = loaded_model.predict(data_dmatrix)
predicts

array([  7.1981263,  -6.489672 ,  29.8308   , ...,   7.727312 ,
       -14.305009 ,  -8.031256 ], dtype=float32)

In [17]:
import numpy as np
# Average of all predictions, as a quick sanity check on the model output.
predicts.mean()

-0.7495863

In [18]:
# Largest and smallest predicted values. The array's own reductions run in
# compiled code instead of feeding every element through Python's max()/min(),
# and they give a well-defined result (NaN) if a prediction is ever NaN.
predicts.max(), predicts.min()

(143.4979, -27.318256)

In [21]:
# Store the predictions next to the rows they were computed from. This relies on
# `predicts` being in the same row order as df_flights_test (df_data is a plain
# column subset of it, so no rows were dropped or reordered).
# NOTE(review): the column name suggests the model outputs a delay; the units are
# not shown in this notebook — confirm.
df_flights_test['predicted_delay'] = predicts

In [22]:
# Inspect the frame with the new 'predicted_delay' column (pandas abbreviates the
# display to the first and last rows).
df_flights_test

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,weekday,predicted_delay
0,0,2020-01-01,9,WN,WN,5888,WN,N951WN,5888,13891,...,SFO,"San Francisco, CA",1810,1945,N,95,1,363,2,7.198126
1,1,2020-01-01,9,WN,WN,6276,WN,N467WN,6276,13891,...,SFO,"San Francisco, CA",1150,1320,N,90,1,363,2,-6.489672
2,2,2020-01-01,9,WN,WN,4598,WN,N7885A,4598,13891,...,SJC,"San Jose, CA",2020,2130,N,70,1,333,2,29.830799
3,3,2020-01-01,9,WN,WN,4761,WN,N551WN,4761,13891,...,SJC,"San Jose, CA",1340,1455,N,75,1,333,2,1.195062
4,4,2020-01-01,9,WN,WN,5162,WN,N968WN,5162,13891,...,SJC,"San Jose, CA",915,1035,N,80,1,333,2,3.401259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,299995,2020-01-15,6,HA,HA,38,HA,N205HA,38,13830,...,SAN,"San Diego, CA",1210,1935,N,325,1,2541,2,-9.940062
299996,299996,2020-01-15,6,HA,HA,39,HA,N213HA,39,14057,...,OGG,"Kahului, HI",810,1220,N,370,1,2562,2,-14.305009
299997,299997,2020-01-15,6,HA,HA,40,HA,N215HA,40,13830,...,PDX,"Portland, OR",1455,2220,N,325,1,2562,2,7.727312
299998,299998,2020-01-15,6,HA,HA,41,HA,N220HA,41,14771,...,OGG,"Kahului, HI",855,1240,N,345,1,2338,2,-14.305009


In [25]:
# Persist the test rows together with their predictions.
# index=False: the frame has the default 0..n-1 row index and already carries an
# 'Unnamed: 0' column from a previous export; writing the index again would add
# yet another redundant unnamed column to the output file.
df_flights_test.to_csv('df_flights_test_predicted.csv', index=False)