In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [74]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [75]:
# Load the not-yet-encoded data from 'ftest_to_encode.csv' into the working frame.
df = pd.read_csv('ftest_to_encode.csv')

In [76]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns=['Unnamed: 0'])

In [77]:
### adding part of day

In [78]:
def daypart(x):
    """Map a numeric clock value to a named part of the day.

    Parameters
    ----------
    x : number
        Value compared against the inclusive bounds below
        (presumably an HHMM-style time such as 1430 -- confirm with the data).

    Returns
    -------
    str
        'dawn' (200-559), 'morning' (600-959), 'noon' (1000-1359 and
        1400-1759), 'evening' (1800-2259), or 'midnight' for any value
        that falls outside every band.
    """
    # Inclusive (low, high, label) bands, checked top to bottom.
    # NOTE(review): 1400-1759 carries the same 'noon' label as 1000-1359;
    # kept exactly as in the original -- confirm this is intended.
    bands = (
        (200, 559, 'dawn'),
        (600, 959, 'morning'),
        (1000, 1359, 'noon'),
        (1400, 1759, 'noon'),
        (1800, 2259, 'evening'),
    )
    for low, high, label in bands:
        if low <= x <= high:
            return label
    # Anything not matched above (including gaps between bands) is 'midnight'.
    return 'midnight'

In [79]:
# Label the scheduled departure and arrival times with their part of day.
for leg in ('dep', 'arr'):
    df[leg + '_day_part'] = df['crs_' + leg + '_time'].apply(daypart)

In [80]:
### adding day of week

In [81]:
# Parse the flight date column into pandas datetimes.
df['fl_date'] = pd.to_datetime(df['fl_date'])

In [82]:
# Derive the weekday name (e.g. 'Monday') from the flight date.
df = df.assign(day_of_week=df['fl_date'].dt.day_name())

In [83]:
# The raw scheduled times are no longer needed once the day-part labels exist.
df = df.drop(columns=['crs_dep_time', 'crs_arr_time'])

In [84]:
### encoding weather(type)

In [85]:
# One-hot encode 'Type_Mode' into 'Type_<value>' indicator columns.
df = pd.get_dummies(df, columns=['Type_Mode'], prefix='Type')

In [86]:
# Drop one indicator level ('Type_Storm'), presumably as the reference category.
df = df.drop(columns=['Type_Storm'])

In [87]:
### encoding day of week

In [88]:
# One-hot encode the weekday name into 'dow_<Day>' indicator columns.
df = pd.get_dummies(df, columns=['day_of_week'], prefix='dow')

In [89]:
# Drop one weekday indicator ('dow_Wednesday'), presumably as the reference category.
df = df.drop(columns=['dow_Wednesday'])

In [90]:
### encoding time of day

In [91]:
# One-hot encode the arrival day part into 'arr_<part>' indicator columns.
df = pd.get_dummies(df, columns=['arr_day_part'], prefix='arr')

In [92]:
# One-hot encode the departure day part into 'dep_<part>' indicator columns.
df = pd.get_dummies(df, columns=['dep_day_part'], prefix='dep')

In [93]:
# Drop the 'midnight' arrival indicator, presumably as the reference category.
df = df.drop(columns=['arr_midnight'])

In [94]:
# Drop the 'midnight' departure indicator, presumably as the reference category.
df = df.drop(columns=['dep_midnight'])

In [95]:
### encode market unique carrier

In [96]:
# One-hot encode the marketing carrier into 'muc_<code>' columns;
# drop_first=True omits the first level so it acts as the baseline.
df = pd.get_dummies(
    df,
    columns=['mkt_unique_carrier'],
    prefix='muc',
    drop_first=True,
)

In [97]:
### encode market_op_carrier_difference

In [98]:
# Turn the True/False flag into 1/0.
df['mkt_op_carrier_difference'] = df['mkt_op_carrier_difference'].replace({True: 1, False: 0})

In [99]:
### scale numerical values

In [100]:
from sklearn.preprocessing import StandardScaler

In [101]:
# Standardise the continuous features and move them to the end of the frame.
# NOTE(review): the scaler is fitted on this data set itself -- confirm this
# matches how the features were scaled when the model was trained.
numeric_cols = ['Passengers_Seat_Ratio', 'distance', 'Taxi_Holdup', 'crs_elapsed_time']

scaler = StandardScaler()
# A fresh 0..n-1 index keeps the row-wise concat below aligned.
df = df.reset_index(drop=True)
df_scaled = scaler.fit_transform(df[numeric_cols])
# fit_transform returns a bare array, so re-attach the column names here.
s_df = pd.DataFrame(df_scaled, columns=numeric_cols)
df = pd.concat([df.drop(columns=numeric_cols), s_df], axis=1)

In [102]:
### encoding origin and destination

In [103]:
from sklearn.feature_extraction import FeatureHasher

In [104]:
# Hash each origin city name into 10 numeric columns, origin_0 .. origin_9.
#
# FeatureHasher(input_type='string') expects every sample to be an iterable of
# string tokens. Passing the bare city-name strings only worked because each
# string was implicitly iterated character by character, and newer
# scikit-learn releases reject that input with a ValueError. Splitting each
# name into its characters explicitly keeps the hashed values unchanged while
# satisfying the documented input contract.
fh = FeatureHasher(n_features=10, input_type='string')
hashed_features = fh.fit_transform(df['origin_city_name'].map(list))
hashed_features = hashed_features.toarray()
# Name the columns at construction (the hasher yields exactly n_features
# columns, so there is no column 10) and reuse df's index so the concat
# lines up row for row.
hf = pd.DataFrame(
    hashed_features,
    columns=['origin_' + str(i) for i in range(hashed_features.shape[1])],
    index=df.index,
)
df = pd.concat([df, hf], axis=1)

In [105]:
# Hash each destination city name into 10 numeric columns, dest_0 .. dest_9.
#
# FeatureHasher(input_type='string') expects every sample to be an iterable of
# string tokens. Passing the bare city-name strings only worked because each
# string was implicitly iterated character by character, and newer
# scikit-learn releases reject that input with a ValueError. Splitting each
# name into its characters explicitly keeps the hashed values unchanged while
# satisfying the documented input contract.
fh2 = FeatureHasher(n_features=10, input_type='string')
hashed_features2 = fh2.fit_transform(df['dest_city_name'].map(list))
hashed_features2 = hashed_features2.toarray()
# Name the columns at construction (the hasher yields exactly n_features
# columns, so there is no column 10) and reuse df's index so the concat
# lines up row for row.
hf2 = pd.DataFrame(
    hashed_features2,
    columns=['dest_' + str(i) for i in range(hashed_features2.shape[1])],
    index=df.index,
)
df = pd.concat([df, hf2], axis=1)

In [106]:
# Remove the raw date and city-name columns now that their derived features exist.
df = df.drop(columns=['fl_date', 'origin_city_name', 'dest_city_name'])

In [107]:
# Show the first row to check the encoded column layout.
df.head(1)

Unnamed: 0,mkt_op_carrier_difference,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,dow_Friday,dow_Monday,dow_Saturday,dow_Sunday,dow_Thursday,dow_Tuesday,arr_dawn,arr_evening,arr_morning,arr_noon,dep_dawn,dep_evening,dep_morning,dep_noon,muc_AS,muc_B6,muc_DL,muc_F9,muc_G4,muc_HA,muc_NK,muc_UA,muc_WN,Passengers_Seat_Ratio,distance,Taxi_Holdup,crs_elapsed_time,origin_0,origin_1,origin_2,origin_3,origin_4,origin_5,origin_6,origin_7,origin_8,origin_9,dest_0,dest_1,dest_2,dest_3,dest_4,dest_5,dest_6,dest_7,dest_8,dest_9
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,-0.221416,-0.764502,-0.243555,-1.013757,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0


### Based on Features Selected by XGBoost Model:

In [109]:
# Remove all carrier indicator columns (per the feature-selection heading above this cell).
carrier_codes = ['AS', 'B6', 'DL', 'F9', 'G4', 'HA', 'NK', 'UA', 'WN']
df = df.drop(columns=['muc_' + code for code in carrier_codes])

In [111]:
# Persist the encoded features to 'flight_test_encoded.csv'.
# NOTE(review): the row index is written as the first column (pandas default),
# so reading this file back will yield an extra unnamed column -- confirm
# downstream readers expect that.
df.to_csv('flight_test_encoded.csv')

### PREDICTED DELAYS USING FLIGHT_TEST

In [114]:
import pickle

In [119]:
# Load the previously saved model from disk; the context manager closes the
# file handle as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load model files that come from a trusted source.
with open('final_gxboost_model.sav', 'rb') as model_file:
    xg_model = pickle.load(model_file)

In [120]:
# Display the loaded model and its parameters.
xg_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=45, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0.57, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [121]:
# Run the loaded model on the encoded feature frame to get one prediction per row.
delay_pred = xg_model.predict(df)

In [122]:
# Preview the first rows of the feature frame that was passed to the model.
df.head()

Unnamed: 0,mkt_op_carrier_difference,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,dow_Friday,dow_Monday,dow_Saturday,dow_Sunday,dow_Thursday,dow_Tuesday,arr_dawn,arr_evening,arr_morning,arr_noon,dep_dawn,dep_evening,dep_morning,dep_noon,Passengers_Seat_Ratio,distance,Taxi_Holdup,crs_elapsed_time,origin_0,origin_1,origin_2,origin_3,origin_4,origin_5,origin_6,origin_7,origin_8,origin_9,dest_0,dest_1,dest_2,dest_3,dest_4,dest_5,dest_6,dest_7,dest_8,dest_9
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,-0.221416,-0.764502,-0.243555,-1.013757,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0
1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,-0.221416,-0.764502,0.216971,-0.942544,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-0.221416,-0.764502,1.300387,-0.871331,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0
3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,-0.221416,-0.764502,1.300387,-0.942544,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0
4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,-0.221416,-0.764502,-1.198826,-0.871331,1.0,-3.0,3.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,3.0,1.0,-1.0,0.0,0.0,0.0,-1.0,0.0


In [124]:
# Wrap the model output in a single-column frame named 'predicted_delay'.
flight_test_predict_pd = pd.DataFrame({'predicted_delay': delay_pred})

In [125]:
# Display the predictions (pandas truncates the rendering of long frames).
flight_test_predict_pd

Unnamed: 0,predicted_delay
0,-3.247165
1,-6.271799
2,10.167065
3,9.698260
4,-5.604270
...,...
244055,75.217621
244056,43.647591
244057,4.482183
244058,51.418526
