In [351]:
import datetime, warnings, scipy
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
import json
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit
warnings.filterwarnings("ignore")

In [311]:
def reload_file(fname):
    return pd.read_csv(fname)


fn_flight = '../data/flight_data.csv'
fn_passenger = '../data/passengers_sample.csv'
fn_fuel = '../data/fuel_consumption.csv'
fn_jan_flights = '../data/january_flights.csv'
fn_test_flights = '../data/test_flights.csv'
fn_semi_test = '../data/tester_flights.csv'
flight_df = pd.read_csv(fn_flight)
passenger_df = pd.read_csv(fn_passenger)
fuel_df = pd.read_csv(fn_fuel)
jan_flights = pd.read_csv(fn_jan_flights)
test_flights = pd.read_csv(fn_test_flights)
semi_test = pd.read_csv(fn_semi_test)

In [312]:
def parse_dates(df):
    """Takes string date parses to pd.datetime object"""
    try:
        df['fl_date'] = pd.to_datetime(df['fl_date'])
    except KeyError as err:
        print('Unsuccessful parsing...')
        raise KeyError(f'passed DataFrame does not have key {err}')
    return df


def format_time(series):
    """Formats time"""
    if pd.isnull(series):
        return np.nan
    else:
        if series == 2400:
            series = 0
        series = f"{int(series):04d}"
        hour = datetime.time(int(series[0:2]), int(series[2:4]))
        return hour


def combine_date_time(x):
    """Produces datetime.datetime object"""
    if pd.isnull(x[0]) or pd.isnull(x[1]):
        return pd.nan
    else:
        return datetime.datetime.combine(x[0], x[1])


# TODO look over to try avoid looping
def create_date_time(df, feature):
    lst = []
    for index, cols in df[['fl_date', feature]].iterrows():
        if pd.isnull(cols[1]):
            lst.append(np.nan)
        elif float(cols[1]) == 2400:
            cols[0] += datetime.timedelta(days=1)
            cols[1] = datetime.time(0,0)
            lst.append(combine_date_time(cols))
        else:
            cols[1] = format_time(cols[1])
            lst.append(combine_date_time(cols))
    return pd.Series(lst)

In [313]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

In [314]:
def get_flight_delays(df, airline, airport_id, extreme_values = False):
    df2 = df[(df['airline'] == airline) & (df['origin'] == airport_id)]
    # remove extreme values before fitting
    if extreme_values:
        df2['arr_delay'] = df2['arr_delay'].apply(lambda x: x if x < 60 else np.nan)
        df2.dropna(axis=0, inplace=True)
    # Conversion: datetime to time
    df2.sort_values('scheduled_dep', inplace = True)
    df2['departing_hour'] =  df2['scheduled_dep'].apply(lambda x:x.time())
    # grouping by time and calculating mean
    # print(airline, airport_id, 'right before unstacking')
    test2 = df2['arr_delay'].groupby(df2['departing_hour']).apply(get_stats).unstack()
    test2.reset_index(inplace=True)
    # converting time to seconds
    fct = lambda x:x.hour*3600+x.minute*60+x.second
    test2.reset_index(inplace=True)
    test2['departing_in_sec'] = test2['departing_hour'].apply(fct)
    return test2

In [315]:
def get_merged_delays(df, carrier):
    airports = df[df['airline'] == carrier]['origin'].unique()
    i = 0
    columns = ['origin', 'departing_in_sec', 'mean']
    for airport in airports:
        # ********************** SPECIAL CASE **************
        if (airport == 'MMH' and carrier == 'AS') or (airport=='LIT' and
        carrier == 'G4') or (airport=='ROA' and carrier == 'G4') or (airport=='SWF' and carrier == 'G4'):
            print('continue')
            continue
        # ************************************************
        df2 = get_flight_delays(df, carrier, airport, True)
        # **********************
        df2.loc[:, 'origin'] = airport
        df2 = df2[columns]
        df2.dropna(axis=0, inplace=True)
        if i == 0:
            merged_df = df2.copy()
        else:
            merged_df = pd.concat([merged_df, df2], ignore_index=True)
        i += 1
    return merged_df

In [428]:
def test_datasets_preparation(data):
    data = parse_dates(data)
    data['crs_dep_datetime'] = create_date_time(data, 'crs_dep_time')
    data['crs_dep_time'] = data['crs_dep_time'].apply(format_time)
    data['airline'] = data['mkt_unique_carrier']
    data = data[['crs_dep_datetime','airline','mkt_carrier_fl_num', 'origin','dest', 'crs_arr_time','crs_elapsed_time','distance']]
    return data

In [316]:
def datasets_preparation(data):
    data = parse_dates(data)
    data['crs_dep_datetime'] = create_date_time(data, 'crs_dep_time')
    data['crs_dep_time'] = data['crs_dep_time'].apply(format_time)
    data['crs_arr_time'] = data['crs_arr_time'].apply(format_time)
    data['arr_time'] = data['arr_time'].apply(format_time)

    data['airline'] = data['mkt_unique_carrier']
    data['scheduled_dep'] = data['crs_dep_datetime']

    data = data[['airline','tail_num','origin','dest', 'crs_dep_datetime','dep_delay', 'arr_delay', 'crs_arr_time','taxi_out','taxi_in','scheduled_dep','crs_elapsed_time','air_time','distance']]
    # data = data.dropna(inplace=True, axis=0)
    return data

In [317]:
def splitting_data(data, airlines):
    data = data[data[[*data.columns.tolist()]].notna()]
    datasets = {}
    i = 0
    for airline in airlines:
        i += 1
        # **********************
        merged_df = get_merged_delays(data, airline)
        # **********************
        merged_df = pd.get_dummies(merged_df, ['origin'])
        # –––––––––––––––––––––COLS TO KEEP––––––––––––––––––
        merged_df['distance'] = data['distance']
        merged_df['crs_elapsed_time'] = data['crs_elapsed_time']
        # –––––––––––––––––––––––––––––––––––––––––––––––––––
        x_not_scaled = merged_df.drop(['mean'], axis=1)
        y = merged_df['mean']
        # ––––––––––––––––––––––––Scaling–––––––––––––––––––––
        # x_scaled = scaler(x_not_scaled)
        # X = pd.DataFrame(x_scaled, columns=x_not_scaled.columns)
        # –––––––––––––––––––––––––––––––––––––––––––––––––––
        datasets[airline] = (x_not_scaled, y)
        print(f'Dataset for {airline} is ready: {i}/{len(airlines)}')
    data_records = {al:dfs[0].columns.tolist() for al, dfs in datasets.items()}
    with open('../data/data_records.json', 'w') as fp:
            json.dump(data_records, fp)
    return datasets

In [318]:
def splitting_test_data(data, airlines):
    data = data[data[[*data.columns.tolist()]].notna()]
    # data = data.dropna(inplace=True, axis=0)
    datasets = {}
    i = 0
    for airline in airlines:
        i += 1
        # **********************
        merged_df = get_merged_delays(data, airline)
        # **********************
        merged_df = pd.get_dummies(merged_df, ['origin'])
        # –––––––––––––––––––––COLS TO KEEP––––––––––––––––––
        merged_df['distance'] = data['distance']
        merged_df['crs_elapsed_time'] = data['crs_elapsed_time']
        # –––––––––––––––––––––––––––––––––––––––––––––––––––
        x_not_scaled = merged_df.drop(['mean'], axis=1)
        y = merged_df['mean']
        datasets[airline] = (x_not_scaled, y)
        print(f'Dataset for {airline} is ready: {i}/{len(airlines)}')
    return datasets

In [319]:
def scaler(df):
    scaler_ = MinMaxScaler()
    scaled = scaler_.fit_transform(df)
    return scaled

In [521]:
def trainer(datasets, airlines):
    models = {}
    for airline in airlines:
        x, y = datasets[airline]
        ## –––––––––––XGB–––––––––––
        # xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
        #                           max_depth = 5, alpha = 10, n_estimators = 10)
        # model = xg_reg.fit(x, y)
        # model_name = 'xgb'
        ## –––––––––––––––––––––––––
        # print(id(xg_reg))
        ## –––––––Linear Regressor––––––
        # regressor = linear_model.LinearRegression()
        # model = regressor.fit(x, y)
        # model_name = 'lin_reg'
        ## –––––––––––––––––––––––––
        # –––––––Random Forest Regressor––––––
        regressor = RandomForestRegressor()
        model = regressor.fit(x, y)
        model_name = 'rfr'
        # –––––––––––––––––––––––––
        # –––––––Random Forest Regressor––––––
        # regressor = SVR(kernel='poly')
        # model = regressor.fit(x, y)
        # model_name = 'svr'
        # –––––––––––––––––––––––––



        fn_name = '../data/' + airline + f'_{model_name}.sav'
        pickle.dump(model, open(fn_name, 'wb'))
        models[airline] = fn_name
        print(f'Model for airline {airline} is built.')
    return models

In [321]:
def pre_populate(data_predict, airlines):
    """Assuming Init Model seen all the columns"""
    with open('../data/data_records.json') as f:
        records = json.load(f)
    dict_df = {}
    for airline in airlines:
        x_cur, y = data_predict[airline]
        init_col = records[airline]
        curr_cols = x_cur.columns.to_list()
        missing_cols = set(init_col) - set(curr_cols)
        for col in missing_cols:
            x_cur[col] = 0
        x = x_cur.reindex(columns=init_col)
        dict_df[airline] = x, y
    return dict_df

In [322]:
def eval(models, data):
    results = {}
    for dataset in data:
        airline, x, y, = dataset
        model = models[airline]
        y_pred = model.predict(x)
        mse = f"MSE ={metrics.mean_squared_error(y_pred, y)}"
        mae = f"MAE ={metrics.mean_absolute_error(y_pred, y)}"
        r2 = f"R^2 ={metrics.r2_score(y_pred, y)}"
        results[airline] = (mse, mae, r2)
        print(mse)
        print(mae)
        print(r2)
    return results

In [323]:
training_data = reload_file(fn_semi_test)
semi_test = reload_file(fn_flight)

In [324]:
airlines = training_data['mkt_unique_carrier'].unique().tolist()

In [325]:
training_datasets = datasets_preparation(training_data)

In [442]:
training_datasets

Unnamed: 0,airline,tail_num,origin,dest,crs_dep_datetime,dep_delay,arr_delay,crs_arr_time,taxi_out,taxi_in,scheduled_dep,crs_elapsed_time,air_time,distance
0,HA,N488HA,HNL,ITO,2018-01-01 06:00:00,-10.0,2.0,06:53:00,21.0,7.0,2018-01-01 06:00:00,53.0,37.0,216.0
1,NK,N603NK,BWI,RSW,2018-01-01 09:00:00,-4.0,-7.0,11:40:00,12.0,3.0,2018-01-01 09:00:00,160.0,142.0,919.0
2,NK,N622NK,RSW,BWI,2018-01-01 19:44:00,18.0,2.0,22:07:00,10.0,5.0,2018-01-01 19:44:00,143.0,112.0,919.0
3,NK,N633NK,ACY,MYR,2018-01-01 07:15:00,-6.0,-9.0,08:48:00,9.0,3.0,2018-01-01 07:15:00,93.0,78.0,466.0
4,NK,N669NK,FLL,DTW,2018-01-01 20:30:00,-5.0,0.0,23:28:00,24.0,11.0,2018-01-01 20:30:00,178.0,148.0,1127.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634726,DL,N557NW,MCO,ATL,2019-07-31 07:45:00,-2.0,-5.0,09:23:00,22.0,14.0,2019-07-31 07:45:00,98.0,59.0,404.0
634727,DL,N382DN,ATL,LGA,2019-07-31 21:30:00,,,23:50:00,,,2019-07-31 21:30:00,140.0,,762.0
634728,DL,N925DN,ATL,IAH,2019-07-31 15:46:00,-1.0,-23.0,17:05:00,12.0,9.0,2019-07-31 15:46:00,139.0,96.0,689.0
634729,DL,N925DN,IAH,ATL,2019-07-31 17:49:00,46.0,35.0,21:02:00,14.0,11.0,2019-07-31 17:49:00,133.0,97.0,689.0


In [326]:
split_data_train = splitting_data(training_datasets, airlines)

Dataset for HA is ready: 1/11
Dataset for NK is ready: 2/11
Dataset for AA is ready: 3/11
Dataset for UA is ready: 4/11
continue
Dataset for AS is ready: 5/11
Dataset for DL is ready: 6/11
continue
continue
continue
Dataset for G4 is ready: 7/11
Dataset for WN is ready: 8/11
Dataset for VX is ready: 9/11
Dataset for B6 is ready: 10/11
Dataset for F9 is ready: 11/11


In [522]:
models = trainer(split_data_train, airlines)

Model for airline WN is built.
Model for airline AA is built.
Model for airline UA is built.
Model for airline HA is built.
Model for airline AS is built.
Model for airline NK is built.
Model for airline DL is built.
Model for airline B6 is built.
Model for airline F9 is built.
Model for airline G4 is built.


In [328]:
airlines1 = semi_test['mkt_unique_carrier'].unique().tolist()

In [329]:
ready_to_test = datasets_preparation(semi_test)

In [330]:
split_test_data = splitting_test_data(ready_to_test, airlines1)

Dataset for UA is ready: 1/11
Dataset for AA is ready: 2/11
Dataset for DL is ready: 3/11
Dataset for F9 is ready: 4/11
Dataset for NK is ready: 5/11
Dataset for VX is ready: 6/11
Dataset for WN is ready: 7/11
continue
Dataset for AS is ready: 8/11
continue
continue
continue
Dataset for G4 is ready: 9/11
Dataset for B6 is ready: 10/11
Dataset for HA is ready: 11/11


In [331]:
data_to_test = pre_populate(split_test_data, airlines)

### Evaluating Models

In [332]:
def get_results(model_name):
    print(f'–––––––––––––––––––{model_name} Predictions–––––––––––––––––––––')
    for airline in airlines:
        # load the model from disk
        print(airline)
        loaded_model = pickle.load(open(models[airline], 'rb'))
        X_test, y_test = data_to_test[airline]
        y_pred = loaded_model.predict(X_test)
        mse = f"MSE = {metrics.mean_squared_error(y_pred, y_test)}"
        mae = f"MAE = {metrics.mean_absolute_error(y_pred, y_test)}"
        r2 = f"R^2 = {metrics.r2_score(y_pred, y_test)}"
        print(f'******Results for {airline}*************')
        print(mse)
        print(mae)
        print(r2)
        print()

In [337]:
fn_models_xgb = {'HA': '../data/HA_xgb.sav',
                 'NK': '../data/NK_xgb.sav',
                 'AA': '../data/AA_xgb.sav',
                 'UA': '../data/UA_xgb.sav',
                 'AS': '../data/AS_xgb.sav',
                 'DL': '../data/DL_xgb.sav',
                 'G4': '../data/G4_xgb.sav',
                 'WN': '../data/WN_xgb.sav',
                 'VX': '../data/VX_xgb.sav',
                 'B6': '../data/B6_xgb.sav',
                 'F9': '../data/F9_xgb.sav'}
get_results('XGBOOST')

–––––––––––––––––––XGBOOST Predictions–––––––––––––––––––––
HA
******Results for HA*************
MSE = 121.2604418537729
MAE = 7.711917607137268
R^2 = -173.68141211772414

NK
******Results for NK*************
MSE = 263.5446634506673
MAE = 12.465441057393608
R^2 = -617.7390787818701

AA
******Results for AA*************
MSE = 256.1402648369833
MAE = 12.118718660841614
R^2 = -263.412991171914

UA
******Results for UA*************
MSE = 267.47374070554406
MAE = 12.39639967671234
R^2 = -558.1640168307237

AS
******Results for AS*************
MSE = 249.071197294509
MAE = 11.746317104683705
R^2 = -502.25796664921876

DL
******Results for DL*************
MSE = 232.66065229361698
MAE = 11.523182551207887
R^2 = -658.5775200678738

G4
******Results for G4*************
MSE = 318.1452921698196
MAE = 13.596668880857793
R^2 = -353.9382280572989

WN
******Results for WN*************
MSE = 188.27708442290654
MAE = 10.16939949420154
R^2 = -110.88272807160696

VX
******Results for VX*************
MSE = 

In [342]:
fn_models_lin_reg = {'HA': '../data/HA_linreg.sav',
                     'NK': '../data/NK_linreg.sav',
                     'AA': '../data/AA_linreg.sav',
                     'UA': '../data/UA_linreg.sav',
                     'AS': '../data/AS_linreg.sav',
                     'DL': '../data/DL_linreg.sav',
                     'G4': '../data/G4_linreg.sav',
                     'WN': '../data/WN_linreg.sav',
                     'VX': '../data/VX_linreg.sav',
                     'B6': '../data/B6_linreg.sav',
                     'F9': '../data/F9_linreg.sav'}
get_results('Linear Regression')

–––––––––––––––––––Linear Regression Predictions–––––––––––––––––––––
HA
******Results for HA*************
MSE = 123.26087156722743
MAE = 7.6861741912891555
R^2 = -29.614281251741385

NK
******Results for NK*************
MSE = 253.82190268346915
MAE = 11.857093081170477
R^2 = -27.252596399737612

AA
******Results for AA*************
MSE = 250.64809324903598
MAE = 11.790800602659244
R^2 = -24.30962120783478

UA
******Results for UA*************
MSE = 263.43685441800994
MAE = 12.044446996778946
R^2 = -42.08931641366809

AS
******Results for AS*************
MSE = 239.9488890764281
MAE = 11.355106098107585
R^2 = -19.70089482780543

DL
******Results for DL*************
MSE = 224.42657686423985
MAE = 10.884630870070565
R^2 = -39.47352861581824

G4
******Results for G4*************
MSE = 316.67167087707816
MAE = 13.30118482211203
R^2 = -14.904447145538839

WN
******Results for WN*************
MSE = 181.39307783840428
MAE = 9.740823128131675
R^2 = -10.05528115097552

VX
******Results for VX***

In [348]:
fn_models_rfr = {'HA': '../data/HA_rfr.sav',
                 'NK': '../data/NK_rfr.sav',
                 'AA': '../data/AA_rfr.sav',
                 'UA': '../data/UA_rfr.sav',
                 'AS': '../data/AS_rfr.sav',
                 'DL': '../data/DL_rfr.sav',
                 'G4': '../data/G4_rfr.sav',
                 'WN': '../data/WN_rfr.sav',
                 'VX': '../data/VX_rfr.sav',
                 'B6': '../data/B6_rfr.sav',
                 'F9': '../data/F9_rfr.sav'}
get_results('RandomForestRegressor')

–––––––––––––––––––RandomForestRegressor Predictions–––––––––––––––––––––
HA
******Results for HA*************
MSE = 117.7230412596377
MAE = 7.592602332653435
R^2 = -6.104765575621075

NK
******Results for NK*************
MSE = 266.76821254955627
MAE = 12.16856601040853
R^2 = -10.110609911012263

AA
******Results for AA*************
MSE = 264.33372027721435
MAE = 12.09297147036694
R^2 = -8.476420273203363

UA
******Results for UA*************
MSE = 277.66267153479276
MAE = 12.329773262676559
R^2 = -9.632467330630528

AS
******Results for AS*************
MSE = 241.87565551362812
MAE = 11.486633735119955
R^2 = -7.793706074571061

DL
******Results for DL*************
MSE = 238.163953008909
MAE = 11.262200596941746
R^2 = -10.807232721598112

G4
******Results for G4*************
MSE = 354.5572223171357
MAE = 13.926751645469794
R^2 = -6.351079736583477

WN
******Results for WN*************
MSE = 186.79515586053537
MAE = 9.915837756971822
R^2 = -6.739885306734721

VX
******Results for VX*****

In [418]:
fn_models_sav ={'HA': '../data/HA_svr.sav',
                'NK': '../data/NK_svr.sav',
                'AA': '../data/AA_svr.sav',
                'UA': '../data/UA_svr.sav',
                'AS': '../data/AS_svr.sav',
                'DL': '../data/DL_svr.sav',
                'G4': '../data/G4_svr.sav',
                'WN': '../data/WN_svr.sav',
                'VX': '../data/VX_svr.sav',
                'B6': '../data/B6_svr.sav',
                'F9': '../data/F9_svr.sav'}
get_results('Support Vector Machine')

–––––––––––––––––––Support Vector Machine Predictions–––––––––––––––––––––
HA
******Results for HA*************
MSE = 127.3141459834535
MAE = 7.604924093910531
R^2 = -13482.238825259483

NK
******Results for NK*************
MSE = 256.40861367556215
MAE = 11.572942866908363
R^2 = -225.25440511991877

AA
******Results for AA*************
MSE = 258.0318855761492
MAE = 11.768734634529613
R^2 = -95.91572397843207

UA
******Results for UA*************
MSE = 266.9858784943705
MAE = 11.905431101786764
R^2 = -394.35872740230747

AS
******Results for AS*************
MSE = 247.90150345419576
MAE = 11.46920453037563
R^2 = -8695.292589794739

DL
******Results for DL*************
MSE = 228.74397669067332
MAE = 10.840668085771117
R^2 = -272.4540780446128

G4
******Results for G4*************
MSE = 330.63518419968267
MAE = 13.057178583433616
R^2 = -370.2915850944284

WN
******Results for WN*************
MSE = 185.51835580639528
MAE = 9.811603404493717
R^2 = -14.639997970298833

VX
******Results for VX

### Final Evalutation

In [473]:
test_data = reload_file(fn_test_flights)

In [474]:
airlines = test_data['mkt_unique_carrier'].unique().tolist()

In [475]:
f_test_preparation = test_datasets_preparation(test_data)

In [476]:
first_7_days = pd.to_datetime('2020-01-08')

In [477]:
f_test_preparation = f_test_preparation[f_test_preparation['crs_dep_datetime'] < first_7_days]

In [508]:
# (time, airline, airtime, origin, distance)
def split(df, airlines):
    df['departing_hour'] = df['crs_dep_datetime'].apply(lambda c:c.time())
    df['dep_time_in_sec'] = df['departing_hour'].apply(lambda c:c.hour*3600+c.minute*60)
    datasets = {}
    for airline in airlines:
        df_predict = df[['dep_time_in_sec', 'crs_elapsed_time', 'origin', 'distance']]
        df_predict = pd.get_dummies(df_predict, ['origin'])
        df_submit = df[['crs_dep_datetime', 'airline', 'mkt_carrier_fl_num', 'origin', 'dest']] # predicted_delay
        datasets[airline] = df_predict, df_submit
    return datasets

In [515]:
def pre_populate(data_predict, airlines):
    """Assuming Init Model seen all the columns"""
    with open('../data/data_records.json') as f:
        records = json.load(f)
    dict_df = {}
    for airline in airlines:
        x_cur, df_submit = data_predict[airline]
        init_col = records[airline]
        curr_cols = x_cur.columns.to_list()
        missing_cols = set(init_col) - set(curr_cols)
        for col in missing_cols:
            x_cur[col] = 0
        x = x_cur.reindex(columns=init_col)
        dict_df[airline] = x, df_submit
    return dict_df

In [518]:
def get_results(df_dict, airlines):
    results = {}
    for airline in airlines:
        # load the model from disk
        loaded_model = pickle.load(open(models[airline], 'rb'))
        X_test, df_submit = df_dict[airline]
        y_pred = loaded_model.predict(X_test)
        print(f'Predicted for {airline}...')
        df_submit['predicted_delay'] = y_pred
        results[airline] = df_submit
    return results

In [512]:
test_dicts = split(f_test_preparation, airlines)

In [516]:
populated_df = pre_populate(test_dicts, airlines)

In [523]:
results = get_results(populated_df, airlines)

In [529]:
def combine_results(results_dict, airlines):
    to_concat = [results_dict[airline] for airline in airlines]
    return pd.concat(to_concat)

In [530]:
df_to_submit = combine_results(results, airlines)

In [531]:
df_to_submit

Unnamed: 0,crs_dep_datetime,airline,mkt_carrier_fl_num,origin,dest,predicted_delay
0,2020-01-01 18:10:00,WN,5888,ONT,SFO,-9.237199
1,2020-01-01 11:50:00,WN,6276,ONT,SFO,-10.738094
2,2020-01-01 20:20:00,WN,4598,ONT,SJC,-11.376552
3,2020-01-01 13:40:00,WN,4761,ONT,SJC,-12.360735
4,2020-01-01 09:15:00,WN,5162,ONT,SJC,-12.449402
...,...,...,...,...,...,...
150618,2020-01-07 17:55:00,DL,4813,DTW,JFK,-5.795025
150619,2020-01-07 06:00:00,DL,4814,GSP,LGA,-2.756883
150620,2020-01-07 17:15:00,DL,4815,ATL,XNA,-3.135142
150621,2020-01-07 18:51:00,DL,4815,XNA,ATL,-3.144292


In [532]:
df_to_submit.to_csv('../data/final_submission.csv', index=False)