In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the training flights, the test flights and the airports file.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the previous backslash separators only resolved on Windows.
data = pd.read_csv("data/flights_train.csv")
testData = pd.read_csv("data/flights_test.csv")
airportData = pd.read_csv("data/airports.csv", low_memory=False)

In [3]:
# NOTE: this binds a second name to the same DataFrame object, so the column
# edits below are visible through `data` as well.
testModelData = data
# Left-pad each scheduled time with zeros to a width of four characters, then
# keep the first two characters as the hour (departure first, then arrival).
for time_col, hour_col in (
    ('SCHEDULED_DEPARTURE', 'DEPARTURE_HOUR'),
    ('SCHEDULED_ARRIVAL', 'ARRIVAL_HOUR'),
):
    padded = testModelData[time_col].apply('{:0>4}'.format)
    testModelData[time_col] = padded
    testModelData[hour_col] = padded.str[:2]

In [4]:
# One-hot encode the categorical fields. The source columns are kept for now;
# the indicator columns are appended after them in the order listed here.
dummy_fields = ['ARRIVAL_HOUR', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']
indicator_frames = [
    pd.get_dummies(testModelData[field], prefix=field, drop_first=False)
    for field in dummy_fields
]
testModelData = pd.concat([testModelData, *indicator_frames], axis=1)
testModelData.head()

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,AIRLINE_EV,AIRLINE_F9,AIRLINE_HA,AIRLINE_MQ,AIRLINE_NK,AIRLINE_OO,AIRLINE_UA,AIRLINE_US,AIRLINE_VX,AIRLINE_WN
0,0,2015,1,1,4,AS,98,N407AS,ANC,SEA,...,0,0,0,0,0,0,0,0,0,0
1,1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,...,0,0,0,0,0,0,0,0,0,0
2,2,2015,1,1,4,US,840,N171US,SFO,CLT,...,0,0,0,0,0,0,0,1,0,0
3,3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,...,0,0,0,0,0,0,0,0,0,0
4,4,2015,1,1,4,AS,135,N527AS,SEA,ANC,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# The raw categorical columns are redundant once their indicator columns exist.
testModelData = testModelData.drop(columns=dummy_fields)

In [6]:
# Columns left out of the modelling frame.
# NOTE(review): `id` is not in this list, so it stays in the frame — confirm
# that it is meant to be kept.
excluded_columns = [
    'YEAR',
    'FLIGHT_NUMBER',
    'TAIL_NUMBER',
    'DEPARTURE_TIME',
    'TAXI_OUT',
    'AIR_TIME',
    'DISTANCE',
]
trainData3 = testModelData.drop(columns=excluded_columns)

In [7]:
# Separate the target column from the feature columns.
target_column = 'ARRIVAL_DELAY'
Y = trainData3[target_column]
X = trainData3.drop(columns=target_column)

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Show the shapes of the four pieces on one line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(2254842, 693) (563711, 693) (2254842,) (563711,)


In [8]:
# Fit a 10-tree random forest on the training split; n_jobs=-1 uses every
# available core and random_state=0 seeds the forest.
regressor = RandomForestRegressor(n_estimators=10, random_state=0, n_jobs=-1).fit(X_train, Y_train)
# Report R^2 on the training split, then on the held-out split.
for split_label, split_X, split_Y in (
    ('Train:', X_train, Y_train),
    ('Test:', X_test, Y_test),
):
    print(split_label, regressor.score(split_X, split_Y))

Train: 0.9916114018065509
Test: 0.9584787665700656


# TEST DATA

In [9]:

# Same hour features as for the training frame: left-pad each scheduled time
# with zeros to a width of four characters, then keep the first two characters
# as the hour (departure first, then arrival).
for time_col, hour_col in (
    ('SCHEDULED_DEPARTURE', 'DEPARTURE_HOUR'),
    ('SCHEDULED_ARRIVAL', 'ARRIVAL_HOUR'),
):
    padded = testData[time_col].apply('{:0>4}'.format)
    testData[time_col] = padded
    testData[hour_col] = padded.str[:2]

In [10]:
# One-hot encode the categorical fields of the test frame. The source columns
# are kept for now; the indicator columns are appended in the order listed here.
dummy_fields = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE', 'ARRIVAL_HOUR']
indicator_frames = [
    pd.get_dummies(testData[field], prefix=field, drop_first=False)
    for field in dummy_fields
]
testData = pd.concat([testData, *indicator_frames], axis=1)
testData.head()

Unnamed: 0,id,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,ARRIVAL_HOUR_14,ARRIVAL_HOUR_15,ARRIVAL_HOUR_16,ARRIVAL_HOUR_17,ARRIVAL_HOUR_18,ARRIVAL_HOUR_19,ARRIVAL_HOUR_20,ARRIVAL_HOUR_21,ARRIVAL_HOUR_22,ARRIVAL_HOUR_23
0,0,2015,7,1,3,NK,298,N624NK,LAS,IAH,...,0,0,0,0,0,0,0,0,0,0
1,1,2015,7,1,3,AA,1965,N507AY,SFO,CLT,...,0,0,0,0,0,0,0,0,0,0
2,2,2015,7,1,3,DL,2020,N3759,LAX,MEM,...,0,0,0,0,0,0,0,0,0,0
3,3,2015,7,1,3,NK,612,N629NK,LAS,MSP,...,0,0,0,0,0,0,0,0,0,0
4,4,2015,7,1,3,UA,1044,N39418,ANC,ORD,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Inspect the first 40 column names of the test frame.
testData.columns[:40]

Index(['id', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF',
       'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'SCHEDULED_ARRIVAL', 'DEPARTURE_HOUR', 'ARRIVAL_HOUR',
       'ORIGIN_AIRPORT_ABE', 'ORIGIN_AIRPORT_ABI', 'ORIGIN_AIRPORT_ABQ',
       'ORIGIN_AIRPORT_ABR', 'ORIGIN_AIRPORT_ABY', 'ORIGIN_AIRPORT_ACK',
       'ORIGIN_AIRPORT_ACT', 'ORIGIN_AIRPORT_ACV', 'ORIGIN_AIRPORT_ACY',
       'ORIGIN_AIRPORT_ADK', 'ORIGIN_AIRPORT_ADQ', 'ORIGIN_AIRPORT_AEX',
       'ORIGIN_AIRPORT_AGS', 'ORIGIN_AIRPORT_AKN', 'ORIGIN_AIRPORT_ALB',
       'ORIGIN_AIRPORT_ALO', 'ORIGIN_AIRPORT_AMA', 'ORIGIN_AIRPORT_ANC',
       'ORIGIN_AIRPORT_APN'],
      dtype='object')

In [12]:
# The raw categorical columns are redundant once their indicator columns exist.
testData = testData.drop(columns=dummy_fields)


In [13]:
# Drop the same columns that were excluded from the training frame.
# errors='ignore' lets this cell be re-run after the alignment below has
# already removed them.
testData = testData.drop(
    columns=['YEAR', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'DEPARTURE_TIME', 'TAXI_OUT', 'AIR_TIME', 'DISTANCE'],
    errors='ignore',
)

# Feature columns the model was fitted on that do not occur in the test frame.
missingCols = X_test.columns.difference(testData.columns)
# Columns that occur only in the test frame; the model has no feature for them.
extraCols = testData.columns.difference(X_test.columns)
print(len(missingCols), 'missing columns filled with 0;', len(extraCols), 'test-only columns dropped')

# Align the test frame to exactly the columns the model was fitted on, in the
# same order. Appending the missing columns at the end is not enough: the test
# dummies are generated in a different field order than the training ones, and
# test-only columns would otherwise remain, so the features handed to
# `predict` would not line up with the fitted ones.
testData = testData.reindex(columns=X_test.columns, fill_value=0)

## Debugging the test data

In [14]:
# List the test-frame columns whose dtype is still `object`.
test_is_object = testData.dtypes == 'object'
testData.columns[test_is_object]

Index(['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_HOUR'], dtype='object')

In [15]:
# List the training-frame columns whose dtype is still `object`, for comparison
# with the test frame.
train_is_object = trainData3.dtypes == 'object'
trainData3.columns[train_is_object]

Index(['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_HOUR'], dtype='object')

In [16]:
# Predict the arrival delay for every row of the prepared test frame.
finalPrediction = regressor.predict(testData)
# Key each prediction by the flight's own `id` rather than by row position, so
# the saved ids stay correct even if the test rows are not numbered 0..n-1 in
# file order.
finalDataset = pd.DataFrame(
    {'ARRIVAL_DELAY': finalPrediction},
    index=pd.Index(testData['id'].to_numpy(), name='id'),
)

In [17]:
# Save the predictions to prediction.csv in the working directory. The frame's
# index is written as the first column under the header 'id', followed by the
# ARRIVAL_DELAY column.
finalDataset.to_csv('prediction.csv', index_label='id')