# Airline OnTime
### predict whether arrival of flight will be delayed 

In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
#linear, ridge and lasso regression will be performed on the data set
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import Ridge, Lasso

import warnings
warnings.filterwarnings('ignore')

  from pandas.core import datetools


### load and explore data

In [2]:
df = pd.read_csv('AIRLINE_ONTIME.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75470 entries, 0 to 75469
Data columns (total 12 columns):
YEAR                     75470 non-null int64
UNIQUE_CARRIER           75470 non-null object
ORIGIN_AIRPORT_ID        75470 non-null int64
ORIGIN_AIRPORT_SEQ_ID    75470 non-null int64
ORIGIN_CITY_MARKET_ID    75470 non-null int64
DEST_AIRPORT_ID          75470 non-null int64
DEST_AIRPORT_SEQ_ID      75470 non-null int64
DEST_CITY_MARKET_ID      75470 non-null int64
DEP_DELAY                73742 non-null float64
ARR_DELAY                73428 non-null float64
CANCELLED                75470 non-null int64
DISTANCE                 75470 non-null int64
dtypes: float64(2), int64(9), object(1)
memory usage: 6.9+ MB


it appears that there are null values for departure and arrival delays

### clean and refine data
it appears that where both departure and arrival delay is both null, flight was cancelled
however there are still more null values in arrival delay in which there is a non-null value for departure delay and the flight was not cancelled, the assumption that can be made is that either the flight departed but did not reach it's destination (i.e. Malysia Airlines Flight 370) or it can be assumed that the flight arrived on time and a value was not put it
the latter will be assumed 

In [3]:
#change the null values of arrival delay in which the flight was not cancelled to 0
df.ARR_DELAY = np.where((df.ARR_DELAY.isnull()) & (df.CANCELLED == 0), 0, df.ARR_DELAY)

also since I am predicting the probability that arrival of a flight is delayed, cancelled flights never departed nor arrive so the flights that were cancelled were taken out

In [4]:
df1 = df[df.CANCELLED == 0]

### Feature Engineering
a weight was put on whether or not the flight had a delay for both departure and arrival

also whether or not the flight departed or arrived early was weighed as well

In [5]:
df1['ARR_ONTIME'] = df1['ARR_DELAY'].apply(lambda x: 1 if x <= 0 else 0)
df1['DEP_ONTIME'] = df1['DEP_DELAY'].apply(lambda x: 1 if x <= 0 else 0)
df1['ARR_EARLY'] = df1['ARR_DELAY'].apply(lambda x: 1 if x < 0 else 0)
df1['DEP_EARLY'] = df1['DEP_DELAY'].apply(lambda x: 1 if x < 0 else 0)

### define feature and target

In [6]:
x = df1[["DEP_DELAY", "DISTANCE", "ORIGIN_AIRPORT_ID", "ORIGIN_AIRPORT_SEQ_ID", "ORIGIN_CITY_MARKET_ID", "DEST_AIRPORT_ID", "DEST_AIRPORT_SEQ_ID", "DEST_CITY_MARKET_ID", 'ARR_ONTIME', 'DEP_ONTIME', 'ARR_EARLY', 'DEP_EARLY']]
y = df1["ARR_DELAY"]

### add constant and get dummy variables

In [7]:
x1 = sm.add_constant(x)
x1 = pd.get_dummies(x1, drop_first=True)

### split the data set into training and testing data sets

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x1, y, train_size=.817, random_state=2)

### perform linear regression

In [9]:
reg = linear_model.LinearRegression()
regmodel = reg.fit(x_train, y_train)
y_test_pred = regmodel.predict(x_test)
print "The RMSE:", (mean_squared_error(y_test, y_test_pred))**0.5 
print "The r-squared:", (r2_score(y_test, y_test_pred))

The RMSE: 12.5030420134
The r-squared: 0.952028640225


### perform ridge regression

In [10]:
reg = linear_model.Ridge(alpha=.4)
regmodel = reg.fit(x_train, y_train)
y_test_pred = regmodel.predict(x_test)
print "The RMSE:", (mean_squared_error(y_test, y_test_pred))**0.5 
print "The r-squared:",(r2_score(y_test, y_test_pred))

The RMSE: 12.5027779855
The r-squared: 0.952030666235


### perform lasso regression

In [11]:
lassoreg = linear_model.Lasso()
lassoreg.fit(x_train, y_train)
y_pred_lasso = lassoreg.predict(x_test)
print "The RMSE:", (mean_squared_error(y_test, y_test_pred))**0.5 
print "The r-squared:", (r2_score(y_test, y_test_pred))

The RMSE: 12.5027779855
The r-squared: 0.952030666235
