# Assignment
Predict how late flights will be. A flight only counts as late if it is more than 30 minutes late.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
pd.options.display.float_format = '{:.3f}'.format
from sklearn.model_selection import cross_val_score
# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
from sklearn import ensemble

In [2]:
# Down-sample the raw CSV: copy every 20th line into a smaller file.
# Line 0 (the first line of the input, which pd.read_csv later uses as the
# header row) always satisfies the modulo test and is therefore kept.
with open("Airlines2008short.csv", 'w') as output_file, \
        open("Airlines2008.csv", 'r') as input_file:
    # Iterating the file object streams it one line at a time.
    for counter, line in enumerate(input_file):
        if counter % 20 == 0:
            output_file.write(line)

In [3]:
# Load only the columns used below; the callable filter silently ignores any
# of these names that are absent from the file.
wanted_columns = ['Month', 'DayOfWeek', 'CRSDepTime', 'ArrDelay']
airlines2008 = pd.read_csv(
    'Airlines2008short.csv',
    usecols=lambda name: name in wanted_columns,
)

In [4]:
# Preview the first rows of the loaded columns.
airlines2008.head()

Unnamed: 0,Month,DayOfWeek,CRSDepTime,ArrDelay
0,1,4,1325,37.0
1,1,4,1355,11.0
2,1,4,1535,-7.0
3,1,4,1445,43.0
4,1,4,1715,52.0


In [5]:
def process_frame(df):
    """Build the modelling frame from the raw flight columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CRSDepTime', 'ArrDelay', 'Month' and 'DayOfWeek'.
        The frame passed in is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with rows containing any missing value removed and with:
        * 'MorningDep'  - 1 when 700 <= CRSDepTime <= 900, else 0
        * 'EveningDep'  - 1 when 1700 <= CRSDepTime <= 1900, else 0
        * 'Net_Late'    - ArrDelay minus 30; negative values mean the flight
                          was less than 30 minutes late (not clipped at zero)
        * one indicator column per distinct 'Month' and 'DayOfWeek' value,
          replacing those two columns.
    """
    # dropna() returns a new object, and .copy() makes that explicit, so the
    # column assignments below never touch the caller's frame. Dropping first
    # is equivalent to dropping after the flags are added: the flag columns
    # themselves never contain missing values.
    out = df.dropna().copy()

    dep_time = out['CRSDepTime']
    out['MorningDep'] = ((dep_time >= 700) & (dep_time <= 900)).astype(int)
    out['EveningDep'] = ((dep_time >= 1700) & (dep_time <= 1900)).astype(int)

    # Shift by the 30-minute threshold used in the assignment statement.
    out['Net_Late'] = out['ArrDelay'] - 30

    return pd.get_dummies(out, columns=['Month', 'DayOfWeek'])

In [6]:
# Engineer the departure-window flags and Net_Late target, and one-hot encode
# Month / DayOfWeek.
new_df=process_frame(airlines2008)

In [7]:
# (rows, columns) of the processed frame.
new_df.shape

(342542, 24)

In [8]:
# Rows at even positions (0, 2, 4, ...) form the training half.
training_set = new_df.iloc[0::2].copy()

In [9]:
# Rows at odd positions (1, 3, 5, ...) form the held-out test half.
test_set = new_df.iloc[1::2].copy()

In [10]:
# List the column names of the processed frame.
new_df.columns

Index(['CRSDepTime', 'ArrDelay', 'MorningDep', 'EveningDep', 'Net_Late',
       'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
       'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
       'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4',
       'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7'],
      dtype='object')

In [11]:
# Predictors = every column except the raw delay and the derived target.
# Excluding those two columns (rather than listing each indicator column by
# name) yields the same columns in the same order, and keeps working if a
# Month or DayOfWeek value happens to be absent from the sampled data.
air_training_features = training_set.drop(['ArrDelay', 'Net_Late'], axis=1)

In [12]:
# Target for training, kept as a single-column DataFrame (not a Series).
air_training_target = training_set.loc[:, ['Net_Late']]

In [13]:
# Predictors = every column except the raw delay and the derived target.
# Excluding those two columns (rather than listing each indicator column by
# name) yields the same columns in the same order, and keeps working if a
# Month or DayOfWeek value happens to be absent from the sampled data.
air_test_features = test_set.drop(['ArrDelay', 'Net_Late'], axis=1)

In [14]:
# Target for evaluation, kept as a single-column DataFrame (not a Series).
air_test_target = test_set.loc[:, ['Net_Late']]

In [15]:
# Baseline: ordinary least-squares regression of Net_Late on the features.
regr = linear_model.LinearRegression()
regr.fit(air_training_features, air_training_target)

# Fitted parameters.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# Compare the score on the data the model was fitted to against the score on
# the held-out rows.
train_r2 = regr.score(air_training_features, air_training_target)
test_r2 = regr.score(air_test_features, air_test_target)
print('R squared of model on training data: \n', train_r2)
print('R squared of model on test data: \n', test_r2)

Coefficients: 
 [[  8.26963848e-03  -1.62940136e+00   3.02064820e+00   1.72863187e+00
    5.04501171e+00   3.39257099e+00  -1.36088976e+00  -2.32753808e+00
    4.58202629e+00   1.43433708e+00  -9.74936866e-01  -7.39920177e+00
   -7.70933901e+00  -5.44828546e+00   9.03761301e+00   6.65468141e-02
   -5.93252814e-01  -1.28708341e+00   2.16688516e-01   2.81170392e+00
   -2.09282098e+00   8.78217958e-01]]
Intercept: 
 [-33.1392222]
R squared of model on training data: 
 0.0329061583735
R squared of model on test data: 
 0.0306132770889


In [16]:
# Random forest with 20 trees, each limited to depth 10.
# random_state is fixed so that the bootstrap samples and feature draws, and
# therefore the scores printed below, are the same on every run.
regr = ensemble.RandomForestRegressor(n_estimators=20, max_depth=10, random_state=42)

# Fit our model to our training data.
# The forest expects a 1-D target, so the single-column frame is flattened.
regr.fit(air_training_features, air_training_target.values.ravel())


# Performance on the training data:
print('R squared of model on training data: \n',regr.score(air_training_features,air_training_target)) 

# Performance on the test data:
print('R squared of model on test data: \n',regr.score(air_test_features,air_test_target))

R squared of model on training data: 
 0.0634488751153
R squared of model on test data: 
 0.0319003966567
