In [48]:
import pandas as pd

df = pd.read_csv('../data/sea_all_years.csv')

In [49]:
import numpy as np

In [50]:
df = df.drop('Unnamed: 0', axis=1)

df = df.dropna(axis=0, subset=['ArrTime', 'ArrDelay', 'DepDelay', 'DepTime', 'ActualElapsedTime', 'CRSElapsedTime', 'Distance'])

# Datetime processing
dates = df.ix[:, ['Year', 'Month', 'DayofMonth']]
dates.columns = ['year', 'month', 'day']
dates = pd.to_datetime(dates, infer_datetime_format=True)
df['Date'] = dates

In [52]:
df.head(1).T

Unnamed: 0,0
ActualElapsedTime,147
AirTime,
ArrDelay,-4
ArrTime,926
CRSArrTime,930
CRSDepTime,700
CRSElapsedTime,150
CancellationCode,
Cancelled,0
CarrierDelay,


In [60]:
# Select flights leaving Seattle
df_origin_sea = df.ix[df['Origin'] == 'SEA', :]

# Alaska Airlines indicator
df_origin_sea['Alaska'] = (df_origin_sea['UniqueCarrier'] == 'AS').astype(int)

# Limit to after 2008-12-01 to eliminate runway effects
df_origin_sea_3rwy = df_origin_sea.ix[df_origin_sea['Date'] > '2008-12-01', :]

carrier_dummies = pd.get_dummies(df_origin_sea_3rwy.ix[:, ['UniqueCarrier']])
week_dummies = pd.get_dummies(df_origin_sea_3rwy.ix[:, ['DayOfWeek']].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [61]:
carrier_dummies.head(1)

Unnamed: 0,UniqueCarrier_AA,UniqueCarrier_AS,UniqueCarrier_B6,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_F9,UniqueCarrier_FL,UniqueCarrier_HA,UniqueCarrier_NK,UniqueCarrier_NW,UniqueCarrier_OO,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_VX,UniqueCarrier_WN
3916022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [62]:
week_dummies.head(4)

Unnamed: 0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7
3916022,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3916023,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3916024,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3916025,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [63]:
# Get model
df_model = pd.concat((df_origin_sea_3rwy, carrier_dummies), axis=1)
df_model = pd.concat((df_model, week_dummies), axis=1)

# Test


In [64]:
df_origin_sea_3rwy.shape

(824705, 33)

In [65]:
df_model.shape

(824705, 55)

In [67]:
df_model.columns.values

array(['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime',
       'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled',
       'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime',
       'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay',
       'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn',
       'TaxiOut', 'UniqueCarrier', 'Unnamed: 0.1', 'Unnamed: 0.1',
       'WeatherDelay', 'Year', 'Date', 'Alaska', 'UniqueCarrier_AA',
       'UniqueCarrier_AS', 'UniqueCarrier_B6', 'UniqueCarrier_CO',
       'UniqueCarrier_DL', 'UniqueCarrier_F9', 'UniqueCarrier_FL',
       'UniqueCarrier_HA', 'UniqueCarrier_NK', 'UniqueCarrier_NW',
       'UniqueCarrier_OO', 'UniqueCarrier_UA', 'UniqueCarrier_US',
       'UniqueCarrier_VX', 'UniqueCarrier_WN', 'DayOfWeek_1',
       'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4', 'DayOfWeek_5',
       'DayOfWeek_6', 'DayOfWeek_7'], dtype=object)

In [23]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 824705 entries, 3916022 to 5578481
Data columns (total 46 columns):
ActualElapsedTime    824705 non-null float64
AirTime              824705 non-null float64
ArrDelay             824705 non-null float64
ArrTime              824705 non-null float64
CRSArrTime           824705 non-null float64
CRSDepTime           824705 non-null float64
CRSElapsedTime       824705 non-null float64
CancellationCode     0 non-null object
Cancelled            824705 non-null float64
CarrierDelay         124582 non-null float64
DayOfWeek            824705 non-null int64
DayofMonth           824705 non-null int64
DepDelay             824705 non-null float64
DepTime              824705 non-null float64
Dest                 824705 non-null object
Distance             824705 non-null float64
Diverted             824705 non-null float64
FlightNum            824705 non-null int64
LateAircraftDelay    124582 non-null float64
Month                824705 non-null int

In [70]:
df_model['Delay_15_Indicator'] = (df_model['DepDelay'] >= 15).astype(int)

In [71]:
y = df_model.pop('Delay_15_Indicator')

In [75]:
cols_m1 =  ['AirTime', 
         'CRSDepTime', 
         'Distance', 
         'UniqueCarrier_AS',
         'UniqueCarrier_B6', 
         'UniqueCarrier_CO', 
         'UniqueCarrier_DL',
         'UniqueCarrier_F9', 
         'UniqueCarrier_FL', 
         'UniqueCarrier_HA',
         'UniqueCarrier_NK', 
         'UniqueCarrier_NW', 
         'UniqueCarrier_OO',
         'UniqueCarrier_UA', 
         'UniqueCarrier_US', 
         'UniqueCarrier_VX',
         'UniqueCarrier_WN', 
        'DayOfWeek_2', 
        'DayOfWeek_3', 
        'DayOfWeek_4', 
        'DayOfWeek_5',
        'DayOfWeek_6', 
        'DayOfWeek_7']

In [76]:
np.in1d(df_model.columns.values, cols_m1)

array([False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,  True], dtype=bool)

In [77]:
X = df_model.ix[:, np.in1d(df_model.columns.values, cols_m1)]

In [78]:
X

Unnamed: 0,AirTime,CRSDepTime,Distance,UniqueCarrier_AS,UniqueCarrier_B6,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_F9,UniqueCarrier_FL,UniqueCarrier_HA,...,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_VX,UniqueCarrier_WN,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7
3916022,137.0,700.0,1180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916023,135.0,1545.0,1180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916024,219.0,1205.0,1977.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916025,54.0,1215.0,399.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916026,56.0,2045.0,399.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916027,55.0,615.0,399.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916028,131.0,1740.0,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916029,122.0,820.0,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916030,130.0,1430.0,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3916031,35.0,1320.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
