In [11]:
import pandas as pd
import numpy as np
from dateutil.parser import parse
import datetime
df_train = pd.read_csv('train.csv')
df_train.head()



Unnamed: 0,DateOfDeparture,Departure,CityDeparture,LongitudeDeparture,LatitudeDeparture,Arrival,CityArrival,LongitudeArrival,LatitudeArrival,WeeksToDeparture,std_wtd,PAX
0,2011-12-05,ORD,Chicago,41.978603,-87.904842,EWR,Newark,40.6925,-74.168667,8.352941,5.667243,7
1,2013-03-01,ATL,Atlanta,33.636719,-84.428067,LGA,New York,40.777245,-73.872608,10.421053,10.001754,7
2,2012-02-06,ORD,Chicago,41.978603,-87.904842,BOS,Boston,42.364347,-71.005181,9.25,7.136821,7
3,2012-12-10,ORD,Chicago,41.978603,-87.904842,SFO,San Francisco,37.618972,-122.374889,8.666667,7.404291,7
4,2012-04-13,SFO,San Francisco,37.618972,-122.374889,JFK,New York,40.639751,-73.778925,14.037037,9.858544,7


In [12]:
#  Using DictVectorizer, we enforce one-hot encoding on arrival cities,
#      arrival airports, departure cities and departure airports
from sklearn import feature_extraction
dvec = feature_extraction.DictVectorizer()
cols = ['Departure','CityDeparture','Arrival','CityArrival']
mkdict = lambda row: dict((col, row[col]) for col in cols)

In [13]:
# one-hot encoded data replaces pre-existing columns
vecData = pd.DataFrame(dvec.fit_transform(df_train[cols].apply(mkdict,axis = 1)).toarray())
vecData.columns = dvec.get_feature_names()
vecData.index = df_train.index
vecData = vecData.append(df_train.drop(cols,axis = 1))

In [14]:
# new column names are listed below
print(dvec.get_feature_names())

['Arrival=ATL', 'Arrival=BOS', 'Arrival=CLT', 'Arrival=DEN', 'Arrival=DFW', 'Arrival=DTW', 'Arrival=EWR', 'Arrival=IAH', 'Arrival=JFK', 'Arrival=LAS', 'Arrival=LAX', 'Arrival=LGA', 'Arrival=MCO', 'Arrival=MIA', 'Arrival=MSP', 'Arrival=ORD', 'Arrival=PHL', 'Arrival=PHX', 'Arrival=SEA', 'Arrival=SFO', 'CityArrival=Atlanta', 'CityArrival=Boston', 'CityArrival=Charlotte', 'CityArrival=Chicago', 'CityArrival=Dallas-Fort Worth', 'CityArrival=Denver', 'CityArrival=Detroit', 'CityArrival=Houston', 'CityArrival=Las Vegas', 'CityArrival=Los Angeles', 'CityArrival=Miami', 'CityArrival=Minneapolis', 'CityArrival=New York', 'CityArrival=Newark', 'CityArrival=Orlando', 'CityArrival=Philadelphia', 'CityArrival=Phoenix', 'CityArrival=San Francisco', 'CityArrival=Seattle', 'CityDeparture=Atlanta', 'CityDeparture=Boston', 'CityDeparture=Charlotte', 'CityDeparture=Chicago', 'CityDeparture=Dallas-Fort Worth', 'CityDeparture=Denver', 'CityDeparture=Detroit', 'CityDeparture=Houston', 'CityDeparture=Las Vega

In [15]:
vecData.columns

Index([u'Arrival=ATL', u'Arrival=BOS', u'Arrival=CLT', u'Arrival=DEN',
       u'Arrival=DFW', u'Arrival=DTW', u'Arrival=EWR', u'Arrival=IAH',
       u'Arrival=JFK', u'Arrival=LAS', u'Arrival=LAX', u'Arrival=LGA',
       u'Arrival=MCO', u'Arrival=MIA', u'Arrival=MSP', u'Arrival=ORD',
       u'Arrival=PHL', u'Arrival=PHX', u'Arrival=SEA', u'Arrival=SFO',
       u'CityArrival=Atlanta', u'CityArrival=Boston', u'CityArrival=Charlotte',
       u'CityArrival=Chicago', u'CityArrival=Dallas-Fort Worth',
       u'CityArrival=Denver', u'CityArrival=Detroit', u'CityArrival=Houston',
       u'CityArrival=Las Vegas', u'CityArrival=Los Angeles',
       u'CityArrival=Miami', u'CityArrival=Minneapolis',
       u'CityArrival=New York', u'CityArrival=Newark', u'CityArrival=Orlando',
       u'CityArrival=Philadelphia', u'CityArrival=Phoenix',
       u'CityArrival=San Francisco', u'CityArrival=Seattle',
       u'CityDeparture=Atlanta', u'CityDeparture=Boston',
       u'CityDeparture=Charlotte', u'CityDepar

In [22]:
# Here, we examine the possibility of the dates having an impact on variable PAX.
# In the next few commands,we analyze departure dates from different perspectives
# Firstly, we divide dates by day of the week (0 for Monday,1 for Tuesday etc.)
# We suspect more people should be travelling on end-of-the-week flights
dates = df_train['DateOfDeparture']
date_lambda = lambda row: (datetime.datetime.strptime(row, "%Y-%m-%d"))
df_dates = map(date_lambda,dates)
DOW_l = lambda row: (datetime.datetime.weekday(row))
day_of_week = map(DOW_l, df_dates)
day_of_week = pd.DataFrame(day_of_week)
day_of_week.columns = ['Day of week']
vecData.append(day_of_week)
day_of_week.describe()

Unnamed: 0,Day of week
count,8899.0
mean,2.995955
std,1.998028
min,0.0
25%,1.0
50%,3.0
75%,5.0
max,6.0


In [17]:
# Next, we take into consideration what season trips take place in (Winter,Spring,Summer or Fall)
# by one-hot encoding them
from sklearn.preprocessing import OneHotEncoder as ohe

seasons = {
    1 : 0,
    2 : 0,
    3 : 1,
    4 : 1,
    5 : 1,
    6 : 2,
    7 : 2,
    8 : 2,
    9 : 3,
    10 : 3,
    11 : 3,
    12 : 0
}
date_by_season =  map(lambda x: seasons[x.month], df_dates)
date_by_season = pd.DataFrame(date_by_season)
date_by_season.columns = ['Season']
enc = ohe(sparse=False)
enc.fit(date_by_season)
date_by_season = enc.transform(date_by_season)
date_by_season = pd.DataFrame(date_by_season)
date_by_season.columns = ['Winter','Spring','Summer','Fall']
vecData.append(date_by_season)
date_by_season.describe()

Unnamed: 0,Winter,Spring,Summer,Fall
count,8899.0,8899.0,8899.0,8899.0
mean,0.328014,0.174402,0.161479,0.336105
std,0.469516,0.379476,0.367993,0.472402
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0


In [18]:
# Another, more specific, time of year that we expect flights to be crowded
# would be around Christmas and New year. Therefore, we determine whether a flight
# took place between the 15th of December and the 10th of January (dates determined at random around
# the specified holidays)

around_christmas = map(lambda x: (int)(((x.month == 12) and (x.day >= 15)) or
                      (x.month == 1 and x.day <= 10)), df_dates)
around_christmas
around_christmas = pd.DataFrame(around_christmas)
around_christmas.columns = ['IsAroundChristmas']
around_christmas.describe()

Unnamed: 0,IsAroundChristmas
count,8899.0
mean,0.099562
std,0.299432
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [24]:
vecData = vecData.drop('DateOfDeparture', axis = 1)
vecData.describe()

Unnamed: 0,Arrival=ATL,Arrival=BOS,Arrival=CLT,Arrival=DEN,Arrival=DFW,Arrival=DTW,Arrival=EWR,Arrival=IAH,Arrival=JFK,Arrival=LAS,...,Departure=PHX,Departure=SEA,Departure=SFO,LatitudeArrival,LatitudeDeparture,LongitudeArrival,LongitudeDeparture,PAX,WeeksToDeparture,std_wtd
count,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,...,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0,8899.0
mean,0.112485,0.066524,0.014833,0.072817,0.064502,0.022474,0.044275,0.014945,0.032138,0.048657,...,0.014833,0.032138,0.077537,-93.67779,-93.742886,37.736862,37.808886,3.645915,11.459248,8.62538
std,0.315979,0.24921,0.120891,0.259851,0.245658,0.148229,0.205716,0.121342,0.176378,0.215162,...,0.120891,0.176378,0.267456,17.498414,17.447421,4.704364,4.66504,2.492679,2.79787,2.139732
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-122.374889,-122.374889,25.79325,25.79325,0.0,2.625,2.160247
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,-71.005181,-71.005181,47.449,47.449,7.0,21.933333,15.862216


In [20]:
from sklearn.cross_validation import train_test_split
df_target = vecData['PAX']
df_data = vecData.drop('PAX', axis = 1)
df1_train,df1_test,df2_train,df2_test = train_test_split(df_data,df_target,test_size = 0.2,random_state = 33)

In [21]:
df1_train.describe()

Unnamed: 0,Arrival=ATL,Arrival=BOS,Arrival=CLT,Arrival=DEN,Arrival=DFW,Arrival=DTW,Arrival=EWR,Arrival=IAH,Arrival=JFK,Arrival=LAS,...,Departure=PHL,Departure=PHX,Departure=SEA,Departure=SFO,LatitudeArrival,LatitudeDeparture,LongitudeArrival,LongitudeDeparture,WeeksToDeparture,std_wtd
count,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,...,7104.0,7104.0,7104.0,7104.0,7134.0,7134.0,7134.0,7134.0,7134.0,7134.0
mean,0.11388,0.067849,0.01464,0.073198,0.064189,0.022523,0.045749,0.016329,0.030405,0.049409,...,0.0404,0.015203,0.031813,0.075591,-93.703171,-93.452752,37.674883,37.80124,11.447798,8.613036
std,0.317687,0.251504,0.120114,0.26048,0.245107,0.148386,0.208955,0.126746,0.171712,0.216735,...,0.196909,0.122367,0.175514,0.264362,17.511743,17.428464,4.704637,4.67581,2.791529,2.142945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-122.374889,-122.374889,25.79325,25.79325,2.625,2.160247
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,-71.005181,-71.005181,47.449,47.449,21.933333,15.862216


In [22]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(df1_train,df2_train)
from sklearn import metrics
y_pred = dt.predict(df1_test)
print "Accuracy:{0:.3f}".format(metrics.f1-score(df2_test,df2_pred)),"\n"

ValueError: invalid literal for float(): 2012-07-20

In [None]:

from sklearn import feature_selection
fs = feature_selection.Select_Percentile(feature_selection.chi2, percentile = 5
X_train_fs = fs.fit_transform(df1_train,df2_train)

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df_train['Departure'])
df_train['Departure'] = le.transform(df_train['Departure'])
df_train['Arrival'] = le.transform(df_train['Arrival'])

In [3]:
le.fit(df_train['CityDeparture'])
df_train['CityDeparture'] = le.transform(df_train['CityDeparture'])
df_train['CityArrival'] = le.transform(df_train['CityArrival'])

In [4]:
le.fit(df_train['DateOfDeparture'])
df_train['DateOfDeparture'] = le.transform(df_train['DateOfDeparture'])

In [None]:
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = 5)
X_train_fs = fs.fit_transform(df1_train,df2_train)

NameError: name 'df_train' is not defined