In [None]:
#  Using DictVectorizer, we enforce one-hot encoding on arrival cities,
#      arrival airports, departure cities and departure airports
def PreProc(df_set):
    from sklearn import feature_extraction
    dvec = feature_extraction.DictVectorizer()
    cols = ['Departure','CityDeparture','Arrival','CityArrival']
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    # one-hot encoded data replaces pre-existing columns
    vecData = pd.DataFrame(dvec.fit_transform(df_train[cols].apply(mkdict,axis = 1)).toarray())
    vecData.columns = dvec.get_feature_names()
    vecData.index = df_train.index
    df_set = pd.concat([df_set.drop(cols,axis = 1),vecData],axis = 1)
    df_set = DateExtraColumns(df_set)
    df_train = df_train.drop('DateOfDeparture', axis = 1)
    return(df_set)

In [None]:
# new column names are listed below
# print(dvec.get_feature_names())
#vecData.columns

In [None]:
# Here, we examine the possibility of the dates having an impact on variable PAX.
# In the next few functions,we analyze departure dates from different perspectives

def DateExtraColumns(df_set):
    df_set
    dates = df_set['DateOfDeparture']
    date_lambda = lambda row: (datetime.datetime.strptime(row, "%Y-%m-%d"))
    df_dates = map(date_lambda,dates)
    df_set = DayOfWeek(df_set,df_dates)
    df_set = Seasons(df_set,df_dates)
    df_set = IsNearChristmas(df_set,df_dates)
    return(df_set)

In [None]:
# Firstly, we divide dates by day of the week (0 for Monday,1 for Tuesday etc.)
# We suspect more people should be travelling on end-of-the-week flights
def DayOfWeeek(df_set,df_dates):
    DOW_l = lambda row: (datetime.datetime.weekday(row))
    day_of_week = map(DOW_l, df_dates)
    day_of_week = pd.DataFrame(day_of_week)
    day_of_week.columns = ['Day of week']
    df_set = pd.concat([df_set,day_of_week],axis = 1)
    return(df_set)

In [None]:
# Next, we take into consideration what season trips take place in (Winter,Spring,Summer or Fall)
# by one-hot encoding them
def Seasons(df_set,df_dates):
    from sklearn.preprocessing import OneHotEncoder as ohe

    seasons = {
        1 : 0,
        2 : 0,
        3 : 1,
        4 : 1,
        5 : 1,
        6 : 2,
        7 : 2,
        8 : 2,
        9 : 3,
        10 : 3,
        11 : 3,
        12 : 0
    }
    date_by_season =  map(lambda x: seasons[x.month], df_dates)
    date_by_season = pd.DataFrame(date_by_season)
    date_by_season.columns = ['Season']
    enc = ohe(sparse=False)
    enc.fit(date_by_season)
    date_by_season = enc.transform(date_by_season)
    date_by_season = pd.DataFrame(date_by_season)
    date_by_season.columns = ['Winter','Spring','Summer','Fall']
    df_set = pd.concat([df_set, date_by_season],axis = 1)
    return(df_set)

In [None]:
# Another, more specific, time of year that we expect flights to be crowded
# would be around Christmas and New year. Therefore, we determine whether a flight
# took place between the 15th of December and the 10th of January (dates determined at random around
# the specified holidays)
def IsNearChristmas(df_set,df_dates):
    around_christmas = map(lambda x: (int)(((x.month == 12) and (x.day >= 15)) or
                          (x.month == 1 and x.day <= 10)), df_dates)
    around_christmas
    around_christmas = pd.DataFrame(around_christmas)
    around_christmas.columns = ['IsAroundChristmas']
    df_set = pd.concat([df_set, around_christmas], axis = 1)
    return(df_set)

In [None]:
def DecisionTreeClassifierEntropyTestOnly(train_set):
    from sklearn import tree
    from sklearn.cross_validation import train_test_split
    df_target = train_set['PAX']
    df_data = train_set.drop('PAX', axis = 1)
    df1_train,df1_test,df2_train,df2_test = train_test_split(df_data,df_target,test_size = 0.2,random_state = 42)
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt = dt.fit(df1_train,df2_train)
    from sklearn import metrics
    df2_pred = dt.predict(df1_test)
    #print "Accuracy:{0:.14f}".format(metrics.f1_score(df2_test,df2_pred)),"\n"
    return(df2_pred)

In [None]:
def DecisionTreeClassifierEntropy(train_set,test_set):
    from sklearn import tree
    from sklearn.cross_validation import train_test_split
    df_target = train_set['PAX']
    df_data = train_set.drop('PAX', axis = 1)
    df1_train,df1_test,df2_train,df2_test = train_test_split(df_data,df_target,test_size = 0.2,random_state = 42)
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt = dt.fit(df1_train,df2_train)
    from sklearn import metrics
    df2_pred = dt.predict(df1_test)
    #print "Accuracy:{0:.14f}".format(metrics.f1_score(df2_test,df2_pred)),"\n"
    return(df2_pred)

In [None]:
import pandas as pd
import numpy as np
from dateutil.parser import parse
import datetime
df_train = pd.read_csv('train.csv')
df_train = PreProc(df_train)

df_test = pd.read_csv('test.csv')
df_test = PrePreoc(df_test)

y_pred = DecisionTreeClassifierEntropy(df_train,df_test)
#np.savetxt('y_pred.txt', y_pred, fmt='%d')
y_pred = pd.DataFrame(y_pred)
y_pred.describe()

Unnamed: 0,DateOfDeparture,Departure,CityDeparture,LongitudeDeparture,LatitudeDeparture,Arrival,CityArrival,LongitudeArrival,LatitudeArrival,WeeksToDeparture,std_wtd
0,2012-10-21,DFW,Dallas-Fort Worth,32.896828,-97.037997,SFO,San Francisco,37.618972,-122.374889,14.6,11.575837
1,2012-09-13,LAX,Los Angeles,33.942536,-118.408075,ATL,Atlanta,33.636719,-84.428067,14.730769,13.364304
2,2012-09-04,ORD,Chicago,41.978603,-87.904842,IAH,Houston,29.984433,-95.341442,8.470588,5.885551
3,2012-08-13,DEN,Denver,39.861656,-104.673178,PHX,Phoenix,33.434278,-112.011583,8.2,6.292853
4,2012-09-10,ORD,Chicago,41.978603,-87.904842,SEA,Seattle,47.449,-122.309306,12.090909,9.138662


In [31]:

from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = 20)
X_train_fs = fs.fit_transform(df1_train,df2_train)

ValueError: Input X must be non-negative.

In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df_train['Departure'])
df_train['Departure'] = le.transform(df_train['Departure'])
df_train['Arrival'] = le.transform(df_train['Arrival'])

In [21]:
le.fit(df_train['CityDeparture'])
df_train['CityDeparture'] = le.transform(df_train['CityDeparture'])
df_train['CityArrival'] = le.transform(df_train['CityArrival'])

Unnamed: 0,Arrival=ATL,Arrival=BOS,Arrival=CLT,Arrival=DEN,Arrival=DFW,Arrival=DTW,Arrival=EWR,Arrival=IAH,Arrival=JFK,Arrival=LAS,...,Departure=PHL,Departure=PHX,Departure=SEA,Departure=SFO,LatitudeArrival,LatitudeDeparture,LongitudeArrival,LongitudeDeparture,WeeksToDeparture,std_wtd
count,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,7104.0,...,7104.0,7104.0,7104.0,7104.0,7134.0,7134.0,7134.0,7134.0,7134.0,7134.0
mean,0.11388,0.067849,0.01464,0.073198,0.064189,0.022523,0.045749,0.016329,0.030405,0.049409,...,0.0404,0.015203,0.031813,0.075591,-93.703171,-93.452752,37.674883,37.80124,11.447798,8.613036
std,0.317687,0.251504,0.120114,0.26048,0.245107,0.148386,0.208955,0.126746,0.171712,0.216735,...,0.196909,0.122367,0.175514,0.264362,17.511743,17.428464,4.704637,4.67581,2.791529,2.142945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-122.374889,-122.374889,25.79325,25.79325,2.625,2.160247
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,-71.005181,-71.005181,47.449,47.449,21.933333,15.862216


In [22]:
le.fit(df_train['DateOfDeparture'])
df_train['DateOfDeparture'] = le.transform(df_train['DateOfDeparture'])

ValueError: invalid literal for float(): 2012-07-20

In [None]:
df_train = df_train.drop('DateOfDeparture', axis = 1)
from sklearn.cross_validation import train_test_split
df_target = df_train['PAX']
df_data = df_train.drop('PAX', axis = 1)
df1_train,df1_test,df2_train,df2_test = train_test_split(df_data,df_target,test_size = 0.2,random_state = 33)
print ('Ths manas sou')

In [None]:
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = 5)
X_train_fs = fs.fit_transform(df1_train,df2_train)

NameError: name 'df_train' is not defined