In [3]:
#  Using DictVectorizer, we enforce one-hot encoding on arrival cities,
#      arrival airports, departure cities and departure airports
def PreProc(df_set):
    """One-hot encode the airport/city columns and add date-derived features.

    Parameters
    ----------
    df_set : pandas.DataFrame
        Must contain the columns 'Departure', 'CityDeparture', 'Arrival',
        'CityArrival', 'DateOfDeparture', 'LatitudeDeparture' and
        'LatitudeArrival'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the four categorical columns are replaced by
        their one-hot encoded counterparts, 'DateOfDeparture' is replaced by
        the columns produced by ``DateExtraColumns``, and both latitude
        columns hold absolute values.
    """
    from sklearn import feature_extraction

    cols = ['Departure','CityDeparture','Arrival','CityArrival']

    # NOTE(review): the vectorizer is fitted on every call, so two different
    # inputs (e.g. train and test) only get identical one-hot columns when
    # they contain the same set of category values.
    dvec = feature_extraction.DictVectorizer()
    encoded = dvec.fit_transform(df_set[cols].to_dict('records')).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; older releases do
    # not have get_feature_names_out(), so support both.
    if hasattr(dvec, 'get_feature_names_out'):
        feature_names = list(dvec.get_feature_names_out())
    else:
        feature_names = dvec.get_feature_names()

    # Reuse the caller's index so the column-wise concat lines rows up.
    vecData = pd.DataFrame(encoded, index=df_set.index, columns=feature_names)

    # one-hot encoded data replaces pre-existing columns
    df_set = pd.concat([df_set.drop(cols, axis=1), vecData], axis=1)
    df_set = DateExtraColumns(df_set)
    df_set = df_set.drop('DateOfDeparture', axis=1)
    df_set['LatitudeDeparture'] = df_set['LatitudeDeparture'].abs()
    df_set['LatitudeArrival'] = df_set['LatitudeArrival'].abs()
    return df_set

In [4]:
# new column names are listed below
# print(dvec.get_feature_names())
#vecData.columns

In [5]:
# Firstly, we divide dates by day of the week (0 for Monday,1 for Tuesday etc.)
# We suspect more people should be travelling on end-of-the-week flights
def DayOfWeek(df_set,df_dates):
    """Append a 'Day of week' column (0 = Monday ... 6 = Sunday) to ``df_set``.

    Parameters
    ----------
    df_set : pandas.DataFrame
        Frame to extend; it is not modified in place.
    df_dates : iterable of datetime.datetime
        One date per row of ``df_set``, in row order.

    Returns
    -------
    pandas.DataFrame
        ``df_set`` with the extra 'Day of week' column.

    Raises
    ------
    ValueError
        If the number of dates differs from the number of rows.
    """
    weekdays = [date.weekday() for date in df_dates]
    # Build the new column on df_set's own index: a default 0..n-1 index
    # would be aligned by label in concat and misplace values whenever
    # df_set is not indexed 0..n-1.
    day_of_week = pd.DataFrame({'Day of week': weekdays}, index=df_set.index)
    return pd.concat([df_set, day_of_week], axis=1)

In [6]:
# Next, we take into consideration what season trips take place in (Winter,Spring,Summer or Fall)
# by one-hot encoding them
def Seasons(df_set,df_dates):
    """Append one-hot 'Winter', 'Spring', 'Summer' and 'Fall' columns.

    Months map to seasons as: Dec-Feb -> Winter, Mar-May -> Spring,
    Jun-Aug -> Summer, Sep-Nov -> Fall.

    All four columns are always produced, even when some season does not
    occur in ``df_dates`` (that column is then all zeros).

    Parameters
    ----------
    df_set : pandas.DataFrame
        Frame to extend; it is not modified in place.
    df_dates : iterable of datetime.datetime
        One date per row of ``df_set``, in row order.

    Returns
    -------
    pandas.DataFrame
        ``df_set`` with four extra float columns holding 0.0 / 1.0.

    Raises
    ------
    ValueError
        If the number of dates differs from the number of rows.
    """
    season_names = ['Winter','Spring','Summer','Fall']
    # month number -> position in season_names
    seasons = {
        12: 0, 1: 0, 2: 0,
        3: 1, 4: 1, 5: 1,
        6: 2, 7: 2, 8: 2,
        9: 3, 10: 3, 11: 3,
    }
    codes = [seasons[date.month] for date in df_dates]

    # Build the indicators directly instead of fitting an encoder: an encoder
    # only emits columns for the seasons present in the data, which breaks
    # the fixed four-name labelling when a season is missing.
    indicators = dict(
        (name, [float(code == position) for code in codes])
        for position, name in enumerate(season_names)
    )
    # `columns=` pins the column order; `index=` keeps rows aligned with
    # df_set regardless of how df_set is indexed.
    date_by_season = pd.DataFrame(indicators, index=df_set.index, columns=season_names)
    return pd.concat([df_set, date_by_season], axis=1)

In [7]:
# Another, more specific, time of year that we expect flights to be crowded
# would be around Christmas and New year. Therefore, we determine whether a flight
# took place between the 15th of December and the 10th of January (a window chosen arbitrarily
# around the specified holidays)
def IsNearChristmas(df_set,df_dates):
    """Append an 'IsAroundChristmas' flag column to ``df_set``.

    The flag is 1 for dates from 15 December through 10 January (both
    inclusive) and 0 otherwise.

    Parameters
    ----------
    df_set : pandas.DataFrame
        Frame to extend; it is not modified in place.
    df_dates : iterable of datetime.datetime
        One date per row of ``df_set``, in row order.

    Returns
    -------
    pandas.DataFrame
        ``df_set`` with the extra integer 'IsAroundChristmas' column.

    Raises
    ------
    ValueError
        If the number of dates differs from the number of rows.
    """
    flags = [
        int((date.month == 12 and date.day >= 15) or
            (date.month == 1 and date.day <= 10))
        for date in df_dates
    ]
    # Index the new column like df_set so concat keeps rows aligned even when
    # df_set is not indexed 0..n-1.
    around_christmas = pd.DataFrame({'IsAroundChristmas': flags}, index=df_set.index)
    return pd.concat([df_set, around_christmas], axis=1)

In [8]:
# Here, we examine whether the departure date has an impact on the PAX variable.
# The helper functions DayOfWeek, Seasons and IsNearChristmas each analyze the departure dates from a different perspective

def DateExtraColumns(df_set):
    """Add every date-derived feature column to ``df_set``.

    The 'DateOfDeparture' column is parsed as "%Y-%m-%d" strings and passed
    to ``DayOfWeek``, ``Seasons`` and ``IsNearChristmas`` in turn.

    Parameters
    ----------
    df_set : pandas.DataFrame
        Must contain a 'DateOfDeparture' column of "YYYY-MM-DD" strings.

    Returns
    -------
    pandas.DataFrame
        ``df_set`` extended with the columns added by the three helpers.
        'DateOfDeparture' itself is left in place.
    """
    # Materialise the parsed dates as a list: on Python 3 map() returns a
    # one-shot iterator, which the first helper would exhaust and leave the
    # other two with no dates.
    df_dates = [
        datetime.datetime.strptime(raw_date, "%Y-%m-%d")
        for raw_date in df_set['DateOfDeparture']
    ]
    df_set = DayOfWeek(df_set, df_dates)
    df_set = Seasons(df_set, df_dates)
    df_set = IsNearChristmas(df_set, df_dates)
    return df_set

In [9]:
def DecisionTreeClassifierEntropyTestOnly(train_set):
    """Fit an entropy decision tree on 80% of ``train_set`` and score the rest.

    The 'PAX' column is the target and all other columns are features. The
    data is split 80/20 with ``random_state = 42``; the weighted F1 score on
    the 20% hold-out is printed.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Labelled data containing a 'PAX' column.

    Returns
    -------
    numpy.ndarray
        Predicted 'PAX' values for the hold-out rows.
    """
    from sklearn import metrics, tree
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
    # replacement and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    df_target = train_set['PAX']
    df_data = train_set.drop('PAX', axis=1)
    df1_train, df1_test, df2_train, df2_test = train_test_split(
        df_data, df_target, test_size=0.2, random_state=42)

    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt.fit(df1_train, df2_train)
    df2_pred = dt.predict(df1_test)

    # The target is multiclass, so f1_score needs an explicit averaging mode;
    # 'weighted' is what old releases fell back to (with a warning).
    score = metrics.f1_score(df2_test, df2_pred, average='weighted')
    # Single-argument print() behaves the same on Python 2 and 3. The label
    # names the metric actually computed (this is an F1 score, not accuracy).
    print("Weighted F1:{0:.14f} \n".format(score))
    return df2_pred

In [10]:
def DecisionTreeClassifierEntropy(train_set,test_set):
    """Fit an entropy decision tree on ``train_set`` and predict ``test_set``.

    The 'PAX' column of ``train_set`` is the target and all other columns are
    features. The tree is fitted on the 80% part of an 80/20 split made with
    ``random_state = 42``; the 20% hold-out is not used here.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Labelled data containing a 'PAX' column.
    test_set : pandas.DataFrame
        Feature rows to predict; passed to the fitted tree unchanged.

    Returns
    -------
    numpy.ndarray
        Predicted 'PAX' values, one per row of ``test_set``.
    """
    from sklearn import tree
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
    # replacement and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    df_target = train_set['PAX']
    df_data = train_set.drop('PAX', axis=1)
    # Only the fitting part of the split is needed in this function.
    df1_train, _held_out_data, df2_train, _held_out_target = train_test_split(
        df_data, df_target, test_size=0.2, random_state=42)

    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt.fit(df1_train, df2_train)
    return dt.predict(test_set)

In [29]:
import pandas as pd
import numpy as np
from dateutil.parser import parse
import datetime
# Read each raw CSV and push it straight through the preprocessing pipeline.
df_train = PreProc(pd.read_csv('train.csv'))
df_test = PreProc(pd.read_csv('test.csv'))

# Preview the engineered test features.
df_test.head()

Unnamed: 0,LongitudeDeparture,LatitudeDeparture,LongitudeArrival,LatitudeArrival,WeeksToDeparture,std_wtd,Arrival=ATL,Arrival=BOS,Arrival=CLT,Arrival=DEN,...,Departure=PHL,Departure=PHX,Departure=SEA,Departure=SFO,Day of week,Winter,Spring,Summer,Fall,IsAroundChristmas
0,32.896828,97.037997,37.618972,122.374889,14.6,11.575837,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6,0.0,0.0,0.0,1.0,0
1,33.942536,118.408075,33.636719,84.428067,14.730769,13.364304,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3,0.0,0.0,0.0,1.0,0
2,41.978603,87.904842,29.984433,95.341442,8.470588,5.885551,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,1.0,0
3,39.861656,104.673178,33.434278,112.011583,8.2,6.292853,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0
4,41.978603,87.904842,47.449,122.309306,12.090909,9.138662,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0


In [30]:
# Separate the target from the features. The target column is dropped by
# name rather than by position, so re-running this cell fails loudly on the
# missing 'PAX' column instead of silently removing another feature.
t = df_train['PAX']
df_train = df_train.drop('PAX', axis=1)

Unnamed: 0,WeeksToDeparture,std_wtd,Arrival=ATL,Arrival=BOS,Arrival=CLT,Arrival=DEN,Arrival=DFW,Arrival=DTW,Arrival=EWR,Arrival=IAH,...,Departure=PHL,Departure=PHX,Departure=SEA,Departure=SFO,Day of week,Winter,Spring,Summer,Fall,IsAroundChristmas
0,14.6,11.575837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6,0.0,0.0,0.0,1.0,0
1,14.730769,13.364304,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3,0.0,0.0,0.0,1.0,0
2,8.470588,5.885551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,1.0,0
3,8.2,6.292853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0
4,12.090909,9.138662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0


In [31]:
# Preview the first few values of the target series.
t.head()

0    7
1    7
2    7
3    7
4    7
Name: PAX, dtype: int64

In [41]:
from sklearn.neighbors import KNeighborsClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split features (df_train) and target (t) with a fixed seed, then fit a
# 6-nearest-neighbours classifier on the training part.
X_train, X_test, y_train, y_test = train_test_split(df_train, t, random_state=10)
k = KNeighborsClassifier(n_neighbors=6)
k.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [42]:
from sklearn import metrics
# Predict the hold-out rows with the fitted KNN model `k` and print the
# fraction of correctly predicted labels.
y_pred = k.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.411235955056


In [149]:
# Write `pre` to 'p.txt', one integer-formatted value per line.
# NOTE(review): `pre` is not defined in any visible cell, so this cell fails
# on a fresh kernel - confirm where it is supposed to come from.
np.savetxt('p.txt', pre , fmt='%d')

In [78]:
# Predict the test set with the entropy decision tree and, separately, print
# the tree's score on a hold-out split of the training data.
# NOTE(review): both helpers read train_set['PAX'], so df_train must still
# contain the 'PAX' column when this cell runs.
y_pred = DecisionTreeClassifierEntropy(df_train,df_test)
# NOTE(review): despite its name, `x_test` receives the hold-out predictions
# returned by the helper; it is not used again in the visible cells.
x_test = DecisionTreeClassifierEntropyTestOnly(df_train)
# Save the test-set predictions as integers, then summarise them.
np.savetxt('y_pred.txt', y_pred, fmt='%d')
y_pred = pd.DataFrame(y_pred)
y_pred.describe()

Accuracy:0.42564673969691 



  sample_weight=sample_weight)


Unnamed: 0,0
count,2229.0
mean,3.745177
std,2.468405
min,0.0
25%,2.0
50%,5.0
75%,6.0
max,7.0


In [38]:
# Constant baseline: predict class 2 for every test row. The row count is
# taken from df_test instead of a hard-coded 2229 so the baseline follows the
# data.
# NOTE: this overwrites any 'y_pred.txt' written by an earlier cell.
y_pred = np.full((len(df_test), 1), 2)
np.savetxt('y_pred.txt', y_pred, fmt='%d')
y_pred



array([[ 2.],
       [ 2.],
       [ 2.],
       ..., 
       [ 2.],
       [ 2.],
       [ 2.]])

In [1]:

from sklearn import feature_selection
# Configure SelectPercentile with the chi2 score function and percentile = 20,
# then fit it on the training split and keep the transformed features.
# NOTE(review): `df1_train` / `df2_train` are only assigned at module level
# in a later cell, so this cell raises NameError when run top to bottom.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = 20)
X_train_fs = fs.fit_transform(df1_train,df2_train)

NameError: name 'df1_train' is not defined

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# PreProc already removes 'DateOfDeparture', so tolerate its absence instead
# of raising KeyError; this also makes the cell safe to re-run.
df_train = df_train.drop('DateOfDeparture', axis=1, errors='ignore')

# NOTE(review): requires df_train to still contain the 'PAX' column.
df_target = df_train['PAX']
df_data = df_train.drop('PAX', axis=1)
df1_train, df1_test, df2_train, df2_test = train_test_split(
    df_data, df_target, test_size=0.2, random_state=33)
print('Ths manas sou')

NameError: name 'df_train' is not defined