In [2]:
import os
import csv
import pandas as pd
import numpy as np
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from dateutil import parser
import io
import base64
from subprocess import check_output

In [4]:
# Load the training and test sets from the notebook's working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [5]:
#Datetyping the dates
def ft_add_date_features(df):
    """Parse ``pickup_datetime`` and add the calendar features derived from it.

    The columns ``month``, ``week`` (ISO week number), ``weekday``, ``hour``
    and ``minute_oftheday`` (``hour * 60 + minute``) are added to ``df`` in
    place. Using one function for both train and test keeps the two frames
    from drifting apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``pickup_datetime`` column (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``pickup_datetime`` converted to datetime and
        the feature columns appended.
    """
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    pickup = df['pickup_datetime'].dt

    df['month'] = pickup.month
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # column, so cast back to the plain int64 that dt.week used to produce.
    # NOTE(review): the cast assumes there are no missing pickup timestamps.
    df['week'] = pickup.isocalendar().week.astype('int64')
    df['weekday'] = pickup.weekday
    df['hour'] = pickup.hour
    df['minute_oftheday'] = pickup.hour * 60 + pickup.minute
    return df


#Date features creations
train = ft_add_date_features(train)
test = ft_add_date_features(test)

#Deletions: dropoff_datetime does not exist in the test set, and
#pickup_datetime has been expanded into the features above.
#(test keeps its pickup_datetime column, as before.)
train = train.drop(columns=['dropoff_datetime', 'pickup_datetime'])

In [6]:
#One-hot encoding binary categorical features
# Each source column is expanded into one indicator column per distinct value,
# named after the value itself. The source column is then removed from train
# only; test keeps it.
for source_col in ('store_and_fwd_flag', 'vendor_id'):
    train_indicators = pd.get_dummies(train[source_col])
    test_indicators = pd.get_dummies(test[source_col])
    train = pd.concat([train, train_indicators], axis=1).drop(columns=[source_col])
    test = pd.concat([test, test_indicators], axis=1)

In [7]:
def ft_haversine_distance(lat1, lng1, lat2, lng2):
    """Return the haversine (great-circle) distance in kilometres.

    All four arguments are angles in degrees; scalars and NumPy arrays are
    both accepted and the computation is element-wise.

    Parameters
    ----------
    lat1, lng1 : latitude / longitude of the first point, in degrees.
    lat2, lng2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance on a sphere of radius 6371 km, same shape as the inputs.
    """
    EARTH_RADIUS_KM = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(hav))

#Add distance feature
# Same computation for both frames: pull the four coordinate columns out as
# NumPy arrays (one row per column after the transpose) and feed them to the
# haversine helper in (lat1, lng1, lat2, lng2) order.
for trips in (train, test):
    endpoint_cols = ['pickup_latitude', 'pickup_longitude',
                     'dropoff_latitude', 'dropoff_longitude']
    trips['distance'] = ft_haversine_distance(*trips[endpoint_cols].values.T)

In [8]:
#Function aiming at calculating the direction
def ft_degree(lat1, lng1, lat2, lng2):
    """Return the initial bearing from point 1 towards point 2, in degrees.

    All four arguments are angles in degrees; scalars and NumPy arrays are
    both accepted and the computation is element-wise.

    Parameters
    ----------
    lat1, lng1 : latitude / longitude of the starting point, in degrees.
    lat2, lng2 : latitude / longitude of the destination, in degrees.

    Returns
    -------
    Bearing between -180 and 180 degrees, where 0 is due north, 90 due east
    and -90 due west.
    """
    # Only the longitude *difference* enters the formula, so the individual
    # longitudes never need converting on their own.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

#Add direction feature
# Same computation for both frames: pull the four coordinate columns out as
# NumPy arrays (one row per column after the transpose) and feed them to the
# bearing helper in (lat1, lng1, lat2, lng2) order.
for trips in (train, test):
    endpoint_cols = ['pickup_latitude', 'pickup_longitude',
                     'dropoff_latitude', 'dropoff_longitude']
    trips['direction'] = ft_degree(*trips[endpoint_cols].values.T)

In [9]:
# Keep only the training trips whose haversine distance is below 200 km.
train = train.loc[train['distance'] < 200]

In [10]:
#Create speed feature
# distance is in km, so speed is km per unit of trip_duration.
# NOTE(review): trip_duration is presumably in seconds, i.e. km/s - confirm.
# assign() builds a new frame instead of writing a column into the filtered
# slice left by the distance cut, which would raise SettingWithCopyWarning.
train = train.assign(speed=train['distance'] / train['trip_duration'])

In [11]:
#Remove speed outliers
# speed only exists for this cut, so it is dropped again right away. Chaining
# drop() on the filtered result avoids an in-place drop on a slice, which
# raises SettingWithCopyWarning.
# NOTE(review): speed = distance / trip_duration with distance in km. If
# trip_duration is in seconds, the threshold below means 30 km/s
# (108,000 km/h), which would make this filter close to a no-op; 30 m/s may
# have been intended. Confirm the units before relying on this cut.
train = train[train['speed'] < 30].drop(columns=['speed'])

In [12]:
# Columns fed to the model, in the order the model sees them.
feature_cols = [
    'passenger_count',
    # raw coordinates
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    # store_and_fwd_flag indicator columns
    'N', 'Y',
    # calendar features
    'month', 'week', 'weekday', 'hour', 'minute_oftheday',
    # geometry features
    'distance', 'direction',
]

In [13]:
# Feature matrices restricted to the selected columns. The target is
# log(1 + trip_duration), so an RMSE on it is an RMSLE on the raw durations.
x_train, x_test = train.loc[:, feature_cols], test.loc[:, feature_cols]
y_train = np.log1p(train['trip_duration'])

In [14]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, then split that hold-out 60/40:
#   Xcv / Zcv (12% of all rows) is the 'valid' set watched during training,
#   Xv / Zv (8% of all rows) is set aside and not used in the cells shown here.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=1)
Xcv, Xv, Zcv, Zv = train_test_split(
    x_valid, y_valid, test_size=0.4, random_state=1)

# Wrap the training and validation sets in xgboost's DMatrix container.
data_tr, data_cv = (xgb.DMatrix(features, label=target)
                    for features, target in ((x_train, y_train), (Xcv, Zcv)))
evallist = [(data_tr, 'train'), (data_cv, 'valid')]

  if getattr(data, 'base', None) is not None and \


In [15]:
# Booster hyper-parameters.
# 'lambda' (L2 regularisation) and 'colsample_bytree' are not set, so they
# stay at the xgboost defaults.
parms = {'max_depth': 10,
         # 'reg:linear' is the deprecated name of this same objective.
         'objective': 'reg:squarederror',
         'eta': 0.05,
         'subsample': 0.8,  # fraction of training rows sampled per boosting round
         'colsample_bylevel': 1,
         'min_child_weight': 10,
         'nthread': 3}  # number of CPU threads to use

# Train for up to 1000 rounds, logging every 100. Training stops early when
# the last entry of evallist ('valid') has not improved for 100 rounds.
clf = xgb.train(parms, data_tr, num_boost_round=1000, evals=evallist,
                early_stopping_rounds=100, maximize=False,
                verbose_eval=100)

[0]	train-rmse:5.72073	valid-rmse:5.72035
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.384427	valid-rmse:0.394851
[200]	train-rmse:0.363907	valid-rmse:0.385796
[300]	train-rmse:0.354646	valid-rmse:0.383351
[400]	train-rmse:0.348084	valid-rmse:0.381874
[500]	train-rmse:0.343272	valid-rmse:0.381283
[600]	train-rmse:0.338454	valid-rmse:0.380287
[700]	train-rmse:0.33471	valid-rmse:0.379798
[800]	train-rmse:0.331078	valid-rmse:0.379354
[900]	train-rmse:0.327455	valid-rmse:0.378877
[999]	train-rmse:0.323859	valid-rmse:0.378396


In [16]:
# Report the best evaluation score and the boosting round that produced it.
report = 'score = %1.5f, n_boost_round =%d.' % (clf.best_score, clf.best_iteration)
print(report)

score = 0.37839, n_boost_round =998.


In [17]:
import pickle

In [18]:
# Save the trained model. The context manager guarantees the file is flushed
# and closed, which the bare open() call never did.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(clf, model_file)

In [19]:
# Preview the first five rows of the test feature matrix.
x_test.head(n=5)

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,N,Y,month,week,weekday,hour,minute_oftheday,distance,direction
0,1,-73.988129,40.732029,-73.990173,40.75668,1,0,6,26,3,23,1439,2.746426,-3.595224
1,1,-73.964203,40.679993,-73.959808,40.655403,1,0,6,26,3,23,1439,2.759239,172.278835
2,1,-73.997437,40.737583,-73.98616,40.729523,1,0,6,26,3,23,1439,1.306155,133.326248
3,1,-73.95607,40.7719,-73.986427,40.730469,1,0,6,26,3,23,1439,5.269088,-150.956833
4,1,-73.970215,40.761475,-73.96151,40.75589,1,0,6,26,3,23,1439,0.960842,130.260381


In [23]:
# Write the first ten test rows (with header and index) to x_test.csv.
x_test.head(10).to_csv('x_test.csv', header=True)