In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
features=['trip_duration','vendor_id','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude',
        'great_circle_distance','duration','passenger_count','pickup_hour','pickup_weekday','pickup_month','snow','holiday',
        'distance','route_cooridnates']

In [2]:
train=pd.read_csv("train_full_parsed_clean3.csv",usecols=features)

In [4]:
train.head(4)

Unnamed: 0,trip_duration,vendor_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,great_circle_distance,duration,passenger_count,pickup_hour,pickup_weekday,pickup_month,snow,holiday,distance,route_cooridnates
0,455,2,-73.982155,40.767937,-73.96463,40.765602,1498.942981,421.6,1,17,0,3,0,0,2008.2,"[[-73.982316, 40.767869], [-73.982163, 40.7677..."
1,663,1,-73.980415,40.738564,-73.999481,40.731152,1806.015862,587.0,1,0,6,6,0,0,2514.5,"[[-73.980421, 40.738566], [-73.984941, 40.7323..."
2,2124,2,-73.979027,40.763939,-74.005333,40.710087,6386.897467,1281.0,1,11,1,1,0,0,11060.5,"[[-73.978874, 40.764148], [-73.977685, 40.7636..."
3,429,2,-74.01004,40.719971,-74.012268,40.706718,1485.916955,461.7,1,19,2,4,0,0,1819.5,"[[-74.010145, 40.719982], [-74.011102, 40.7155..."


In [3]:
features_test=['id','vendor_id','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude',
        'great_circle_distance','duration','passenger_count','pickup_hour','pickup_weekday','pickup_month','snow','holiday',
        'distance','route_coordinates']
test=pd.read_csv("test_full_parsed_clean.csv",usecols=features_test)

In [4]:
test.groupby('passenger_count').size()

passenger_count
0        23
1    443447
2     90027
3     25686
4     12017
5     33411
6     20521
9         2
dtype: int64

In [5]:
test['passenger_count'].loc[test['passenger_count'] == 9] = 7

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
def transformation(train,istrain):
    
    train['vendor_id'] = train['vendor_id'].map( {1: 0, 2: 1} ).astype(int)
    
    temp = pd.get_dummies(train['pickup_hour'],prefix='pickup_hour')
    train = pd.concat([train,temp],axis=1)
    train = train.drop(["pickup_hour"], axis=1)

    temp = pd.get_dummies(train['pickup_weekday'],prefix='pickup_weekday')
    train = pd.concat([train,temp],axis=1)
    train = train.drop(["pickup_weekday"], axis=1)

    temp = pd.get_dummies(train['pickup_month'],prefix='pickup_month')
    train = pd.concat([train,temp],axis=1)
    train = train.drop(["pickup_month"], axis=1)

    temp = pd.get_dummies(train['passenger_count'],prefix='passenger_count')
    train = pd.concat([train,temp],axis=1)
    train = train.drop(["passenger_count"], axis=1)
    
    train['log_duration']=np.log(train['duration'].values+1)
    train=train.drop(["duration"], axis=1)
    if istrain=="Y":
        train['log_trip_duration']=np.log(train['trip_duration'].values+1)
        train=train.drop(["trip_duration"], axis=1)
    
    return train


In [7]:
test=transformation(test,istrain="N")

In [6]:
train=transformation(train,istrain="Y")

In [7]:
test.head(2)

Unnamed: 0,id,vendor_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,great_circle_distance,snow,holiday,distance,...,pickup_month_6,passenger_count_0,passenger_count_1,passenger_count_2,passenger_count_3,passenger_count_4,passenger_count_5,passenger_count_6,passenger_count_7,log_duration
0,id3004672,0,-73.988129,40.732029,-73.990173,40.75668,2747.199614,0,0,3794.2,...,1,0,1,0,0,0,0,0,0,6.803172
1,id3505355,0,-73.964203,40.679993,-73.959808,40.655403,2760.016336,0,0,2902.4,...,1,0,1,0,0,0,0,0,0,6.36647


In [8]:
feature_out=['log_duration', 'log_trip_duration', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'great_circle_distance','distance',
       'snow', 'holiday', 'vendor_id', 'pickup_hour_0',
       'pickup_hour_1', 'pickup_hour_2', 'pickup_hour_3', 'pickup_hour_4',
       'pickup_hour_5', 'pickup_hour_6', 'pickup_hour_7', 'pickup_hour_8',
       'pickup_hour_9', 'pickup_hour_10', 'pickup_hour_11',
       'pickup_hour_12', 'pickup_hour_13', 'pickup_hour_14',
       'pickup_hour_15', 'pickup_hour_16', 'pickup_hour_17',
       'pickup_hour_18', 'pickup_hour_19', 'pickup_hour_20',
       'pickup_hour_21', 'pickup_hour_22', 'pickup_hour_23',
       'pickup_weekday_0', 'pickup_weekday_1', 'pickup_weekday_2',
       'pickup_weekday_3', 'pickup_weekday_4', 'pickup_weekday_5',
       'pickup_weekday_6', 'pickup_month_1', 'pickup_month_2',
       'pickup_month_3', 'pickup_month_4', 'pickup_month_5',
       'pickup_month_6', 'passenger_count_0', 'passenger_count_1',
       'passenger_count_2', 'passenger_count_3', 'passenger_count_4',
       'passenger_count_5', 'passenger_count_6', 'passenger_count_7',
       'route_cooridnates']
train.to_csv("train_full_parsed_clean5.csv",index=False, columns=feature_out)

In [8]:
feature_out=['id','log_duration','pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'great_circle_distance','distance',
       'snow', 'holiday', 'vendor_id', 'pickup_hour_0',
       'pickup_hour_1', 'pickup_hour_2', 'pickup_hour_3', 'pickup_hour_4',
       'pickup_hour_5', 'pickup_hour_6', 'pickup_hour_7', 'pickup_hour_8',
       'pickup_hour_9', 'pickup_hour_10', 'pickup_hour_11',
       'pickup_hour_12', 'pickup_hour_13', 'pickup_hour_14',
       'pickup_hour_15', 'pickup_hour_16', 'pickup_hour_17',
       'pickup_hour_18', 'pickup_hour_19', 'pickup_hour_20',
       'pickup_hour_21', 'pickup_hour_22', 'pickup_hour_23',
       'pickup_weekday_0', 'pickup_weekday_1', 'pickup_weekday_2',
       'pickup_weekday_3', 'pickup_weekday_4', 'pickup_weekday_5',
       'pickup_weekday_6', 'pickup_month_1', 'pickup_month_2',
       'pickup_month_3', 'pickup_month_4', 'pickup_month_5',
       'pickup_month_6', 'passenger_count_0', 'passenger_count_1',
       'passenger_count_2', 'passenger_count_3', 'passenger_count_4',
       'passenger_count_5', 'passenger_count_6', 'passenger_count_7',
       'route_coordinates']
test.to_csv("test_full_parsed_clean2.csv",index=False, columns=feature_out)

In [15]:
del train['route_cooridnates']

In [None]:
plt.hist(train['log_trip_duration'].values, bins=100)
plt.xlabel('log(trip_duration)')
plt.ylabel('number of train records')
plt.show()
#sns.distplot(train["log_trip_duration"], bins =100)

In [16]:
Train, Test = train_test_split(train, test_size = 0.2)
X_train = Train.drop(['log_trip_duration'], axis=1)
Y_train = Train["log_trip_duration"]

X_test = Test.drop(['log_trip_duration'], axis=1)
Y_test = Test["log_trip_duration"]


In [17]:
dtrain = xgb.DMatrix(X_train, label=Y_train)
dvalid = xgb.DMatrix(X_test, label=Y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [19]:
xgb_pars = {'min_child_weight': 1, 'eta': 0.5, 'colsample_bytree': 0.9, 
            'max_depth': 10,
'subsample': 0.9, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
'eval_metric': 'rmse', 'objective': 'reg:linear'}
model = xgb.train(xgb_pars, dtrain, 10, watchlist, early_stopping_rounds=2,
      maximize=False, verbose_eval=1)
print('Modeling RMSLE %.5f' % model.best_score)

[0]	train-rmse:3.02669	valid-rmse:3.02816
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2 rounds.
[1]	train-rmse:1.54784	valid-rmse:1.5492
[2]	train-rmse:0.836179	valid-rmse:0.838468
[3]	train-rmse:0.519748	valid-rmse:0.523366
[4]	train-rmse:0.398169	valid-rmse:0.403241
[5]	train-rmse:0.358538	valid-rmse:0.364844
[6]	train-rmse:0.344191	valid-rmse:0.351516
[7]	train-rmse:0.336309	valid-rmse:0.345208
[8]	train-rmse:0.332134	valid-rmse:0.342404
[9]	train-rmse:0.329682	valid-rmse:0.34081
Modeling RMSLE 0.34081


In [None]:
xgb.plot_importance(model, max_num_features=28, height=0.7)
plt.show()

In [None]:
pred = model.predict(dtest)
pred = np.exp(pred) - 1

In [None]:
Test_id=pd.read_csv("test_full_parsed_clean.csv",usecols=['id'])
submission = pd.concat([Test_id, pd.DataFrame(pred)], axis=1)
submission.columns = ['id','trip_duration']
#submission['trip_duration'] = submission.apply(lambda x : 1 if (x['trip_duration'] <= 0) else x['trip_duration'], axis = 1)
submission.to_csv("submission.csv", index=False)