In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Columns read from the parsed training CSV: the `trip_duration` target plus
# every input column used further down the notebook.
features = [
    "trip_duration",
    "vendor_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "great_circle_distance",
    "duration",
    "passenger_count",
    "pickup_hour",
    "pickup_weekday",
    "pickup_month",
    "snow",
    "holiday",
    "distance",
]
train = pd.read_csv("train_full_parsed_clean3.csv", usecols=features)

In [None]:
# Columns read from the parsed test CSV: the same inputs as for training,
# without the `trip_duration` target.
features_test = [
    "vendor_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "great_circle_distance",
    "duration",
    "passenger_count",
    "pickup_hour",
    "pickup_weekday",
    "pickup_month",
    "snow",
    "holiday",
    "distance",
]
test = pd.read_csv("test_full_parsed_clean.csv", usecols=features_test)

In [None]:
# Number of test rows for each distinct passenger_count value (displayed as the cell output).
test.groupby('passenger_count').size()

In [None]:
# Recode passenger_count 9 to 7 in the test set.
# NOTE(review): presumably 7 is the largest count present in the training data — confirm.
# A single .loc[rows, column] assignment writes into `test` itself. Chained
# indexing (test[col].loc[mask] = ...) assigns through an intermediate Series,
# which pandas warns about and which leaves `test` unchanged under Copy-on-Write.
test.loc[test['passenger_count'] == 9, 'passenger_count'] = 7

In [None]:
def transformation(train, istrain):
    """Build the model-ready feature frame from a parsed trip DataFrame.

    The input frame is left untouched; a new DataFrame is returned in which

    * ``pickup_hour``, ``pickup_weekday`` and ``pickup_month`` are replaced by
      one-hot indicator columns (``pickup_hour_<value>`` and so on), appended
      after the remaining columns in that order;
    * ``duration`` is replaced by ``log_duration = log(1 + duration)``;
    * when ``istrain == "Y"``, ``trip_duration`` is replaced by
      ``log_trip_duration = log(1 + trip_duration)``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding at least ``pickup_hour``, ``pickup_weekday``,
        ``pickup_month`` and ``duration`` (plus ``trip_duration`` when
        ``istrain == "Y"``). Despite its name, the parameter accepts any such
        frame, not only the training set.
    istrain : str
        ``"Y"`` if the frame carries the ``trip_duration`` target and it
        should be log-transformed; any other value skips that step.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    KeyError
        If a required column is missing, e.g. when the function is applied a
        second time to its own output.

    Notes
    -----
    Indicator columns are derived from the values present in the frame being
    transformed, so two frames only end up with identical columns if they
    contain the same sets of hours, weekdays and months.
    """
    # One call encodes all three calendar columns, drops the originals and
    # appends the indicator columns in the listed order.
    out = pd.get_dummies(
        train,
        columns=['pickup_hour', 'pickup_weekday', 'pickup_month'],
    )

    # log1p(x) is the numerically accurate form of log(1 + x).
    out['log_duration'] = np.log1p(out['duration'].values)
    out = out.drop(columns=['duration'])

    if istrain == "Y":
        out['log_trip_duration'] = np.log1p(out['trip_duration'].values)
        out = out.drop(columns=['trip_duration'])

    return out


In [None]:
# Encode the test features; istrain="N" because the test frame has no trip_duration target.
# This rebinds `test`, so the cell is not re-runnable: a second run raises KeyError
# because the pickup_* columns have already been replaced by indicator columns.
test=transformation(test,istrain="N")

In [None]:
# Encode the training features and log-transform the trip_duration target (istrain="Y").
# This rebinds `train`, so the cell is not re-runnable: a second run raises KeyError
# because the pickup_* columns have already been replaced by indicator columns.
train=transformation(train,istrain="Y")

In [None]:
# Print a summary (columns, dtypes, memory usage) of the transformed test frame.
test.info()

In [None]:
# Histogram of the log-transformed target over the training records.
fig, ax = plt.subplots()
ax.hist(train['log_trip_duration'].values, bins=100)
ax.set_xlabel('log(trip_duration)')
ax.set_ylabel('number of train records')
plt.show()

In [None]:
# Hold out 20% of the transformed training rows for validation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same validation score and early-stopping point.
Train, Test = train_test_split(train, test_size=0.2, random_state=42)

# NOTE: `Test`, `X_test` and `Y_test` are the validation part of the training
# data, not the lowercase `test` frame that is used for the final predictions.
X_train = Train.drop(columns=['log_trip_duration'])
Y_train = Train['log_trip_duration']
X_test = Test.drop(columns=['log_trip_duration'])
Y_test = Test['log_trip_duration']


In [None]:
# Wrap the fitting and validation splits as DMatrix objects, and name them for
# the per-round evaluation output of xgb.train.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dvalid = xgb.DMatrix(data=X_test, label=Y_test)
watchlist = [
    (dtrain, 'train'),
    (dvalid, 'valid'),
]

In [None]:
# Booster configuration. The hyper-parameter values are unchanged; only the two
# deprecated parameter names are replaced by their current equivalents.
xgb_pars = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
    'eval_metric': 'rmse',
    'eta': 0.5,
    'max_depth': 20,
    'min_child_weight': 1,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'lambda': 1.,
    'nthread': -1,
    'verbosity': 0,  # replaces the deprecated 'silent': 1
}

# Train for at most 10 boosting rounds, logging the metric every round, with
# early stopping after 2 rounds without improvement on the evaluation metric.
model = xgb.train(xgb_pars, dtrain, 10, watchlist, early_stopping_rounds=2,
                  maximize=False, verbose_eval=1)

# The labels are log(1 + trip_duration), so the RMSE reported on them is the
# RMSLE of the durations themselves.
print('Modeling RMSLE %.5f' % model.best_score)

In [None]:
# Bar chart of the trained booster's feature importances, limited to 28 features.
xgb.plot_importance(model, max_num_features=28, height=0.7)
plt.show()

In [None]:
# Align the test features with the training matrix before wrapping them.
# Train and test are one-hot encoded independently, so the test frame is not
# guaranteed to have the same indicator columns in the same order as X_train.
# reindex() puts the columns in training order, adds any indicator column that
# did not occur in the test data as all-zero, and drops indicator columns the
# model never saw. When the columns already match, this is a no-op.
dtest = xgb.DMatrix(test.reindex(columns=X_train.columns, fill_value=0))

In [None]:
# The model predicts log(1 + trip_duration); map the predictions back to the
# original trip_duration scale. expm1(p) is exp(p) - 1 computed without the
# loss of precision that the explicit subtraction suffers for p close to zero.
pred = np.expm1(model.predict(dtest))

In [None]:
# Pair every test id with its predicted trip duration and write the submission file.
# The ids come from the same CSV the test features were read from, so they line
# up with `pred` by row position.
Test_id = pd.read_csv("test_full_parsed_clean.csv", usecols=['id'])

# assign() raises ValueError if the number of predictions differs from the
# number of ids, instead of silently padding the shorter side with NaN the way
# an index-aligned concat would.
submission = Test_id.assign(trip_duration=pred)
submission.to_csv("submission.csv", index=False)