## Import Package

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import scipy.stats as st
from datetime import datetime
from geopy import Point
import geopy.distance
from geographiclib.geodesic import Geodesic


from sklearn.metrics import roc_auc_score, mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset

In [7]:
df = pd.read_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/df_prepared.csv')
test = pd.read_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/test_prepared.csv')
sample = pd.read_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/sample_submission.csv')

## Split Train and Test

In [8]:
X = df.drop('trip_duration', axis=1)
y = df['trip_duration']
test = test[X.columns.tolist()]

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

## Modeling and Evaluation

In [10]:
list_model = [LinearRegression(),
              DecisionTreeRegressor(),
              LGBMRegressor(),
              XGBRegressor(),
              RandomForestRegressor()]

result = pd.DataFrame(columns=['method','RMSLE'])

for model in list_model:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)
  print(str(model))

  method = str(model).split('(')[0]
  rmsle = mean_squared_log_error(y_val, abs(y_pred), squared=False)
  
  result.loc[len(result)] = [method, rmsle]

LinearRegression()
DecisionTreeRegressor()
LGBMRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
RandomForestRegressor()


In [11]:
result

Unnamed: 0,method,RMSLE
0,LinearRegression,0.565365
1,DecisionTreeRegressor,0.490906
2,LGBMRegressor,0.393296
3,XGBRegressor,0.380002
4,RandomForestRegressor,0.374756


Random Forest won't be used because it's very slow

In [12]:
xgb = XGBRegressor()
xgb.fit(X, y)
y_pred_xg = xgb.predict(test)
sample['trip_duration'] = abs(y_pred_xg.round(0))

sample.to_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/pred2_imp.csv', index=False)

result: 0.42713

In [13]:
dt = DecisionTreeRegressor()
dt.fit(X, y)
y_pred_dt = dt.predict(test)
sample['trip_duration'] = abs(y_pred_dt.round(0))

sample.to_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/pred3_imp.csv', index=False)

result: 0.5234

In [14]:
lgbm = LGBMRegressor()
lgbm.fit(X, y)
y_pred_lgbm = lgbm.predict(test)
sample['trip_duration'] = abs(y_pred_lgbm.round(0))

sample.to_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/pred4_imp.csv', index=False)

result: 0.43792

In [None]:
rf = RandomForestRegressor()
rf.fit(X, y)
y_pred = rf.predict(test)
sample['trip_duration'] = abs(y_pred.round(0))

sample.to_csv('/content/drive/MyDrive/dataset/nyc_taxi_trip/pred1_imp.csv', index=False)

result: 0.41906