Same dataset as main_para but uses pycaret (a python library) to have quick model training and comparison.

In [1]:
import pandas as pd
import numpy as np
import xgboost
from pycaret.regression import *

In [2]:
df = pd.read_csv('data/data_fcut.csv')

In [3]:
df_ = df[['duration',
'month_start', 'day_start', 'dayOfWeek_start', 'hour_start',
'month_end', 'day_end', 'dayOfWeek_end', 'hour_end', 
'lat_strt', 'lng_strt', 'lat_end', 'lng_end','statn_dist', 
'rain','status','status_end', 'municipal', 'municipal_end',
'strt_statn', 'end_statn','bike_nr','subsc_type',
]]

In [5]:
exp = setup(data=df_, target='duration', session_id=123,
            categorical_features=['rain', 'municipal','municipal_end','subsc_type'],
            numeric_features=['month_start', 'day_start', 'dayOfWeek_start', 'hour_start', 'month_end', 'day_end',
             'dayOfWeek_end', 'hour_end', 'lat_strt', 'lng_strt', 'lat_end', 'lng_end', 'statn_dist'],
            ignore_features=['status','status_end','strt_statn', 'end_statn','bike_nr'],
            fold=5,use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,duration
2,Original Data,"(1470266, 23)"
3,Missing Values,True
4,Numeric Features,13
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1029186, 20)"


In [9]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,326.3207,713891.3928,843.8355,0.8654,0.5135,0.675,3.432
et,Extra Trees Regressor,272.0617,1067424.0435,1032.6829,0.7988,0.466,0.5176,154.464
rf,Random Forest Regressor,285.8317,1097274.1898,1046.5425,0.7932,0.4655,0.5205,152.584
gbr,Gradient Boosting Regressor,388.7713,1379449.905,1173.8386,0.7399,0.553,0.7561,137.556
knn,K Neighbors Regressor,358.6577,1435782.975,1197.0874,0.7294,0.521,0.5123,27.266
dt,Decision Tree Regressor,344.4826,2118417.111,1453.4603,0.6013,0.5704,0.5724,9.198
lr,Linear Regression,615.5978,4956097.8,2225.9455,0.0654,0.7245,0.931,0.32
ridge,Ridge Regression,615.5754,4956097.5,2225.9454,0.0654,0.7245,0.9309,0.16
br,Bayesian Ridge,615.4864,4956103.0259,2225.9467,0.0654,0.7243,0.9307,0.958
lar,Least Angle Regression,617.534,4957472.704,2226.2538,0.0651,0.7291,0.9337,0.2


In [10]:
rf = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,284.937,1066257.8985,1032.5976,0.7963,0.4661,0.5131
1,285.4023,1142846.4044,1069.0399,0.7823,0.4628,0.508
2,286.3473,1061761.3845,1030.4181,0.8087,0.464,0.5267
3,288.6045,1249652.3632,1117.8785,0.7688,0.4685,0.5302
4,283.8674,965852.8983,982.7782,0.8096,0.4659,0.5246
Mean,285.8317,1097274.1898,1046.5425,0.7932,0.4655,0.5205
SD,1.5998,94647.4088,44.9785,0.0157,0.0019,0.0085


In [6]:
xgb = create_model('xgboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,178.5363,68431.1797,261.5935,0.6382,0.4131,0.4593
1,178.7711,68762.8906,262.2268,0.6379,0.4128,0.4542
2,179.0043,69017.1953,262.7112,0.6346,0.4144,0.4625
3,178.9216,68947.5547,262.5787,0.6353,0.412,0.461
4,178.3546,68483.7812,261.6941,0.6367,0.4169,0.4833
Mean,178.7176,68728.5203,262.1609,0.6365,0.4138,0.4641
SD,0.2414,236.9807,0.452,0.0014,0.0017,0.01


further cut on lower limit

In [5]:
df_2 = df_.query('duration > 60')
df_.shape[0],df_2.shape[0]

(1470266, 1462238)

In [6]:
exp = setup(data=df_2, target='duration', session_id=123,
            categorical_features=['rain', 'municipal','municipal_end','subsc_type'],
            numeric_features=['month_start', 'day_start', 'dayOfWeek_start', 'hour_start', 'month_end', 'day_end',
             'dayOfWeek_end', 'hour_end', 'lat_strt', 'lng_strt', 'lat_end', 'lng_end', 'statn_dist'],
            ignore_features=['status','status_end','strt_statn', 'end_statn','bike_nr'],
            fold=5,use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,duration
2,Original Data,"(1462238, 23)"
3,Missing Values,True
4,Numeric Features,13
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1023566, 20)"


In [38]:
best = compare_models(include=['lr','dt','rf','et'])

IntProgress(value=0, description='Processing: ', max=24)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,163.6759,60467.5949,245.9007,0.6774,0.3535,0.2872,132.052
lr,Linear Regression,216.9177,94158.9703,306.8529,0.4977,0.4441,0.3831,0.296
dt,Decision Tree Regressor,205.883,112207.9088,334.9724,0.4014,0.4542,0.3521,7.078


In [15]:
best = compare_models(include=['lightgbm','gbr'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,183.0573,70599.1269,265.7044,0.6234,0.3689,0.3097,3.074
gbr,Gradient Boosting Regressor,192.9348,76649.3417,276.8556,0.5911,0.3875,0.3324,126.114


In [9]:
rf = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,164.1047,60934.0918,246.8483,0.6757,0.3548,0.2883
1,164.0637,60779.3972,246.5348,0.6773,0.3546,0.2884
2,163.2423,60071.5049,245.0949,0.6793,0.3518,0.2859
3,163.6299,60369.6323,245.7023,0.6774,0.3529,0.2864
4,163.3387,60183.3481,245.3229,0.6774,0.3533,0.2871
Mean,163.6759,60467.5949,245.9007,0.6774,0.3535,0.2872
SD,0.3572,335.2969,0.6815,0.0012,0.0011,0.001


In [9]:
xgb = create_model('xgboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,177.0276,67329.7734,259.4798,0.6417,0.3581,0.2961
1,177.7383,67862.1797,260.5037,0.6397,0.3594,0.2972
2,176.9081,67266.9922,259.3588,0.6409,0.3571,0.2953
3,177.2959,67308.4219,259.4387,0.6404,0.358,0.2959
4,176.5216,66997.7422,258.8392,0.6408,0.3571,0.295
Mean,177.0983,67353.0219,259.524,0.6407,0.3579,0.2959
SD,0.4055,281.2103,0.5413,0.0006,0.0008,0.0008


In [7]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,163.6759,60467.5949,245.9007,0.6774,0.3535,0.2872,112.222
et,Extra Trees Regressor,159.332,61293.4387,247.5744,0.673,0.3563,0.281,126.942
xgboost,Extreme Gradient Boosting,177.0983,67353.0219,259.524,0.6407,0.3579,0.2959,1.53
catboost,CatBoost Regressor,178.2138,67767.4911,260.3214,0.6385,0.3612,0.2999,9.754
lightgbm,Light Gradient Boosting Machine,183.0573,70599.1269,265.7044,0.6234,0.3689,0.3097,2.462
gbr,Gradient Boosting Regressor,192.9348,76649.3417,276.8556,0.5911,0.3875,0.3324,107.138
knn,K Neighbors Regressor,210.1658,92047.2609,303.3926,0.509,0.4273,0.3521,24.142
lar,Least Angle Regression,216.9181,94158.9694,306.8529,0.4977,0.4441,0.3831,0.192
br,Bayesian Ridge,216.9183,94158.9884,306.8529,0.4977,0.4441,0.3831,0.848
ridge,Ridge Regression,216.9161,94159.0156,306.8529,0.4977,0.4441,0.3831,0.156


In [9]:
xgb_tuned = tune_model(xgboost)

ValueError: Estimator <module 'xgboost' from 'C:\\Users\\david\\anaconda3\\envs\\DSTest_pycaret\\lib\\site-packages\\xgboost\\__init__.py'> does not have the required fit() method.