In [14]:
import pandas as pd
from pycaret.regression import *
import mlflow

In [15]:
# Raw trip records: the 2021-01 green_tripdata parquet file, read from the
# Data/ directory relative to the notebook's working directory.
TRAIN_DATA_PATH = "Data/green_tripdata_2021-01.parquet"
df_train = pd.read_parquet(TRAIN_DATA_PATH)

In [16]:
# MLflow tracking configuration: tracking data goes to the SQLite file
# mlflow.db, and "nyc_taxi_experiment" becomes the active experiment.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "nyc_taxi_experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment repr.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc_taxi_experiment', tags={}>

In [17]:
def preprocess(df):
    """Add a ``trip_duration`` column (in minutes) and keep trips of 1-60 minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records with ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``
        columns holding values that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which both timestamp columns are datetimes, a float
        ``trip_duration`` column (minutes) has been appended, and only rows
        whose duration lies in the closed interval [1, 60] remain. Index
        labels of the surviving rows are preserved. The input frame is left
        unmodified.

    Raises
    ------
    KeyError
        If either timestamp column is missing from ``df``.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])
    df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    df["trip_duration"] = elapsed.dt.total_seconds() / 60

    # Only consider trips lasting between 1 and 60 minutes (both inclusive);
    # rows with a missing duration compare False and are dropped as well.
    return df[df["trip_duration"].between(1, 60)]

# Rebind df_train to the preprocessed frame: trip_duration (minutes) is added
# and only trips lasting 1-60 minutes are kept. Later cells use this name.
df_train=preprocess(df_train)

In [19]:
# Initialise the PyCaret regression experiment that predicts trip_duration.
# NOTE(review): fare_amount / total_amount are presumably only known once a
# trip has ended — confirm they are legitimate predictors at inference time.
# NOTE(review): experiment_name here ('nyc_pycaret') differs from the name
# passed to mlflow.set_experiment earlier in the notebook — confirm which
# experiment the runs are meant to be logged under.
reg_nyc_taxi = setup(
    data=df_train,
    target="trip_duration",
    session_id=123,
    # --- preprocessing switches ---
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # --- explicit feature typing ---
    numeric_features=["trip_distance", "fare_amount", "total_amount"],
    categorical_features=["payment_type", "trip_type"],
    # --- columns excluded from modelling ---
    ignore_features=[
        "VendorID",
        "lpep_pickup_datetime",
        "lpep_dropoff_datetime",
        "store_and_fwd_flag",
        "extra",
        "ehail_fee",
        "RatecodeID",
        "PULocationID",
        "DOLocationID",
        "passenger_count",
    ],
    # --- experiment logging ---
    log_experiment=True,
    experiment_name='nyc_pycaret',
)


Unnamed: 0,Description,Value
0,session_id,123
1,Target,trip_duration
2,Original Data,"(73908, 21)"
3,Missing Values,True
4,Numeric Features,8
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(51735, 13)"


In [None]:
#best=compare_models(include=["xgboost"])

In [20]:
# Train an XGBoost regressor; the table below lists the metrics for folds 0-9.
# create_model is used rather than compare_models because only a single
# estimator is being evaluated, so the two are essentially equivalent here.
xgb=create_model("xgboost")
# Optional hyper-parameter tuning step, currently disabled.
# NOTE(review): if re-enabled, bind the result to a different name (e.g.
# xgb_tuned) — assigning to `tune_model` would shadow the function of the
# same name brought in by the star import.
#tune_model=tune_model(xgb, n_iter=5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.9655,23.6424,4.8623,0.8177,0.2291,0.1915
1,2.9227,21.5173,4.6387,0.8395,0.2207,0.187
2,2.9741,23.2988,4.8269,0.8215,0.2331,0.1889
3,2.8893,21.4267,4.6289,0.8379,0.2206,0.1829
4,2.8882,21.1411,4.5979,0.8442,0.2145,0.1809
5,2.8635,21.4191,4.6281,0.8374,0.2183,0.1846
6,2.9491,22.6983,4.7643,0.83,0.2242,0.1859
7,2.854,20.7739,4.5578,0.8419,0.2189,0.1856
8,2.84,20.8491,4.5661,0.8458,0.2246,0.1901
9,2.8681,21.6734,4.6555,0.8381,0.2187,0.1822


In [21]:
# Display the interactive "Plot Type" selector (an ipywidgets control) for
# inspecting the fitted model; the widget only renders in a live kernel.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [22]:
# No `data` argument is passed, so predictions are made on the hold-out split
# created by setup(); the returned frame carries them in the `Label` column
# next to the true trip_duration.
predict_model(xgb)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,2.9038,22.12,4.7032,0.8371,0.2288,0.1899


Unnamed: 0,trip_distance,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,payment_type_1.0,payment_type_2.0,payment_type_3.0,payment_type_4.0,payment_type_5.0,trip_type_2.0,trip_type_not_available,trip_duration,Label
0,3.058083,0.915436,-0.302493,0.051339,2.253948,-1.125524,1.0,0.0,0.0,0.0,0.0,0.0,0.0,37.099998,41.768776
1,0.424001,-1.007111,3.306362,0.051339,0.383557,0.463146,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.000000,20.119576
2,-0.843515,-1.007111,-0.302493,0.051339,-0.513079,0.463146,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.000000,6.289106
3,-0.990071,-0.287046,-0.302493,0.051339,-1.156472,0.463146,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.000000,3.722269
4,-0.543516,-1.007111,-0.302493,0.051339,0.173696,0.463146,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.000000,12.696668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22168,0.289043,-1.007111,-0.302493,0.051339,0.190206,-1.125524,0.0,1.0,0.0,0.0,0.0,0.0,0.0,32.200001,30.726021
22169,-0.369572,-1.007111,-0.302493,0.051339,0.058139,0.463146,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.000000,14.790009
22170,-0.755976,0.553321,-0.302493,0.051339,-0.989293,-1.125524,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.533333,7.010928
22171,-1.092517,-1.007111,-0.302493,0.051339,0.539569,-1.125524,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.183332,25.752277


In [24]:
# Persist the fitted model together with its transformation pipeline under
# the name models/xgb_model3.
# NOTE(review): assumes the models/ directory already exists — confirm before
# running on a fresh checkout.
save_model(xgb, "models/xgb_model3")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['payment_type',
                                                             'trip_type'],
                                       display_types=True,
                                       features_todrop=['VendorID',
                                                        'lpep_pickup_datetime',
                                                        'lpep_dropoff_datetime',
                                                        'store_and_fwd_flag',
                                                        'extra', 'ehail_fee',
                                                        'RatecodeID',
                                                        'PULocationID',
                                                        'DOLocationID',
                                                        'passenger_count'],
                                       id_columns=[], ml_usecas