In [3]:
import numpy as np
import pandas as pd
import data_clean_utils
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [4]:
import dagshub
import mlflow

# Initialise the DagsHub client for this repository with its MLflow integration enabled.
dagshub.init(repo_owner='gulamkibria775', repo_name='ML_Project', mlflow=True)

# Point the MLflow client at the repository's tracking server explicitly.
mlflow.set_tracking_uri("https://dagshub.com/gulamkibria775/ML_Project.mlflow")
     

In [5]:
# Select the MLflow experiment that receives every run logged by this notebook
# (the captured output shows it is created when it does not exist yet).
mlflow.set_experiment(experiment_name="Exp 3 - RF HP Tuning")

2025/08/10 00:39:22 INFO mlflow.tracking.fluent: Experiment with name 'Exp 3 - RF HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/091d49b9b860495eaabf16b38e9aca87', creation_time=1754764762680, experiment_id='3', last_update_time=1754764762680, lifecycle_stage='active', name='Exp 3 - RF HP Tuning', tags={}>

In [6]:
from sklearn import set_config

# Ask scikit-learn transformers to return pandas containers from transform().
set_config(
    transform_output="pandas",
)

In [7]:
# Read the raw dataset from the working directory.
df = pd.read_csv("swiggy.csv")

In [8]:
# Run the project's cleaning routine and continue with the cleaned frame.
df = data_clean_utils.perform_data_cleaning(df)

In [9]:

# Columns that are excluded from modelling.
columns_to_drop = ["city_name", "order_day_of_week", "order_month"]

# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

df

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,time_taken,is_weekend,pickup_time_minutes,order_time_of_day,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,1,15.0,morning,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,0,5.0,evening,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1,15.0,morning,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,0,10.0,evening,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,1,15.0,afternoon,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45588,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,32,0,10.0,morning,1.489846,short
45589,21.0,4.6,windy,jam,0,buffet,motorcycle,1.0,no,metropolitian,36,0,15.0,evening,,
45590,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,16,0,15.0,night,4.657195,short
45591,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,26,0,5.0,afternoon,6.232393,medium


In [10]:
# Keep only complete rows. dropna() already returns a new frame and leaves `df`
# untouched, so an explicit copy beforehand is unnecessary.
temp_df = df.dropna()

In [11]:
# Separate the regression target from the predictor columns.
target_col = 'time_taken'

y = temp_df[target_col]
X = temp_df.drop(columns=[target_col])


In [12]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Power-transform the target: fit on the training target only, then apply the
# fitted transform to the test target.
pt = PowerTransformer()

# The transformer expects 2-D input, so reshape each target to a single column.
y_train_2d = y_train.values.reshape(-1, 1)
y_test_2d = y_test.values.reshape(-1, 1)

y_train_pt = pt.fit_transform(y_train_2d)
y_test_pt = pt.transform(y_test_2d)

In [14]:
# Numeric features.
num_cols = [
    "age",
    "ratings",
    "pickup_time_minutes",
    "distance",
]

# Categorical features with no inherent ordering.
nominal_cat_cols = [
    "weather", "type_of_order", "type_of_vehicle",
    "festival", "city_type", "is_weekend", "order_time_of_day",
]

# Categorical features whose levels are ranked.
ordinal_cat_cols = ["traffic", "distance_type"]

In [15]:
# Category rankings (lowest first) supplied to the ordinal encoder.
traffic_order = [
    "low",
    "medium",
    "high",
    "jam",
]
distance_type_order = [
    "short",
    "medium",
    "long",
    "very_long",
]

In [16]:
# Assemble the column-wise preprocessing step from three named transformers.

# Numeric columns: min-max scaling.
numeric_scaler = MinMaxScaler()

# Nominal columns: dense one-hot encoding with the first level dropped;
# categories unseen during fit are ignored at transform time.
nominal_encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)

# Ordinal columns: integer codes following the explicit rankings; unknown
# categories map to -1 and missing values to -999.
ordinal_encoder = OrdinalEncoder(
    categories=[traffic_order, distance_type_order],
    encoded_missing_value=-999,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Columns not listed in any group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", numeric_scaler, num_cols),
        ("nominal_encode", nominal_encoder, nominal_cat_cols),
        ("ordinal_encode", ordinal_encoder, ordinal_cat_cols),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

preprocessor

0,1,2
,transformers,"[('scale', ...), ('nominal_encode', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['low', 'medium', ...], ['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-999
,min_frequency,
,max_categories,


In [17]:
# Wrap the preprocessor in a single-step pipeline. No imputation step is
# included; rows with missing values are dropped before the split.
pipeline_steps = [("preprocess", preprocessor)]
processing_pipeline = Pipeline(steps=pipeline_steps)

processing_pipeline

0,1,2
,steps,"[('preprocess', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scale', ...), ('nominal_encode', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['low', 'medium', ...], ['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-999
,min_frequency,
,max_categories,


In [18]:

# Fit the preprocessing on the training features only, then reuse the fitted
# pipeline for the held-out features.
X_train_trans = processing_pipeline.fit_transform(X_train)
X_test_trans = processing_pipeline.transform(X_test)



In [19]:
from sklearn.ensemble import RandomForestRegressor
import optuna

In [20]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [21]:
def objective(trial):
    """Optuna objective: cross-validated MAE of one random-forest configuration.

    Samples random-forest hyperparameters from ``trial``, logs them to a nested
    MLflow run, and scores the candidate with 5-fold cross-validation on the
    transformed training features. The forest is wrapped in a
    ``TransformedTargetRegressor`` using ``pt``, so the regressor is trained on
    the power-transformed target while the error is computed against ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean absolute error averaged over the 5 folds (lower is better).
    """
    with mlflow.start_run(nested=True):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 500),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "max_features": trial.suggest_categorical("max_features", [None, "sqrt", "log2"]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1),
            "random_state": 42,
            "n_jobs": -1,
        }

        # log model parameters
        mlflow.log_params(params)

        # build the model
        rf = RandomForestRegressor(**params)
        model = TransformedTargetRegressor(regressor=rf, transformer=pt)

        # cross_val_score fits its own clones of `model` on every fold, so no
        # separate full fit or train/test prediction is needed here; the test
        # split stays untouched during tuning.
        cv_score = cross_val_score(model,
                                   X_train_trans,
                                   y_train,
                                   cv=5,
                                   scoring="neg_mean_absolute_error",
                                   n_jobs=-1)

        # the scorer returns negated MAE, so flip the sign back
        mean_score = -(cv_score.mean())

        # log avg cross val error
        mlflow.log_metric("cross_val_error", mean_score)

        return mean_score

In [22]:
from sklearn.compose import TransformedTargetRegressor

In [26]:
import shutil

# create optuna study
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    # NOTE(review): with n_jobs=-1 Optuna runs the trials in worker threads;
    # confirm that the nested runs opened in objective() still attach to this
    # parent run with the installed MLflow version.
    study.optimize(objective, n_trials=20, n_jobs=-1, show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score", study.best_value)

    # Rebuild the winning forest with the same fixed settings that objective()
    # used (random_state, n_jobs). study.best_params only holds the sampled
    # values, so without these the final model would be unseeded and
    # single-threaded, i.e. not the configuration that was actually tuned.
    best_rf = RandomForestRegressor(**study.best_params,
                                    random_state=42,
                                    n_jobs=-1)

    # np.asarray accepts both a DataFrame and an ndarray, so this does not
    # depend on the global transform_output="pandas" setting.
    best_rf.fit(X_train_trans, np.asarray(y_train_pt).ravel())

    # get the predictions (still on the power-transformed scale)
    y_pred_train = best_rf.predict(X_train_trans)
    y_pred_test = best_rf.predict(X_test_trans)

    # map the predictions back to the original target scale
    y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
    y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))

    # perform cross validation on the original target scale
    model = TransformedTargetRegressor(regressor=best_rf,
                                       transformer=pt)

    scores = cross_val_score(model,
                             X_train_trans,
                             y_train,
                             scoring="neg_mean_absolute_error",
                             cv=5, n_jobs=-1)

    # log metrics
    mlflow.log_metric("training_error", mean_absolute_error(y_train, y_pred_train_org))
    mlflow.log_metric("test_error", mean_absolute_error(y_test, y_pred_test_org))
    mlflow.log_metric("training_r2", r2_score(y_train, y_pred_train_org))
    mlflow.log_metric("test_r2", r2_score(y_test, y_pred_test_org))
    mlflow.log_metric("cross_val", - scores.mean())

    # Save the best model to the local "model" directory. save_model refuses to
    # write into an existing non-empty path, so clear the output of a previous
    # run of this cell first.
    shutil.rmtree("model", ignore_errors=True)
    mlflow.sklearn.save_model(best_rf, path="model")

[I 2025-08-10 01:09:09,680] A new study created in memory with name: no-name-3db8fb3b-d639-4802-89fb-db80eaedd831


  0%|          | 0/20 [00:00<?, ?it/s]



🏃 View run ambitious-hawk-752 at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3/runs/14391140c3f34dc5aa016a185fad07bf
🧪 View experiment at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3
[I 2025-08-10 01:09:58,337] Trial 9 finished with value: 3.3918469017031105 and parameters: {'n_estimators': 39, 'max_depth': 19, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_samples': 0.6707296731798902}. Best is trial 9 with value: 3.3918469017031105.
🏃 View run peaceful-hen-130 at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3/runs/0f94b824d0fb4c95a6f86178659418bd
🧪 View experiment at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3
[I 2025-08-10 01:10:01,384] Trial 11 finished with value: 6.017535653803142 and parameters: {'n_estimators': 42, 'max_depth': 2, 'max_features': 'log2', 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_samples': 0.5629790512590012}. Best is trial 9 



🏃 View run languid-midge-666 at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3/runs/fb938bb4048d43f4b423cb858745707d
🧪 View experiment at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3
🏃 View run dazzling-shrike-857 at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3/runs/94f09de4ca7e42e6a4caaf3e3ab2f0a0
🧪 View experiment at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3
[I 2025-08-10 01:11:28,310] Trial 13 finished with value: 3.086615148530084 and parameters: {'n_estimators': 302, 'max_depth': 20, 'max_features': None, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_samples': 0.7940282822893707}. Best is trial 13 with value: 3.086615148530084.
[I 2025-08-10 01:11:29,316] Trial 4 finished with value: 3.1056188168344745 and parameters: {'n_estimators': 485, 'max_depth': 12, 'max_features': None, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_samples': 0.8089821933362025}. Best is trial 13



🏃 View run best_model at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3/runs/720ee1bd705e421c8529c7e6398466ad
🧪 View experiment at: https://dagshub.com/gulamkibria775/ML_Project.mlflow/#/experiments/3


In [27]:

# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study=study)

In [28]:
# Estimated importance of each tuned hyperparameter.
optuna.visualization.plot_param_importances(study=study)

In [29]:
# Objective value against each hyperparameter, one panel per parameter.
optuna.visualization.plot_slice(study=study)