In [4]:
import os
import pandas as pd
from pycaret.regression import (
    setup, compare_models, create_model, tune_model, finalize_model,
    predict_model, save_model, plot_model, evaluate_model, pull
)

# Saved pipelines are written under ./models, so create that folder up front;
# exist_ok=True makes re-running this cell harmless when it is already there.
os.makedirs(name="models", exist_ok=True)

In [14]:
# Load the cleaned Melbourne sales table, then show its first five rows
# (the bare last expression is what the notebook renders).
df = pd.read_csv(filepath_or_buffer="melbourne_cleaned.csv")
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,Seller,Date,Distance,Postcode,...,CouncilArea,Latitude,Longitude,Region,Propertycount,LogPrice,SaleYear,SaleMonth,PropertyAge,Price_per_sqm
0,Airport West,154 Halsey Rd,3,t,840000,PI,Nelson,2023-09-03,13.5,3042,...,Moonee Valley,-37.718,144.878,Western Metropolitan,3464,13.641158,2023,9,7.0,3733.333333
1,Albert Park,105 Kerferd Rd,2,h,1275000,S,hockingstuart,2023-09-03,3.3,3206,...,Port Phillip,-37.8459,144.9555,Southern Metropolitan,3280,14.058458,2023,9,123.0,15548.780488
2,Albert Park,85 Richardson St,2,h,1455000,S,Thomson,2023-09-03,3.3,3206,...,Port Phillip,-37.845,144.9538,Southern Metropolitan,3280,14.190517,2023,9,,
3,Alphington,6 Smith St,4,h,2000000,S,Brace,2023-09-03,6.4,3078,...,Darebin,-37.7707,145.0318,Northern Metropolitan,2211,14.508658,2023,9,93.0,7604.562738
4,Alphington,5/6 Yarralea St,3,h,1110000,S,Jellis,2023-09-03,6.4,3078,...,Yarra,-37.7854,145.0325,Northern Metropolitan,2211,13.919871,2023,9,10.0,


In [17]:
import inspect
from pycaret.regression import setup

# Numeric columns we would like PyCaret to bin, restricted to the ones that
# actually exist in the loaded frame.
bin_candidates = [c for c in ['Distance','Landsize','BuildingArea','PropertyAge'] if c in df.columns]

# Core experiment configuration for pycaret.regression.setup().
base_params = dict(
    data=df,
    target="Price",
    session_id=42,
    fold=5,
    normalize=True,
    transform_target=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    feature_interaction=True,
    numeric_imputation="median",
    categorical_imputation="mode",
    # Columns excluded from modelling (only those present in df).
    # NOTE(review): LogPrice and Price_per_sqm look derived from the target
    # Price, so leaving them in would leak it — confirm against the cleaning step.
    ignore_features=[c for c in ['Address','Seller','Date','LogPrice','Price_per_sqm','Postcode'] if c in df.columns],
    # turn OFF PyCaret ↔ MLflow logging to avoid the crash:
    log_experiment=False,
    log_plots=False,
    log_profile=False,
    log_data=False,
)

# Options that are only passed through when they have a value.
maybe_params = {
    "unknown_categorical_method": "least_frequent",
    "bin_numeric_features": bin_candidates or None,
}

# Keep only the keyword arguments the installed setup() signature accepts, so
# the cell does not raise TypeError on a version that lacks some of them.
# Anything filtered out for that reason is reported instead of being dropped
# silently, because a dropped option means the experiment is NOT configured
# the way this cell reads.
sig = inspect.signature(setup)
params = {}
skipped_params = []
for key, value in {**base_params, **maybe_params}.items():
    if value is None:
        # Unset optional value: nothing to pass, nothing to report.
        continue
    if key not in sig.parameters:
        skipped_params.append(key)
        continue
    params[key] = value

if skipped_params:
    print(
        "(Warning) setup() in this PyCaret version does not accept: "
        + ", ".join(skipped_params)
        + " - these options were NOT applied."
    )

exp = setup(**params)
print("PyCaret setup complete (logging disabled to avoid MLflow conflict).")

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Price
2,Target type,Regression
3,Original data shape,"(13580, 26)"
4,Transformed data shape,"(13580, 33)"
5,Transformed train set shape,"(9506, 33)"
6,Transformed test set shape,"(4074, 33)"
7,Ignore features,6
8,Numeric features,14
9,Categorical features,5


PyCaret setup complete (logging disabled to avoid MLflow conflict).


In [18]:
from pycaret.regression import compare_models, tune_model, finalize_model, save_model

# Rank the candidate regressors by RMSE and keep the three strongest;
# the first entry of the returned list is the one carried forward.
top3 = compare_models(sort="RMSE", n_select=3)
best = top3[0]

# Tune the leader on the same metric, then finalize it.
tuned = tune_model(estimator=best, optimize="RMSE")
final_model = finalize_model(estimator=tuned)

# Persist the finalized model under models/.
save_model(model=final_model, model_name="models/melbourne_price_pipeline")
print(" Saved retrained pipeline.")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,165864.5669,86933517247.622,292519.6865,0.7892,0.1944,0.145,2.722
lightgbm,Light Gradient Boosting Machine,172749.8539,91827547919.1792,300842.6579,0.7771,0.2027,0.1517,0.384
xgboost,Extreme Gradient Boosting,174834.6561,94497433085.0826,305291.7452,0.7709,0.206,0.1549,0.266
et,Extra Trees Regressor,183624.0785,104026026379.0341,320397.9714,0.7476,0.2168,0.1621,2.286
rf,Random Forest Regressor,180368.4028,104399258159.5204,321104.7349,0.7463,0.2141,0.1593,2.778
gbr,Gradient Boosting Regressor,188306.054,109562477769.3887,328864.4644,0.7341,0.2195,0.1652,0.72
knn,K Neighbors Regressor,219856.0315,136058197686.6788,366585.9457,0.6696,0.2617,0.199,0.188
huber,Huber Regressor,219490.9926,141341474718.1017,374025.8052,0.6553,0.2613,0.2006,0.24
lar,Least Angle Regression,219800.336,141703540600.8989,374429.2573,0.6544,0.2609,0.2013,0.186
ridge,Ridge Regression,219800.9575,141706194396.6482,374432.9424,0.6544,0.2609,0.2013,0.17


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,185807.8393,104031751148.413,322539.5342,0.7607,0.2125,0.1591
1,196671.4717,145434012686.8157,381358.1161,0.688,0.2304,0.1668
2,171845.5265,83260399286.4149,288548.7815,0.7691,0.2052,0.1573
3,180237.0517,88301435127.935,297155.5739,0.7738,0.2139,0.1634
4,176154.4902,76238151137.5247,276112.5697,0.8025,0.2011,0.1554
Mean,182143.2759,99453149877.4206,313142.9151,0.7588,0.2126,0.1604
Std,8602.7947,24741485057.8737,37345.2087,0.0381,0.0101,0.0041


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved
 Saved retrained pipeline.


In [19]:
import mlflow
import mlflow.sklearn
from pycaret.regression import predict_model, pull

mlflow.set_experiment("melb_price_exp")

with mlflow.start_run(run_name="manual_retrain_logging") as run:
    # Log the configuration that was really handed to setup() (the filtered
    # `params` dict built in the setup cell) rather than hard-coded values, so
    # the run record cannot disagree with the experiment that was executed.
    mlflow.log_param("cv_folds", params.get("fold"))
    mlflow.log_param("normalize", params.get("normalize", False))
    mlflow.log_param("feature_interaction", params.get("feature_interaction", False))
    mlflow.log_param("ignored_features", ",".join(params.get("ignore_features", [])))

    # Log hold-out metrics.
    # pull() returns the most recently displayed score grid. Straight after
    # tune_model that grid is the per-fold CV table, whose "Mean" and "Std"
    # rows would be picked up by a column-wise min()/max(). Scoring the tuned
    # (pre-finalize) model with predict_model first makes pull() return the
    # single-row hold-out grid, and that row is what gets logged.
    try:
        predict_model(tuned)
        holdout_tbl = pull()
        for metric in ["MAE","MSE","RMSE","R2","MAPE"]:
            if metric in holdout_tbl.columns:
                mlflow.log_metric(f"holdout_{metric}", float(holdout_tbl[metric].iloc[0]))
    except Exception as e:
        # Best effort: missing metrics must not prevent the model from being logged.
        print(f"(Info) Could not pull metrics table: {e}")

    # Log model artifact
    mlflow.sklearn.log_model(final_model, artifact_path="melbourne_price_model")

    # Optional: register if your tracking URI has a registry backend
    try:
        mlflow.register_model(
            f"runs:/{run.info.run_id}/melbourne_price_model",
            name="MelbourneHousePriceModel"
        )
        print("Attempted MLflow registration.")
    except Exception as e:
        print(f"(Info) Model registry not available here: {e}")

print(" Manual MLflow logging done. View with:  mlflow ui --port 5000")

Registered model 'MelbourneHousePriceModel' already exists. Creating a new version of this model...
Created version '2' of model 'MelbourneHousePriceModel'.


Attempted MLflow registration.
 Manual MLflow logging done. View with:  mlflow ui --port 5000
