In [1]:
# Replace your read_csv call with this code
from hydra import initialize, compose, utils
from omegaconf import OmegaConf
import pandas as pd
import os

# The CSV location is resolved against the current working directory.
current_directory = os.getcwd()

# Compose the Hydra config found two levels up (config/main.yaml) and read the
# raw-data entry from it while the Hydra context is still active.
with initialize(version_base=None, config_path="../../config/"):
    cfg = compose(config_name="main.yaml")
    print(f"Process data using {cfg.data.raw.hdb}")
    csv_file_path = os.path.join(current_directory, "../../", cfg.data.raw.hdb)

# Load the raw CSV named in the config and preview the first rows.
df = pd.read_csv(csv_file_path)
print(df.head())

Process data using data/raw/01_hdb_resale_transactions.csv
  block           street_name        town  postal_code    month flat_type  \
0   174   ANG MO KIO AVENUE 4  ANG MO KIO       560174  2015-01    3 ROOM   
1   541  ANG MO KIO AVENUE 10  ANG MO KIO       560541  2015-01    3 ROOM   
2   163   ANG MO KIO AVENUE 4  ANG MO KIO       560163  2015-01    3 ROOM   
3   446  ANG MO KIO AVENUE 10  ANG MO KIO       560446  2015-01    3 ROOM   
4   557  ANG MO KIO AVENUE 10  ANG MO KIO       560557  2015-01    3 ROOM   

  storey_range  floor_area_sqm      flat_model  lease_commence_date  \
0     07 TO 09            60.0        Improved                 1986   
1     01 TO 03            68.0  New Generation                 1981   
2     01 TO 03            69.0  New Generation                 1980   
3     01 TO 03            68.0  New Generation                 1979   
4     07 TO 09            68.0  New Generation                 1980   

   resale_price  latitude   longitude      cbd_dist

In [None]:
# Initialize the PyCaret regression experiment on the raw frame.
from pycaret.regression import *

s = setup(
    df,
    target='resale_price',
    # Fixed seed so the train/test split, outlier removal and model fits are
    # repeatable; 2733 is the session id recorded by the later setup run in
    # this notebook.
    session_id=2733,
    # Model a quantile-transformed target instead of the raw price.
    transform_target=True,
    transform_target_method='quantile',
    # Log the run to the experiment tracker under the name 'hdb_1'.
    log_experiment=True,
    experiment_name='hdb_1',
    # 80% of rows for training, 20% held out.
    train_size=0.8,
    categorical_features=['town', 'flat_type', 'storey_range', 'flat_model', 'month', 'lease_commence_date'],
    numeric_features=['floor_area_sqm', 'cbd_dist', 'min_dist_mrt'],
    # Address / coordinate columns are excluded from modelling.
    # NOTE(review): 'postal_code' is neither ignored nor declared categorical
    # here, so PyCaret will infer its type -- confirm this is intended.
    ignore_features=['block', 'street_name', 'latitude', 'longitude'],
    preprocess=True,
    remove_outliers=True,
    # Drop features whose pairwise correlation exceeds the threshold.
    remove_multicollinearity=True,
    multicollinearity_threshold=0.8,
    # Rows with missing categorical values are dropped rather than imputed.
    categorical_imputation='drop',
    use_gpu=True,
)


In [None]:
# Cross-validate every available regressor except the excluded model IDs and
# keep the top-ranked one as `best`.
best = compare_models(
    exclude=['lar', 'lightgbm', 'ada', 'rf', 'lasso', 'et', 'catboost'],
)

In [None]:
# Residuals plot for the model selected by compare_models.
plot_model(best, plot="residuals")

In [None]:
# Learning curve for the model selected by compare_models.
plot_model(best, plot="learning")

In [None]:
# Validation curve ('vc') for the model selected by compare_models.
plot_model(best, plot="vc")

In [None]:
# Prediction-error plot for the model selected by compare_models.
plot_model(best, plot="error")

In [None]:
# Feature-importance plot for the model selected by compare_models.
plot_model(best, plot="feature")

In [None]:
# Feature-importance plot covering all features for the selected model.
plot_model(best, plot="feature_all")

In [2]:
# Remove the address and coordinate columns from the frame.
# Note: this rebinds `df`, so re-running the cell raises KeyError.
df = df.drop(columns=["block", "street_name", "latitude", "longitude"])

In [3]:
# Split the 'YYYY-MM' strings in `month` on "-" into two string columns.
year_month = df["month"].str.split("-", expand=True)
df[["Year", "Month"]] = year_month

In [4]:
# The combined `month` column is redundant once Year/Month exist.
df = df.drop(columns=["month"])

In [5]:
# Initialize the PyCaret regression experiment on the cleaned frame
# (location columns dropped, month split into Year / Month).
from pycaret.regression import *

s = setup(
    df,
    target='resale_price',
    # Fixed seed so the train/test split, outlier removal and model fits are
    # repeatable; 2733 is the session id of the recorded run shown in this
    # cell's output summary.
    session_id=2733,
    # Model a quantile-transformed target instead of the raw price.
    transform_target=True,
    transform_target_method='quantile',
    # Log the run to the experiment tracker under the name 'hdb_1'.
    log_experiment=True,
    experiment_name='hdb_1',
    # 80% of rows for training, 20% held out.
    train_size=0.8,
    categorical_features=['town', 'postal_code', 'Year', 'Month', 'flat_type', 'storey_range', 'flat_model', 'lease_commence_date'],
    numeric_features=['floor_area_sqm', 'cbd_dist', 'min_dist_mrt'],
    preprocess=True,
    remove_outliers=True,
    # Drop features whose pairwise correlation exceeds the threshold.
    remove_multicollinearity=True,
    multicollinearity_threshold=0.8,
    # Rows with missing categorical values are dropped rather than imputed.
    categorical_imputation='drop',
    use_gpu=True,
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] U

Unnamed: 0,Description,Value
0,Session id,2733
1,Target,resale_price
2,Target type,Regression
3,Original data shape,"(194240, 12)"
4,Transformed data shape,"(186470, 71)"
5,Transformed train set shape,"(147622, 71)"
6,Transformed test set shape,"(38848, 71)"
7,Numeric features,3
8,Categorical features,8
9,Preprocess,True


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


In [None]:
# Cross-validate every available regressor except the excluded model IDs and
# keep the top-ranked one as `best`.
best = compare_models(
    exclude=['lar', 'lightgbm', 'ada', 'rf', 'lasso', 'et', 'catboost'],
)

In [6]:
# Train and cross-validate a gradient boosting regressor (model ID 'gbr');
# the per-fold metrics are shown as the cell output.
gb = create_model("gbr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,36031.3614,2574026258.3798,50734.8624,0.9038,0.0959,0.0739
1,36047.5145,2695643969.9203,51919.5914,0.8986,0.0965,0.0731
2,35685.2312,2595440597.087,50945.4669,0.9015,0.0962,0.0732
3,35318.5683,2573539163.3907,50730.0617,0.9006,0.0956,0.0725
4,35780.7752,2608377331.52,51072.2756,0.9026,0.0962,0.0733
5,35058.1898,2478913217.5771,49788.6856,0.908,0.0934,0.0718
6,36108.3864,2709930106.3939,52056.989,0.8962,0.0964,0.0732
7,36289.0353,2707001695.8418,52028.8545,0.8988,0.0971,0.0737
8,36345.9189,2703702325.7529,51997.1377,0.8993,0.0969,0.0741
9,35232.0122,2553209927.7622,50529.2977,0.903,0.0953,0.0724


In [None]:
# Learning curve for the model selected by compare_models.
# NOTE(review): this plots `best`, whereas the model finalized later in the
# notebook is `gb` -- confirm which one is intended.
plot_model(best, plot="learning")

In [None]:
# Validation curve ('vc') for the model selected by compare_models.
plot_model(best, plot="vc")

In [None]:
# Feature-importance plot covering all features for the selected model.
plot_model(best, plot="feature_all")

In [7]:
# Finalize the gradient boosting model created above.
final_best = finalize_model(gb)

# Persist the preprocessing pipeline together with the model under the name
# 'gb'; the returned value is left as the cell's displayed output.
save_model(final_best, model_name="gb")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\Akira\AppData\Local\Temp\joblib),
          steps=[('target_transformation',
                  TransformerWrapperWithInverse(transformer=TargetTransformer(estimator=QuantileTransformer(output_distribution='normal',
                                                                                                            random_state=2733)))),
                 ('numerical_imputer',
                  TransformerWrapper(include=['floor_area_sqm', 'cbd_dist',
                                              'min_dist_mrt'],
                                     transf...
                                     transformer=LeaveOneOutEncoder(cols=['town'],
                                                                    handle_missing='return_nan',
                                                                    random_state=2733))),
                 ('remove_multicollinearity',
                  TransformerWrapper(exclude=[],
                  