# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
from pycaret.regression import *
from pycaret.utils import check_metric

# Variables to Set

In [2]:
# CSV file to load; a bare filename, so it is resolved against the
# notebook's working directory when passed to pd.read_csv.
data_file = 'regression_dataset.csv'

# Column the regression models are trained to predict.
target_column = 'Price'

# Load/Transform Data

In [3]:
# Read the CSV and cast the target column to float in one chain, so the
# frame is never observable in a half-converted state.
df = pd.read_csv(data_file).astype({target_column: float})

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Price,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages
0,7900.0,Mid-terrace house,40.0,2.0,,1.0,1.0,1980.0,Rheinland-Pfalz,1.0
1,9900.0,Mid-terrace house,70.0,3.0,2.0,1.0,1.0,1890.0,Sachsen-Anhalt,
2,10000.0,Single dwelling,208.0,8.0,,,2.0,1920.0,Mecklenburg-Vorpommern,
3,10500.0,Mid-terrace house,200.0,4.0,,1.0,2.0,1946.0,Sachsen-Anhalt,
4,12000.0,Special property,24.0,2.0,,,,1985.0,Sachsen,


In [4]:
# Inspect the dtype of every column after the target was cast to float.
df.dtypes

Price           float64
Type             object
Living_space    float64
Rooms           float64
Bedrooms        float64
Bathrooms       float64
Floors          float64
Year_built      float64
State            object
Garages         float64
dtype: object

# Profile Data

In [5]:
# Build a pandas-profiling report over the full frame; as the last expression
# of the cell the report object is rendered as the cell output.
pp.ProfileReport(df)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=24.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






# Split into Training/Test and Validation Data

In [6]:
# Draw a reproducible 90% sample for modelling; the rows that were not drawn
# become the unseen set used for the final check.
modeling_rows = df.sample(frac=0.9, random_state=786)
data_unseen = df.drop(modeling_rows.index).reset_index(drop=True)
data = modeling_rows.reset_index(drop=True)

# Report both shapes so the split sizes can be verified at a glance.
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (9465, 10)
Unseen Data For Predictions: (1052, 10)


# Set Up Dataset with PyCaret

In [7]:
# Initialise the PyCaret regression experiment on the modelling split.
# `pycaret.regression` is already star-imported in the notebook's import cell,
# so it is not re-imported here.
#   session_id - fixed seed so the experiment is repeatable across runs
#   silent     - do not pause for the interactive data-type confirmation
#   verbose    - still display the setup summary grid
reg = setup(
    data=data,
    target=target_column,
    session_id=123,
    silent=True,
    verbose=True,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Price
2,Original Data,"(9465, 10)"
3,Missing Values,True
4,Numeric Features,6
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6625, 44)"


# Compare Regression Models

In [17]:
# Cross-validate the available regressors (LightGBM excluded) and keep the
# top-ranked estimator that compare_models returns, so later cells can reuse
# it without repeating the comparison.
best_model = compare_models(exclude=['lightgbm'])

# Show the winning estimator as the cell output.
best_model

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,228600.0,196600000000.0,438200.0,0.4898,0.5797,0.5848,1.168
gbr,Gradient Boosting Regressor,231500.0,198900000000.0,439800.0,0.4833,0.5997,0.6326,0.14
rf,Random Forest Regressor,233900.0,207500000000.0,449200.0,0.4645,0.5846,0.6141,0.592
xgboost,Extreme Gradient Boosting,238900.0,213500000000.0,457100.0,0.4425,0.6066,0.601,0.396
ridge,Ridge Regression,251400.0,229600000000.0,473200.0,0.4031,0.6896,0.6882,0.01
br,Bayesian Ridge,251400.0,230400000000.0,473900.0,0.4019,0.6869,0.6881,0.012
llar,Lasso Least Angle Regression,251900.0,230900000000.0,474800.0,0.3986,0.6936,0.6883,0.012
lr,Linear Regression,251900.0,230900000000.0,474800.0,0.3985,0.6951,0.6887,0.312
lasso,Lasso Regression,251900.0,230900000000.0,474800.0,0.3985,0.6936,0.6886,0.041
et,Extra Trees Regressor,233700.0,228500000000.0,473000.0,0.3984,0.5803,0.5971,0.531


<catboost.core.CatBoostRegressor at 0x7fbb4ff36908>

# Build a Linear Regression Model

In [9]:
# Fit a baseline linear regression ('lr'); PyCaret displays the per-fold
# cross-validation metrics as the cell output.
lr = create_model('lr')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,264400.0,266900000000.0,516600.0,0.3588,0.7714,0.7021
1,256300.0,265900000000.0,515700.0,0.3837,0.7179,0.652
2,248200.0,243200000000.0,493200.0,0.4546,0.6356,0.6023
3,267800.0,303300000000.0,550700.0,0.2891,0.7063,0.729
4,263800.0,383600000000.0,619300.0,0.37,0.6776,0.7052
5,250500.0,186800000000.0,432200.0,0.3874,0.6738,0.6897
6,244400.0,199300000000.0,446400.0,0.5846,0.6709,0.7039
7,235300.0,162900000000.0,403600.0,0.4565,0.7131,0.6753
8,239700.0,155200000000.0,394000.0,0.3509,0.6633,0.7139
9,248900.0,141700000000.0,376400.0,0.3493,0.721,0.7134


# Tune Linear Regression Model

In [10]:
# Run PyCaret's hyperparameter tuning starting from the baseline linear model.
# NOTE(review): the fold metrics recorded for this cell are identical to the
# untuned model's, so tuning appears to have no effect here - confirm whether
# this step adds anything for plain linear regression.
tuned_lr = tune_model(lr)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,264400.0,266900000000.0,516600.0,0.3588,0.7714,0.7021
1,256300.0,265900000000.0,515700.0,0.3837,0.7179,0.652
2,248200.0,243200000000.0,493200.0,0.4546,0.6356,0.6023
3,267800.0,303300000000.0,550700.0,0.2891,0.7063,0.729
4,263800.0,383600000000.0,619300.0,0.37,0.6776,0.7052
5,250500.0,186800000000.0,432200.0,0.3874,0.6738,0.6897
6,244400.0,199300000000.0,446400.0,0.5846,0.6709,0.7039
7,235300.0,162900000000.0,403600.0,0.4565,0.7131,0.6753
8,239700.0,155200000000.0,394000.0,0.3509,0.6633,0.7139
9,248900.0,141700000000.0,376400.0,0.3493,0.721,0.7134


In [11]:
# Print the estimator's repr to inspect the hyperparameters of the tuned model.
print(tuned_lr)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)


# Evaluate Linear Regression Model

In [12]:
# Open PyCaret's interactive "Plot Type" selector for the tuned model; the
# output is an ipywidgets control, so it only renders in a live notebook.
evaluate_model(tuned_lr)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

# Predict on Test Dataset

In [13]:
# No `data` argument is passed, so this scores the tuned model on the split
# PyCaret held back inside the experiment (presumably the hold-out/test part
# of `data` - the output has fewer rows than the frame given to setup()).
# Displays a metrics row plus the transformed features with a `Label` column.
predict_model(tuned_lr)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,248700.0,193700000000.0,440200.0,0.2846,0.7044,0.7038


Unnamed: 0,Living_space,Rooms,Bedrooms,Bathrooms,Year_built,Garages,Type_Bungalow,Type_Castle,Type_Corner house,Type_Duplex,...,State_Nordrhein-Westfalen,State_Rheinland-Pfalz,State_Saarland,State_Sachsen,State_Sachsen-Anhalt,State_Schleswig-Holstein,State_Thüringen,State_not_available,Price,Label
0,456.000000,15.0,11.000000,5.000000,1950.000000,3.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,665000.0,1009608.750
1,210.000000,10.0,10.000000,4.000000,1997.000000,10.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,189500.0,285981.125
2,118.500000,6.0,4.000000,1.000000,1992.000000,3.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,349000.0,500768.500
3,330.000000,11.0,8.000000,3.000000,2003.000000,7.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,398000.0,422192.500
4,130.000000,4.0,2.000000,1.000000,2007.000000,2.704516,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,450000.0,281425.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2835,230.000000,5.0,3.000000,2.000000,1958.425659,10.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,349000.0,339550.375
2836,235.399994,7.0,4.205069,2.326241,1982.000000,2.704516,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285000.0,793932.750
2837,133.000000,6.0,3.000000,1.000000,1900.000000,2.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35000.0,5719.125
2838,192.000000,7.0,5.000000,2.000000,1966.000000,1.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,420000.0,464517.375


# Predict on Validation Data

In [14]:
# Score the rows that were set aside before setup(); the returned frame is
# `data_unseen` plus a `Label` column holding the predictions.
# NOTE(review): the recorded output contains negative `Label` values, i.e.
# negative predicted prices - consider whether the target needs a transform.
unseen_predictions = predict_model(tuned_lr, data=data_unseen)
unseen_predictions

Unnamed: 0,Price,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages,Label
0,12000.0,Mid-terrace house,61.15,2.0,1.0,1.0,2.0,,Sachsen-Anhalt,,34387.125
1,19900.0,Special property,20.00,1.0,,1.0,,2008.0,Sachsen-Anhalt,,52609.875
2,27500.0,Mid-terrace house,90.00,5.0,,,2.0,1850.0,Sachsen,,-20025.375
3,28000.0,Special property,40.00,2.0,,1.0,,1900.0,Brandenburg,1.0,220227.500
4,29500.0,Mid-terrace house,60.00,2.0,1.0,1.0,1.0,1900.0,Sachsen-Anhalt,,-42270.750
...,...,...,...,...,...,...,...,...,...,...,...
1047,5519232.0,Bungalow,1013.00,26.0,16.0,14.0,3.0,2010.0,Hessen,10.0,2382064.000
1048,5575000.0,Single dwelling,289.00,6.0,,4.0,2.0,2019.0,Schleswig-Holstein,2.0,849907.750
1049,5850000.0,Villa,385.00,11.0,6.0,4.0,4.0,1914.0,Berlin,1.0,2219840.250
1050,6200000.0,Villa,878.00,5.0,3.0,2.0,3.0,2018.0,Hessen,2.0,2656745.000


In [16]:
# Mean absolute percentage error of the tuned model on the unseen rows.
# `check_metric` comes from the notebook's top-level import cell; both columns
# use bracket access so the target and prediction lookups read the same way.
check_metric(unseen_predictions[target_column], unseen_predictions['Label'], 'MAPE')

0.6695