# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import ydata_profiling as pp
from pycaret.regression import *

# Variables to Set

In [2]:
data_file = 'regression_dataset.csv'
target_column = 'Price'

# Load/Transform Data

In [3]:
df = pd.read_csv(data_file)
df[target_column] = df[target_column].astype(float)
df.head()

Unnamed: 0,Price,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages
0,7900.0,Mid-terrace house,40.0,2.0,,1.0,1.0,1980.0,Rheinland-Pfalz,1.0
1,9900.0,Mid-terrace house,70.0,3.0,2.0,1.0,1.0,1890.0,Sachsen-Anhalt,
2,10000.0,Single dwelling,208.0,8.0,,,2.0,1920.0,Mecklenburg-Vorpommern,
3,10500.0,Mid-terrace house,200.0,4.0,,1.0,2.0,1946.0,Sachsen-Anhalt,
4,12000.0,Special property,24.0,2.0,,,,1985.0,Sachsen,


In [4]:
df.dtypes

Price           float64
Type             object
Living_space    float64
Rooms           float64
Bedrooms        float64
Bathrooms       float64
Floors          float64
Year_built      float64
State            object
Garages         float64
dtype: object

# Profile Data

In [5]:
pp.ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Split into Training/Test and Validation Data

In [8]:
data = df.sample(frac=0.75, random_state=786)
data_unseen = df.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (7888, 10)
Unseen Data For Predictions: (2629, 10)


# Setup Dataset with Pycaret

In [11]:
reg = setup(data = data, 
            target = target_column , 
            session_id=123,
            # silent = True,
            verbose = True) 

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Price
2,Target type,Regression
3,Original data shape,"(7888, 10)"
4,Transformed data shape,"(7888, 35)"
5,Transformed train set shape,"(5521, 35)"
6,Transformed test set shape,"(2367, 35)"
7,Numeric features,7
8,Categorical features,2
9,Rows with missing values,58.4%


# Compare Regression Models

In [12]:
compare_models(
#         include = ['lr']
        
        exclude = ['lightgbm']
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,235138.2947,190496568577.617,434629.7268,0.4555,0.6064,0.6421,0.076
et,Extra Trees Regressor,234584.5269,193679484407.6774,437601.7098,0.4514,0.591,0.613,0.215
rf,Random Forest Regressor,237527.5169,203554868545.5056,448347.279,0.4266,0.5949,0.629,0.239
br,Bayesian Ridge,254345.6578,219095840960.6798,464019.3544,0.3943,0.6965,0.6916,0.022
ridge,Ridge Regression,255173.9575,219223775268.4752,464213.918,0.3936,0.7079,0.6949,0.02
lr,Linear Regression,255395.4286,219364367789.6747,464397.0068,0.3931,0.7195,0.6956,0.394
lasso,Lasso Regression,255403.0828,219378766605.4958,464411.6685,0.393,0.7161,0.6957,0.245
llar,Lasso Least Angle Regression,255402.6681,219378603104.7718,464411.4698,0.393,0.7162,0.6957,0.022
xgboost,Extreme Gradient Boosting,244339.8781,207730547097.6,453938.7406,0.3904,0.6189,0.6042,0.13
huber,Huber Regressor,260643.7021,248613783055.1453,494190.0845,0.3149,0.679,0.7352,0.113


# Build a Linear Regression Model

In [18]:
lr = create_model('lr')
gbr = create_model('gbr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,253751.3427,177320052717.4123,421093.8764,0.4023,0.7413,0.6583
1,267174.413,322717074704.9928,568081.9261,0.4196,0.7726,0.7328
2,261470.3542,194727889602.5328,441279.8314,0.4269,0.7228,0.804
3,273402.9819,331875967958.2029,576086.7712,0.4588,0.6922,0.6132
4,255334.4257,205816423146.0852,453669.9496,0.4409,0.6735,0.6651
5,246273.6033,157337947696.0526,396658.4774,0.4545,0.7965,0.6926
6,242963.0707,247466052938.2084,497459.5993,0.3005,0.6994,0.7229
7,263625.5756,217436107060.9548,466300.4472,0.3004,0.7507,0.7073
8,252463.9801,168561495851.462,410562.4141,0.3361,0.6716,0.6896
9,237494.5389,170384666220.8436,412776.7753,0.3907,0.6748,0.6697


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,248255.4287,210680935621.7378,458999.9299,0.2899,0.5921,0.611
1,238533.2495,200122543648.7329,447350.5825,0.6401,0.6435,0.6525
2,229501.1444,162188325904.8598,402726.1177,0.5227,0.6262,0.7277
3,257143.0784,269560827581.2062,519192.4764,0.5604,0.6251,0.5909
4,249583.3698,209456990802.3962,457664.7144,0.431,0.6088,0.6362
5,224275.9872,139474785014.9938,373463.2311,0.5164,0.6001,0.6496
6,213540.9455,207733070551.6082,455777.4353,0.4128,0.5984,0.6651
7,230596.6029,154418139002.5539,392960.7347,0.5032,0.5937,0.6292
8,239223.7281,180703072481.7498,425091.8401,0.2883,0.5999,0.6287
9,220729.4125,170626995166.3308,413070.2061,0.3899,0.5757,0.6297


# Tune Linear Regression Model

In [14]:
tuned_lr = tune_model(lr)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,253751.3427,177320052717.4123,421093.8764,0.4023,0.7413,0.6583
1,267174.413,322717074704.9928,568081.9261,0.4196,0.7726,0.7328
2,261470.3542,194727889602.5328,441279.8314,0.4269,0.7228,0.804
3,273402.9819,331875967958.2029,576086.7712,0.4588,0.6922,0.6132
4,255334.4257,205816423146.0852,453669.9496,0.4409,0.6735,0.6651
5,246273.6033,157337947696.0526,396658.4774,0.4545,0.7965,0.6926
6,242963.0707,247466052938.2084,497459.5993,0.3005,0.6994,0.7229
7,263625.5756,217436107060.9548,466300.4472,0.3004,0.7507,0.7073
8,252463.9801,168561495851.462,410562.4141,0.3361,0.6716,0.6896
9,237494.5389,170384666220.8436,412776.7753,0.3907,0.6748,0.6697


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [15]:
print(tuned_lr)

LinearRegression(n_jobs=-1)


# Evaluate Linear Regression Model

In [20]:
evaluate_model(tuned_lr)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Predict on Test Dataset

In [21]:
predict_model(tuned_lr)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,247244.5769,225176936059.292,474528.1194,0.3178,0.7089,0.7029


Unnamed: 0,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages,Price,prediction_label
550,Special property,40.000000,2.0,,1.0,,,Brandenburg,1.0,79000.0,295464.0
4975,Duplex,355.000000,20.0,,5.0,4.0,1886.0,Sachsen,2.0,295000.0,446808.0
6190,Mid-terrace house,235.000000,8.0,,3.0,4.0,1994.0,Bayern,1.0,890000.0,1005420.0
5116,Single dwelling,160.000000,6.0,4.0,2.0,4.0,1983.0,Bayern,2.0,525000.0,814516.0
2312,Duplex,280.000000,8.5,4.0,2.0,,1935.0,Nordrhein-Westfalen,1.0,395000.0,534660.0
...,...,...,...,...,...,...,...,...,...,...,...
4786,Duplex,269.000000,15.0,,,3.0,1913.0,Sachsen,2.0,300000.0,203568.0
7590,Multiple dwelling,154.970001,6.0,4.0,2.0,2.0,1950.0,Nordrhein-Westfalen,2.0,315000.0,359580.0
6328,Duplex,180.000000,10.0,,2.0,2.0,1910.0,Nordrhein-Westfalen,4.0,195000.0,235944.0
1416,Duplex,380.000000,14.0,,,,1898.0,Sachsen,4.0,290000.0,394012.0


# Predict on Validation Data

In [24]:
unseen_predictions = predict_model(tuned_lr, data=data_unseen)
unseen_predictions.head(20)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,252984.1942,294412996771.611,542598.3752,0.2913,0.6966,0.6835


Unnamed: 0,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages,Price,prediction_label
0,Mid-terrace house,70.0,3.0,2.0,1.0,1.0,1890.0,Sachsen-Anhalt,,9900.0,-135308.0
1,Mid-terrace house,61.150002,2.0,1.0,1.0,2.0,,Sachsen-Anhalt,,12000.0,29048.0
2,Mid-terrace house,70.0,3.0,2.0,1.0,1.0,,Sachsen-Anhalt,,13500.0,-55020.0
3,,30.0,1.0,,,1.0,1960.0,Sachsen-Anhalt,1.0,16000.0,-59772.0
4,Duplex,183.0,10.0,,,,1929.0,Brandenburg,,19500.0,318532.0
5,Special property,20.0,1.0,,1.0,,2008.0,Sachsen-Anhalt,,19900.0,74368.0
6,Mid-terrace house,120.0,4.0,3.0,1.0,1.0,1960.0,Hessen,1.0,20000.0,345944.0
7,Single dwelling,80.0,5.0,,1.0,,1920.0,Sachsen,1.0,25000.0,-40212.0
8,Mid-terrace house,90.0,5.0,,,2.0,1850.0,Sachsen,,27500.0,-44072.0
9,Special property,40.0,2.0,,1.0,,1900.0,Brandenburg,1.0,28000.0,226728.0


In [25]:
unseen_predictions[unseen_predictions['Type']=='Farmhouse']

Unnamed: 0,Type,Living_space,Rooms,Bedrooms,Bathrooms,Floors,Year_built,State,Garages,Price,prediction_label
18,Farmhouse,90.000000,5.0,,1.0,,1924.0,Sachsen-Anhalt,1.0,30000.0,-93932.0
19,Farmhouse,150.000000,8.0,6.0,1.0,2.0,1950.0,Rheinland-Pfalz,,30310.0,74092.0
26,Farmhouse,200.000000,8.0,,,3.0,,Thüringen,3.0,35000.0,231924.0
31,Farmhouse,110.000000,5.0,2.0,1.0,3.0,1900.0,Thüringen,2.0,38000.0,-5548.0
39,Farmhouse,100.000000,4.0,3.0,1.0,3.0,,Rheinland-Pfalz,,39500.0,164860.0
...,...,...,...,...,...,...,...,...,...,...,...
2252,Farmhouse,164.750000,5.0,4.0,2.0,3.0,2001.0,Hessen,1.0,858600.0,578924.0
2312,Farmhouse,120.000000,5.0,,1.0,4.0,1962.0,Bayern,1.0,960000.0,623764.0
2336,Farmhouse,170.000000,4.0,,2.0,,,Nordrhein-Westfalen,2.0,990000.0,422760.0
2343,Farmhouse,129.199997,5.0,4.0,2.0,4.0,,Bayern,2.0,995000.0,688700.0


In [23]:
from pycaret.utils import check_metric
check_metric(unseen_predictions[target_column], unseen_predictions.Label, 'MAPE')

ImportError: cannot import name 'check_metric' from 'pycaret.utils' (C:\ProgramData\Anaconda3\lib\site-packages\pycaret\utils\__init__.py)