
# Real Estate Property Price Prediction  
---

### Project for predicting real estate property prices based on the following features:


- City (Predictor)
- Type of property (Predictor)
- Address (Predictor)
- Neighborhood (Predictor)
- Footage (Predictor)
- Doorms (Predictor)
- Garages (Predictor)
- Price (Dependent Variable)
---

### We're gonna start our project in notebooks so that we can work through it in a straightforward way; once we're done here, we'll move on to the source code of the application.

In [None]:
## importing the libs for the project

import logging
import pandas as pd
import numpy as np
import scipy.stats as stats
import duckdb as duck
import pyarrow as pa
import polars as pl
import seaborn as sns
import seaborn.objects as so
from matplotlib import pyplot as plt
import dataclasses
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, GammaRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import mlflow
from mlflow.tracking.client import MlflowClient
import os
import statsmodels.api as sm
import pickle
from lightgbm import LGBMRegressor

## JSON > Parquet
Before starting the process of cleaning and transforming the data for our analysis, we're gonna convert the files into `.parquet` format so that we're always dealing with performance-optimized datasets, no matter the situation.  

For that, we're gonna start by loading our data into [Pola.rs](pola.rs 'Most Efficient DataFrame Lib for Python') dataframes and try to get some information from our dataset.

In [None]:
## read each city's raw .json file and convert it to .parquet

raw_dir = '/home/garcia-ln/Documentos/real-state-prices/data/raw'
processed_dir = '/home/garcia-ln/Documentos/real-state-prices/data/processed'
city_prefixes = ('sp', 'rj', 'pa', 'bh')

## load every raw file first ...
city_frames = {
    prefix: pl.read_json(f'{raw_dir}/{prefix}_properties.json')
    for prefix in city_prefixes
}

## ... then persist each one as parquet under the processed folder
for prefix, frame in city_frames.items():
    frame.write_parquet(f'{processed_dir}/{prefix}_properties.parquet')

## keep the per-city names available for the rest of the notebook
sp_dt, rj_dt, pa_dt, bh_dt = (city_frames[prefix] for prefix in city_prefixes)

In [None]:
## using a duckdb database and SQL queries to build one table per city

con = duck.connect(database='/home/garcia-ln/Documentos/real-state-prices/data/processed/real_state.duckdb')

## table prefix -> value stored in the new `city` column
city_by_prefix = {
    'sp': 'Sao_Paulo',
    'rj': 'Rio_de_Janeiro',
    'pa': 'Porto_Alegre',
    'bh': 'Belo_Horizonte',
}

## CREATE OR REPLACE keeps the cell re-runnable: with a plain CREATE TABLE a second
## execution fails because the table already exists in the persistent database file
sql_template = '''
    CREATE OR REPLACE TABLE {prefix}_tbl as 
        SELECT * FROM '~/Documentos/real-state-prices/data/processed/{prefix}_properties.parquet';
    ALTER TABLE {prefix}_tbl
        ADD COLUMN city VARCHAR DEFAULT '{city}'
'''

statements = {
    prefix: sql_template.format(prefix=prefix, city=city)
    for prefix, city in city_by_prefix.items()
}
sql_sp, sql_rj, sql_pa, sql_bh = (statements[prefix] for prefix in city_by_prefix)

## run the DDL for each city, then pull the table back as a pandas frame for inspection
city_dfs = {}
for prefix, statement in statements.items():
    con.execute(statement)
    city_dfs[prefix] = con.table(f'{prefix}_tbl').df()
    display(city_dfs[prefix])

sp_df, rj_df, pa_df, bh_df = (city_dfs[prefix] for prefix in city_by_prefix)

In [None]:
## using SQL queries to create one unified, randomly ordered table

## The ordering clause must be spelled ORDER BY and belong to the CREATE TABLE statement;
## placed after the ';' as "ORDERBY" it made the whole batch a syntax error.
## The UNION is wrapped in a subquery so the random() ordering applies to the combined rows.
sql = '''
    CREATE OR REPLACE TABLE properties as
        SELECT * FROM (
                SELECT * FROM sp_tbl 
            UNION ALL 
                SELECT * FROM rj_tbl 
            UNION ALL 
                SELECT * FROM pa_tbl 
            UNION ALL 
                SELECT * FROM bh_tbl
        ) AS all_cities
        ORDER BY random()
'''

con.execute(sql)

## bring the unified table into Polars for the rest of the notebook
properties = pl.from_pandas(con.table('properties').df())
properties

## Dtypes

Now that we've converted the files from `.json` to `.parquet` and added the `city` feature to our dataset, we're gonna **add all the tables together and define the dtypes of our data**.  


After that we're gonna make sure to **change all dtypes of our dataset**, to keep a tidy dataset for our cleaning, analysis and modeling.

In [None]:
## casting each column to its target dtype with Polars

target_dtypes = {
    'type': pl.Categorical,
    'city': pl.Categorical,
    'neighborhood': pl.Categorical,
    'footage': pl.Int16,
    'doorms': pl.Int8,
    'garages': pl.Int8,
    'price': pl.Int32,
}
properties = properties.with_columns(
    [pl.col(name).cast(dtype) for name, dtype in target_dtypes.items()]
)

## persist the typed table and reload it from disk
properties_path = '/home/garcia-ln/Documentos/real-state-prices/data/raw/properties.parquet'
properties.write_parquet(properties_path)
properties = pl.read_parquet(properties_path)

## drop the address, shuffle the rows (fixed seed) and fix the column order
modeling_columns = ['city', 'neighborhood', 'type', 'footage', 'doorms', 'garages', 'price']
properties = (
    properties
    .drop('address')
    .sample(frac=1, shuffle=True, seed=42)
    .select(modeling_columns)
)
properties

In [None]:
## SQL query for dropping the address column and changing the dtypes of the properties table

## the column is named `address` (it is dropped under that name on the Polars side);
## the previous spelling `adress` made the first ALTER fail
con.execute('''
    ALTER TABLE properties
        DROP COLUMN address;
    ALTER TABLE properties
        ALTER type SET DATA TYPE VARCHAR;
    ALTER TABLE properties
        ALTER city SET DATA TYPE VARCHAR;
    ALTER TABLE properties
        ALTER neighborhood SET DATA TYPE VARCHAR;
    ALTER TABLE properties    
        ALTER footage SET DATA TYPE SMALLINT;
    ALTER TABLE properties    
        ALTER doorms SET DATA TYPE INT2;
    ALTER TABLE properties
        ALTER garages SET DATA TYPE INT2;
    ALTER TABLE properties    
        ALTER price SET DATA TYPE INT4
'''
)

## EDA
Now it's time for one of the most important parts of a data job: the analysis. Here we're gonna focus on understanding our data, **its features, dimensions, characteristics, distribution and interactions**.  

We're gonna start by understanding the basic information on the qualitative and quantitative variables, followed by some visualizations to help with the insights for our analysis.  

What we want to check in our analysis (aside from business details that depends on the questions to be answered):  
> - Type of dataset
>> - Cross Section
>> - Time Series
>> - Panel Data
> - Frequency, Distributions and Density of observations (understanding the proportions of inputs and outputs for both numerical and categorical features)
> - Missing values
> - Outliers
> - Dirty data
> - Feature Engineering
>> - Handling missing values, outliers, and cleaning the dataset
>> - Scaling the data (Standardization, Normalization) 
>> - Encoding categorical features
> - Specificity
>> - Normality
>> - Linearity
>> - Means Interaction
>> - Variance Interaction
>> - Autocorrelation
>> - Multicollinearity
>> - Heteroskedasticity
> - Feature Selection 
>> - Correlation
>> - K Neighbor
>> - ChiSquare
>> - Genetic Algo
>> - Feature Importance (Extra Tree Classifier)

Let's make some changes to our dataset to make sure we'll be able to work on it. In this case, I'm gonna use `seaborn` for our dataviz (which requires the DF in `pandas` format, not `polars`) instead of `plotly express` (which accepts the `polars` DF and generates interactive plots). The reason is that seaborn makes prettier dataviz simple and easy, and since we don't have such a big DF, there's no problem converting the pl.df to a pd.df just for plotting.  

In [None]:
## checking the null values count and schema of the dataset

display(
    properties.null_count(),
    properties.schema
)

## rows whose property type is missing; `is_null()` is the explicit null test
## (comparing a column to None with `==` does not reliably select null rows)
null = properties.filter(pl.col('type').is_null()).to_pandas()

## pandas copy of the full dataset, used by the seaborn plots below
df = properties.to_pandas()

In [None]:
## counting how often each value appears in every relevant feature

type_counts = properties['type'].value_counts()
## the numeric features are sorted by value so the tables read in ascending order
doorms_counts = properties['doorms'].value_counts().sort('doorms')
garages_counts = properties['garages'].value_counts().sort('garages')
city_counts = properties['city'].value_counts()

display(type_counts, doorms_counts, garages_counts, city_counts)

From this, we can observe that we have a very small volume of missing values. But before we do anything with them, let's check whether those null values are concentrated in one group or well distributed across all cities and prices. After that, we decide whether to **drop those null values or fill them with some statistical interpolation**.

In [None]:
## caption reused under every figure of the notebook
source = 'Source: loft.com.br; Elaborated by author'

sns.set_theme(context='notebook', style='darkgrid', font_scale=1)

## number of rows with a missing property type, per city
null_count_grid = sns.catplot(
    data=null, x='city',
    kind='count',
    orient='v',
    height=4, aspect=1.3,
    alpha=.75, linewidth=1
)
null_count_grid.set_axis_labels('City', 'Count')
null_count_grid.set_xticklabels(['SP', 'RJ', 'PA', 'BH'])

plt.title('Null Count/City', fontname='Arial', size=15, fontweight="bold")
plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='center',
    fontsize=8)
plt.show()



In [None]:
## price distribution of the rows with a missing property type, per city
null_price_grid = sns.catplot(
    data=null, x='price', y='city',
    kind='box',
    orient='h',
    height=5, aspect=1.5
)
null_price_grid.set_axis_labels('Price', 'City')

## hand-written tick labels (first one left blank)
price_tick_labels = ['', '$0', '$500K', '$1M', '$1.5M', '$2M',
                     '$2.5M', '$3M', '$3.5M', '$4M']
null_price_grid.set_xticklabels(labels=price_tick_labels, rotation=0)

plt.title('Null Prices/City', fontname='Arial', size=15, fontweight='bold')
plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='center',
    fontsize=8)
plt.show()

In [None]:
## histogram of prices for the rows with a missing property type, one colour per city
hist = so.Plot(
        null,
        x="price",
        y=None,
        color="city").add(
                so.Bars(),
                so.Hist(bins=15, stat='count'),
                so.Dodge(),
                )

## the bars are computed with stat='count', so the y axis is labelled 'Count'
## (it was previously labelled 'Percentage', which did not match the statistic)
hist.label(title='Null Price/City', x='Price', y='Count').layout(size=(10, 5)).scale(
        x = so.Continuous().tick(every=500_000).label(like='${x:,.0f}')
).limit(x=(0, 3_500_000)).show()

These plots make it easy for us to see that São Paulo has the majority of the records with a null type; that's important for us when we proceed to use some interpolation to fill in the missing data.

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1)

## log-scaled price histograms, one panel per city, coloured by property type
price_grid = sns.displot(
    data=df, x='price',
    hue='type', col='city', col_wrap=2,
    kind='hist', log_scale=True, element='step',
    palette='ch:s=1, r=2, l=.3, d=.5',
    height=5, aspect=1
)
price_grid.set_axis_labels('Prices(log)')

plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1)

## log-scaled footage histograms, one panel per city, coloured by property type
footage_grid = sns.displot(
    data=df, x='footage',
    hue='type', col='city', col_wrap=2,
    kind='hist', log_scale=True, element='step',
    palette='ch:s=1, r=2, l=.3, d=.5',
    height=5, aspect=1
)
footage_grid.set_axis_labels('Footage(log)')

plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1)

## price vs. footage scatter, one panel per city, coloured by property type
scatter_grid = sns.relplot(
    data=df, x='price', y='footage',
    kind='scatter',
    hue='type', col='city', col_wrap=2,
    height=5, aspect=1
)
scatter_grid.set_axis_labels('Prices (R$)', 'Footage (m²)')

## hand-written tick labels (first one left blank)
price_tick_labels = ['', '$0', '$5M', '$10M', '$15M', '$20M',
                     '$25M', '$30M', '$35M', '$40M', '$45M']
scatter_grid.set_xticklabels(labels=price_tick_labels, rotation=45)

plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

### Feature Engineering

Now we did some analysis on our dataset, we can take some conclusions, having more information on what to do next:  

We are focused on predicting property prices based on just a few features (all of them important to our model). Having said that, we notice that we have some **null values, outliers (Footage, Doorms, Garages and Prices) and some categorical observations that we can drop because they have such a small number of observations (types of properties)**. We can detect the outliers using the Z-score or the interquartile range, keeping: 
- Properties types = Apartment, Houses, Studios and Rooftop
- Footage < 1.000m²
- Doorms < 6
- Garages < 8 
- Prices < R$ 10.000.000

we're gonna start the feature engineering before the statistical tests stage

In [None]:
## here we're defining a function to get the quantiles to get the outliers and see how to treat it

def get_quantiles(df: pl.DataFrame, upper: float, lower: float) -> list[pl.DataFrame]:
    '''
        Compute two quantiles of a DataFrame.

        df: frame whose `quantile` method is called
        upper: quantile level of the first result (e.g. .75)
        lower: quantile level of the second result (e.g. .25)

        Returns a two-element list: [result for `upper`, result for `lower`].
    '''
    return [df.quantile(quantile=level) for level in (upper, lower)]

def filter_outliers(
    df: pl.DataFrame, 
    column: str, 
    upper_quantile: float, 
    lower_quantile: float
) -> pl.DataFrame:
    '''
        Keep only the rows whose `column` value lies strictly inside the
        1.5 x inter-quantile fences.

        df: frame to filter
        column: name of the column the fences are applied to
        upper_quantile: value of the upper quantile of `column`
        lower_quantile: value of the lower quantile of `column`

        Returns the filtered frame.
    '''
    spread = upper_quantile - lower_quantile
    fence_low = lower_quantile - (1.5 * spread)
    fence_high = upper_quantile + (1.5 * spread)

    ## strict inequalities: values sitting exactly on a fence are dropped
    inside_fences = (pl.col(column) < fence_high) & (pl.col(column) > fence_low)
    return df.filter(inside_fences)

In [None]:
## `properties_eng` is never defined in this notebook; the quartiles are taken from
## `properties`, the frame the outlier filters in the next cell are applied to
get_quantiles(properties, .75, .25)

In [None]:
## apply the three outlier filters cumulatively: each step receives the result of the
## previous one (restarting from `properties` every time discarded the earlier filters)
test = filter_outliers(properties, 'doorms', 3, 2)
test = filter_outliers(test, 'garages', 2, 1)
test = filter_outliers(test, 'footage', 125, 57)
test.describe()

We've used the interquartile method to find the outliers. Since we have a pretty small number of outliers and we prefer to develop models focused on a certain interval of values (the same strategy used by Loft, which is where we got our data from), we're gonna try to get our model to predict only for values **between 150k and 1M**.

We can do that since we don't have too many outliers and the idea of our model isn't anomaly detection, but a regression using cross-section data.

In [None]:
## Build the cleaned modeling dataset: reload the typed table, drop the street address,
## shuffle the rows, fill nulls, keep the main property types and restrict the value ranges.

## dropping the irrelevant feature (address), which will be replaced by neighborhood

properties_clean = pl.read_parquet('/home/garcia-ln/Documentos/real-state-prices/data/raw/properties.parquet')
properties_clean = properties_clean.drop('address').sample(frac=1, shuffle=True, seed=42).select(
    [
        'city', 'neighborhood', 'type', 'footage', 'doorms', 'garages', 'price'
    ]
)

## positional rename: from here on the `neighborhood` values live in a column called `address`
properties_clean.columns = ['city', 'address', 'type', 'footage', 'doorms', 'garages', 'price']

## now we're gonna fill the null values using the forward method, since we've checked that
## the null values are few and don't have particular characteristics
## NOTE(review): the fill runs on the shuffled rows, so each gap takes the value of an
## arbitrary preceding row — confirm this is the intended imputation

properties_clean = properties_clean.fill_null(strategy='forward').filter(
    (pl.col('type') == 'Apartamento') | 
    (pl.col('type') == 'Casa') | 
    (pl.col('type') == 'Cobertura') |
    (pl.col('type') == 'Duplex') |
    (pl.col('type') == 'Studio')
)

## re-apply the compact dtypes after the fill/filter step
properties_clean = properties_clean.with_columns(
   [
       (pl.col('type').cast(pl.Categorical)),
       (pl.col('city').cast(pl.Categorical)),
       (pl.col('address').cast(pl.Categorical)),
       (pl.col('footage').cast(pl.Int16)),
       (pl.col('doorms').cast(pl.Int8)),
       (pl.col('garages').cast(pl.Int8)),
       (pl.col('price').cast(pl.Int32))
   ]
)

## here we're gonna set some threshold values to filter on the target (price)
## and on the numeric features footage, doorms and garages
## NOTE(review): the lower price bound here is 100_000 while the notebook text
## mentions 150k — confirm which one is intended

properties_clean = properties_clean.filter(
    (pl.col('price') <= 1_000_000) & 
    (pl.col('price') >= 100_000) &
    (pl.col('footage') >= 0) & 
    (pl.col('footage') <= 250) &
    (pl.col('doorms') >= 1) &
    (pl.col('doorms') <= 5) &
    (pl.col('garages') <= 5)
)

## quick inspection: the frame itself, the remaining property types and summary statistics
display(
    properties_clean,
    properties_clean['type'].value_counts(),
    properties_clean.describe()
)

## persist the cleaned dataset for the following cells
properties_clean.write_parquet('/home/garcia-ln/Documentos/real-state-prices/data/processed/properties_clean.parquet')

In [None]:
## reload the cleaned dataset from disk and keep a pandas copy for seaborn
clean_path = '/home/garcia-ln/Documentos/real-state-prices/data/processed/properties_clean.parquet'
properties_clean = pl.read_parquet(clean_path)
df_clean = properties_clean.to_pandas()

In [None]:
sns.set_theme(
    context='notebook', 
    style='darkgrid', 
    font_scale=1,
)

## price vs. footage on the cleaned data, one panel per city, coloured by property type;
## seaborn is given the pandas copy (df_clean) rather than the Polars frame
sns.relplot(
    data=df_clean, x='price', y='footage', kind='scatter',
    col='city', col_wrap=2, hue='type',
    height=4, aspect=1
).set_axis_labels('Prices (R$)', 'Footage (m²)').set_xticklabels(
    labels=['$0', '$200K', '$400K', '$600K', '$800K', '$1M', ''], 
    rotation=0
)
plt.annotate(
    source,
    xy=(1, -.3),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

In [None]:
sns.set_theme(
    context='notebook', 
    style='darkgrid', 
    font_scale=1,
)

## price histograms (with KDE) of the cleaned data, one panel per city;
## seaborn is given the pandas copy (df_clean) rather than the Polars frame
sns.displot(data=df_clean, x='price', col='city', col_wrap=2, kde=True,
    kind='hist', log_scale=False, element='step', height=5, aspect=1
).set_axis_labels('Prices(R$1,000)').set_xticklabels(
    labels=['0', '200', '400', '600', '800', '1,000', ''], 
    rotation=0
)

plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

In [None]:
sns.set_theme(
    context='notebook', 
    style='darkgrid', 
    font_scale=1,
)

## footage histograms (with KDE) of the cleaned data, one panel per city;
## seaborn is given the pandas copy (df_clean) rather than the Polars frame
sns.displot(data=df_clean, x='footage', col='city', col_wrap=2, kde=True,
    kind='hist', log_scale=False, element='step', height=5, aspect=1
).set_axis_labels('Footage (m²)')

plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

After we've checked the overall characteristics of our dataset, we can proceed with some changes for our model development stage.   

As we've seen, the first thing to do after diagnosing our data is to perform Feature Selection, which can be done either before we load the data into our development environment (usually as a SQL query on some DW) or after we load it into our dataset (as we did here).  

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1)

## price distribution of the cleaned data, one box per city
box_grid = sns.catplot(
    data=df_clean, x='city', y='price',
    kind='box',
    orient='v', height=5, aspect=1.3
)
box_grid.set_axis_labels('Cities', 'Prices')

## hand-written tick labels (last one left blank)
price_tick_labels = ['$0', '$200K', '$400K', '$600K', '$800K', '$1M', '']
box_grid.set_yticklabels(labels=price_tick_labels, rotation=0)

plt.title('Boxplot Prices/City', fontname='Arial', size=15, fontweight="bold")
plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=8)
plt.show()

Now we're gonna encode and scale our features just for good practice, since our intention is to use **Xgboost and Lightgbm (which are gradient-boosted ensemble models that don't require this kind of feature engineering)**.

In [None]:
## here we encode every categorical feature as numbers so the models can consume them

## start by moving to pandas (to avoid problems with sklearn) and separating X from y

y = properties_clean['price'].to_pandas()
X1 = properties_clean.drop(columns=['price']).to_pandas()

categorical_cols = ['city', 'address', 'type']
numeric_cols = ['footage', 'doorms', 'garages']

## OrdinalEncoder is used because one-hot encoding the address feature
## would create too many dummy variables

ord_enc = OrdinalEncoder()
X2 = pd.DataFrame(
    ord_enc.fit_transform(X1[categorical_cols]),
    columns=[f'{name}_encoded' for name in categorical_cols],
)

## final feature matrix: the encoded categoricals (back under their plain names)
## followed by the numeric features, aligned on the shared row index
X3 = pd.concat(
    [X2.set_axis(categorical_cols, axis=1), X1[numeric_cols]],
    axis=1,
)

In [None]:
## attach the target to the encoded features, cast to compact dtypes and persist the result
final_dtypes = {
    'city': pl.Int8,
    'address': pl.Int16,
    'type': pl.Int8,
    'footage': pl.Float32,
    'doorms': pl.Int8,
    'garages': pl.Int8,
    'price': pl.Int32,
}
properties_final = pl.from_pandas(X3.join(y)).with_columns(
    [pl.col(name).cast(dtype) for name, dtype in final_dtypes.items()]
)

final_path = '/home/garcia-ln/Documentos/real-state-prices/data/final/properties_final.parquet'
properties_final.write_parquet(final_path)

In [None]:
## reload the final (encoded) dataset from disk
final_dataset_path = '/home/garcia-ln/Documentos/real-state-prices/data/final/properties_final.parquet'
properties_final = pl.read_parquet(final_dataset_path)

In [None]:
## separate the feature matrix from the target and hand both to pandas
feature_frame = properties_final.drop(columns=['price'])
X_val = feature_frame.to_pandas()
y = properties_final['price'].to_pandas()

In [None]:
## splitting the data into train and test sets (if we were to perform hyperparameter tuning,
## we would also carve a validation set out of the train set: train, validation and test)

split_parts = train_test_split(X_val, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
## here we scale the feature matrices

feature_cols = ['city', 'address', 'type', 'footage', 'doorms', 'garages']

## fit the scaler on the training split only (the previous `ss.fit(X)` referenced an
## undefined name), so no information from the test split reaches the scaler
ss = StandardScaler()
ss.fit(X_train)
X_train = pd.DataFrame(ss.transform(X_train), columns=feature_cols)

## the test split must go through the same fitted scaler; otherwise the models are
## evaluated on features that are on a different scale than the ones they were trained on
X_test = pd.DataFrame(ss.transform(X_test), columns=feature_cols)
X_train

### Statistical Tests

Since **we're gonna use the Xgboost and Lightgbm algorithms for regression, which are non-parametric (in other words, they make no assumptions about the distribution of the data), it's not necessary for us to perform the majority of the statistical tests** that would be necessary for OLS, ARIMA, SARIMA, PCA, SVM, etc...  

Those tests are necessary to understand better (accept or deny the assumptions) the data that we have, how should we treat it and what estimator to use for better and more robust models.  

Since it's not the case, the only test we're gonna perform is to **check collinearity through the correlation matrix**.

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1.1)

## pairwise Pearson correlation between the encoded features
pearson_corr = X3.corr(method='pearson', numeric_only=True)

sns.heatmap(
    data=pearson_corr,
    annot=True,
    fmt='.2f',
    linewidths=.5,
    linecolor='w'
)

plt.title('Heatmap: Pearson Corr. Test', fontname='Arial', size=15, fontweight='bold')
plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=9)
plt.show()

In [None]:
sns.set_theme(context='notebook', style='darkgrid', font_scale=1.1)

## pairwise Spearman (rank) correlation between the encoded features
spearman_corr = X3.corr(method='spearman', numeric_only=True)

sns.heatmap(
    data=spearman_corr,
    annot=True,
    fmt='.2f',
    linewidths=.5,
    linecolor='w'
)

plt.title('Heatmap: Spearman Corr. Test', fontname='Arial', size=15, fontweight='bold')
plt.annotate(
    source,
    xy=(1, -.2),
    xycoords='axes fraction',
    ha='left',
    fontsize=9)
plt.show()

The highest correlation is .75, between doorms and footage, so we can still consider the feature matrix linearly independent (no multicollinearity detected).

In [None]:
## creating the validation metrics

def eval_metrics(actual: pd.DataFrame, predict: pd.DataFrame) -> tuple[float, float, float]:
    '''
        Compute the 3 evaluation metrics, given actual and predicted values: 
            - Root Mean Squared Error
            - Mean Absolute Error
            - R²

        actual: observed target values
        predict: values predicted by the model, in the same order as `actual`

        Returns the tuple (rmse, mae, r2); callers unpack it into three names,
        so the return annotation is a 3-tuple rather than a list.
    '''
    ## RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(actual, predict))
    mae = mean_absolute_error(actual, predict)
    r2 = r2_score(actual, predict)
    return rmse, mae, r2

###### Before we use mlflow to log the experiments, we're gonna use cross validation so we don't overfit the model and get unwanted results

In [None]:
## 10-fold shuffled cross-validation shared by both candidate models
cv = KFold(n_splits=10, random_state=42, shuffle=True)
cv_kwargs = dict(X=X_train, y=y_train, scoring='neg_root_mean_squared_error', cv=cv)

lgbm_reg = LGBMRegressor(learning_rate=.3, max_depth=13, boosting_type='gbdt')
xgb_reg = XGBRegressor(learning_rate=.3, max_depth=6, booster='gbtree')

## mean score across the folds (negative RMSE, per the scoring name)
cv_lgbm = cross_val_score(estimator=lgbm_reg, **cv_kwargs).mean()
cv_xgb = cross_val_score(estimator=xgb_reg, **cv_kwargs).mean()

cv_lgbm, cv_xgb

In [None]:
# from skopt.space import Real, Categorical, Integer

# search_space = {
#     'learning_rate': Real(.1, 1),
#     'max_depth': Integer(-1, 15),
#     'boosting_type': Categorical(['gbdt', 'dart', 'goss', 'rf'])
# }

# lgbm_opt = BayesSearchCV(
#     LGBMRegressor(),
#         search_spaces=search_space,
#     scoring='neg_root_mean_squared_error',
#     n_iter=32,
#     random_state=42,
#     n_jobs=-1,
#     cv=10
#  )

# lgbm_opt.fit(X_train, y_train)

### mlflow

Now we're gonna create logs from the models, parameters, metrics and artifacts using **mlflow**.

In [None]:
## point mlflow at the tracking server on localhost, port 5000
tracking_uri = 'http://127.0.0.1:5000'
mlflow.set_tracking_uri(tracking_uri)

In [None]:
## setting the experiment name

experiment_name = 'RealState-Price-Prediction'

## getting the experiment, or creating it when it doesn't exist yet
## (an explicit lookup instead of catching every exception raised by create_experiment,
## which also hid unrelated failures such as an unreachable tracking server)

existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

## params list

learning_rate_xgb, max_depth_xgb = .3, 6
learning_rate_lgbm, max_depth_lgbm = .3, 13
booster_xgb, booster_lgbm = 'gbtree', 'gbdt'
eval_metric = 'rmse'
seed = 42

## dataset dimensions taken from the full feature matrix X_val
## (the previous code read them from `X`, which is never defined in this notebook)
n_observations = X_val.shape[0]
n_features = X_val.shape[-1]

In [None]:
## creating the context

with mlflow.start_run(experiment_id=exp_id, run_name='notebook_run'):
    
    ## setting tags
    
    mlflow.set_tags(
        {
            'Problem': 'Regression',
            'Models': 'Lightgbm'
        }
    )
    
    
    ## this plot isn't for statistical analysis, but for a reference when looking at the UI;
    ## seaborn is given the pandas copy (df_clean) rather than the Polars frame
    
    sns.displot(data=df_clean, x='price', col='city', col_wrap=2, kde=True,
        kind='hist', log_scale=False, element='step', height=5, aspect=1
    ).set_axis_labels('Price (R$ 1,000)').set_xticklabels(
        labels=['0', '200', '400', '600', '800', '1,000', ''], 
        rotation=0
    )
    plt.savefig('/home/garcia-ln/Documentos/real-state-prices/images/price_hist.png')
    
    ## artifacts logging (the images folder)
    
    mlflow.log_artifact('/home/garcia-ln/Documentos/real-state-prices/images/')
    

    ## lightgbm model: fit on the train split, evaluate on the test split
    
    lgbm_reg = LGBMRegressor(
        max_depth=max_depth_lgbm, 
        learning_rate=learning_rate_lgbm, 
        boosting_type=booster_lgbm
    )
    lgbm_reg.fit(X_train, y_train)
    
    predict_lgbm = lgbm_reg.predict(X_test)
    
    rmse_lgbm, mae_lgbm, r2_lgbm = eval_metrics(y_test, predict_lgbm)

    
    ## parameters logging
    
    mlflow.log_params(
        {
            'Nº Observations': n_observations,
            'Nº Features': n_features,
            'Learning Rate': learning_rate_lgbm,
            'Max Depth': max_depth_lgbm,
            'Booster': booster_lgbm
        }
    )
    
    ## metrics logging
    
    mlflow.log_metrics(
        {
            'RMSE': rmse_lgbm,        
            'MAE': mae_lgbm,
            'R²': r2_lgbm
        }
    )
    
    ## model logging
    
    mlflow.lightgbm.log_model(lgbm_reg, 'Lightgbm')    

In [None]:
## creating the context

with mlflow.start_run(experiment_id=exp_id, run_name='notebook_run'):
    
    ## setting tags
    
    mlflow.set_tags(
        {
            'Problem': 'Regression',
            'Models': 'Xgboost'
        }
    )
    
    
    ## this plot isn't for statistical analysis, but for a reference when looking at the UI;
    ## seaborn is given the pandas copy (df_clean) rather than the Polars frame
    
    sns.displot(data=df_clean, x='footage', col='city', col_wrap=2, kde=True,
        kind='hist', log_scale=False, element='step', height=5, aspect=1
    ).set_axis_labels('Footage (m²)')
    plt.savefig('/home/garcia-ln/Documentos/real-state-prices/images/footage_hist.png')
    
    ## artifacts logging (the images folder)
    
    mlflow.log_artifact('/home/garcia-ln/Documentos/real-state-prices/images/')
    

    ## xgboost model: fit on the train split, evaluate on the test split
    
    xgb_reg = XGBRegressor(
        learning_rate=learning_rate_xgb, 
        max_depth=max_depth_xgb, 
        booster=booster_xgb, 
        eval_metric=eval_metric, 
        seed=seed
    )
    xgb_reg.fit(X_train, y_train)
    
    predict_xgb = xgb_reg.predict(X_test)
    
    rmse_xgb, mae_xgb, r2_xgb = eval_metrics(y_test, predict_xgb)
    
    ## parameters logging
    
    mlflow.log_params(
        {
            'Nº Observations': n_observations,
            'Nº Features': n_features,
            'Learning Rate': learning_rate_xgb,
            'Max Depth': max_depth_xgb,
            'Booster': booster_xgb
        }
    )    
    ## metrics logging
    
    mlflow.log_metrics(
        {
            'RMSE': rmse_xgb,        
            'MAE': mae_xgb,
            'R²': r2_xgb
        }
    )
    
    ## model logging
    
    mlflow.xgboost.log_model(xgb_reg, 'XGBoost')