# Imports

In [39]:
!pip install xgboost
!pip install mlflow
!pip install pandas
!pip install scikit-learn

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.2.post1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.26.2.post1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (291.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.7/291.7 MB[0m [31m661.5 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.2.post1 xgboost-3.0.0


In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math
from xgboost import XGBRegressor, XGBRFRegressor
import mlflow

## Lendo as base de dados

In [3]:
df = pd.read_csv('../data/processed/casas.csv')

In [4]:
df.head()

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


## Separando as variáveis explicativas do target

In [5]:
X = df.drop('preco', axis=1)
y = df['preco'].copy()

In [6]:
X.head()

Unnamed: 0,tamanho,ano,garagem
0,159.0,2003,2
1,117.0,1976,2
2,166.0,2001,2
3,160.0,1915,3
4,204.0,2000,3


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
X_train.shape

(1022, 3)

In [9]:
X_test.shape

(438, 3)

# Usando o MLFlow

Criando experimento chamado `house-prices-eda`

In [60]:
mlflow.set_experiment('house-prices-eda')

2025/04/17 15:58:52 INFO mlflow.tracking.fluent: Experiment with name 'house-prices-eda' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/felip/projetos/alura/mlflow/notebooks/mlruns/613493852528493151', creation_time=1744916332079, experiment_id='613493852528493151', last_update_time=1744916332079, lifecycle_stage='active', name='house-prices-eda', tags={}>

# Linear Regression

In [61]:
mlflow.start_run()

<ActiveRun: >

In [62]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [63]:
lr.coef_

array([  830.00653222,   762.09006126, 25767.20537933])

In [64]:
lr.intercept_

np.float64(-1484640.8721676627)

## Logando o modelo com MLFlow

In [65]:
mlflow.sklearn.log_model(lr, 'lr')



<mlflow.models.model.ModelInfo at 0x7f2b9dbd3440>

In [32]:
lr.predict(X_test)[0:2]

array([119279.7701544 , 289022.71460536])

In [34]:
lr_predicted = lr.predict(X_test)


vejamos os dados da primeira casa, no caso aqui a casa 892, O modelo preveu o preço de venda como R$ 119.279

In [17]:
X_test.iloc[0]

tamanho      99.0
ano        1963.0
garagem       1.0
Name: 892, dtype: float64

O preço real de venda foi R$ 154.500

In [19]:
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
331     139000
323     126175
650     205950
439     110000
798     485000
Name: preco, Length: 438, dtype: int64

# Avaliando a performance do modelo

## MSE

In [66]:
mse = mean_squared_error(y_test, lr_predicted)
mlflow.log_metric('mse', mse)

In [48]:
mse = mean_squared_error(y_test, lr_predicted)
mse

2078666917.9289908

## RMSE

In [67]:
rmse = math.sqrt(mse)
rmse
mlflow.log_metric('rmse', rmse)


O nosso modelo atual está errando em média R$ 45.592

## R Quadrado

In [68]:
r2 = r2_score(y_test, lr_predicted)
r2
mlflow.log_metric('r2', r2)


Nosso r2 tá dando em torno de 70.21%

In [91]:
mlflow.end_run()

# Usando usar XGBRFRegressor (XGBoost Random Forest Regressor)

In [94]:


with mlflow.start_run():
    xgb_rf = XGBRFRegressor(random_state=42)
    xgb_rf.fit(X_train, y_train)
    mlflow.xgboost.log_model(xgb_rf, 'xgboost')

    xgb_rf_predicted = xgb_rf.predict(X_test)
    mse = mean_squared_error(y_test, xgb_rf_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_rf_predicted)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

  self.get_booster().save_model(fname)


In [86]:
xgbrf_predicted = xgbrf.predict(X_test)

## MSE XGBRFRegressor

In [87]:
xgbrf_mse = mean_squared_error(y_test, xbgrf_predicted)
xgbrf_mse

1328217856.0

## RMSE XGBREFRegressor

In [88]:
xgbrf_rmse = math.sqrt(xgbrf_mse)
xgbrf_rmse

36444.72329432616

## R² XGBRFRegressor

In [89]:
xgbrf_r2 = r2_score(y_test, xgbrf_predicted)
xgbrf_r2

0.809658944606781

# Usando usar XGBRegressor

In [95]:
# xgb_params = {
#     'learning_rate': 0.2,
#     'n_estimators': 50,
#     'random_state': 42
# }

xgb_params = {}

xgb_params['learning_rate'] = 0.2
xgb_params['n_estimators'] = 50
xgb_params['random_state'] = 42


with mlflow.start_run():
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_train)
    mlflow.xgboost.log_model(xgb, 'xgboost')

    xgb_predicted = xgb.predict(X_test)
    mse = mean_squared_error(y_test, xgb_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_predicted)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

    

  self.get_booster().save_model(fname)


## MSE XGBRegressor

In [57]:
# mse_xgb = mean_squared_error(y_test, xbg_predicted)
# mse_xgb

1572136576.0

## RMSE XGBRegressor

In [51]:
# rmse_xgb = math.sqrt(mse_xgb)
# rmse

45592.39978251848

O xgb errou em torno de R$ 45.592 (ruim)

## R^2 XGBRegressor

In [52]:
# r2_xgb = r2_score(y_test, xbg_predicted)
# r2_xgb

0.7747039794921875

R² deu 77.4%