# House Prices Prediction

Notebook that trains two basic house-price models (Linear Regression and XGBoost) to use as an example of experiment tracking with MLflow.

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math
import mlflow
from xgboost import XGBRegressor

In [16]:
# Load the processed house-price dataset (path is relative to the notebook's folder).
df = pd.read_csv(filepath_or_buffer='../data/processed/casas.csv')

In [17]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


In [18]:
# Build the feature matrix and the target vector.
# 'Unnamed: 0' appears to be the row index that was written into the CSV; it
# carries no information about a house, so it must not be used as a feature.
# errors='ignore' keeps this cell working if that column is absent from the
# loaded frame.
X = df.drop(columns=['preco', 'Unnamed: 0'], errors='ignore')

# Target: the sale price. Copied so later edits to y never touch df.
y = df['preco'].copy()

In [19]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,tamanho,ano,garagem
0,159.0,2003,2
1,117.0,1976,2
2,166.0,2001,2
3,160.0,1915,3
4,204.0,2000,3


In [20]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment(experiment_name='house_prices_eda')

INFO: 'house_prices_eda' does not exist. Creating a new experiment


# Linear Regression

In [28]:
# Close any run left open by an earlier execution of this notebook: calling
# start_run() while another run is still active raises an exception instead
# of starting a new one, which makes this cell fail when it is re-run.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Open the run that the Linear Regression cells log their model and metrics to.
mlflow.start_run()

<ActiveRun: >

In [29]:
# Fit an ordinary least-squares baseline on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

# Leave the estimator as the cell's last expression so it is displayed.
lr_model

LinearRegression()

In [30]:
# Log the Linear Regression model to MLflow under the name 'lr'.
mlflow.sklearn.log_model(lr_model, 'lr')

In [31]:
# Predict prices for the held-out test rows.
lr_predicted = lr_model.predict(X=X_test)

In [32]:
# Score the Linear Regression predictions against the held-out targets.
mse_lr = mean_squared_error(y_test, lr_predicted)
r2_lr = r2_score(y_test, lr_predicted)
rmse_lr = math.sqrt(mse_lr)  # RMSE is in the same unit as the target

print(f'Linear Regression Model\nMSE: {mse_lr}\nRMSE: {rmse_lr}\nR2: {r2_lr}')

# Record each metric with MLflow.
for metric_name, metric_value in (('mse', mse_lr), ('rmse', rmse_lr), ('r2', r2_lr)):
    mlflow.log_metric(metric_name, metric_value)

Linear Regression Model
MSE: 2078666917.9289908
RMSE: 45592.39978251848
R2: 0.7021153642898048


In [33]:
# Close the Linear Regression run, marking it as finished.
mlflow.end_run(status='FINISHED')

# XGBoost

In [35]:
# Hyperparameters passed to XGBRegressor in the training cell.
xgb_params = dict(
    learning_rate=0.2,
    n_estimators=50,
    random_state=42,
)

In [36]:
# Train, log and evaluate the XGBoost model inside one MLflow run. The context
# manager ends the run when the block exits, including when a step raises.
with mlflow.start_run():
    # Record the hyperparameters with the run; without them the logged metrics
    # cannot be tied back to the settings that produced them.
    mlflow.log_params(xgb_params)

    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X_train, y_train)
    mlflow.xgboost.log_model(xgb_model, 'xgboost')

    # Predict prices for the held-out test rows.
    xgb_predicted = xgb_model.predict(X_test)

    mse_xgb = mean_squared_error(y_test, xgb_predicted)
    rmse_xgb = math.sqrt(mse_xgb)  # RMSE is in the same unit as the target
    r2_xgb = r2_score(y_test, xgb_predicted)
    print(f'XGBRegressor\nMSE: {mse_xgb}\nRMSE: {rmse_xgb}\nR2: {r2_xgb}')

    mlflow.log_metric('mse', mse_xgb)
    mlflow.log_metric('rmse', rmse_xgb)
    mlflow.log_metric('r2', r2_xgb)

XGBRegressor
MSE: 1386727460.1346002
RMSE: 37238.789724353286
R2: 0.8012741720529797
