In [139]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

df = pd.read_csv('processed-dataset.csv')

df.drop(columns=['Unnamed: 0'], inplace=True)
df['checkin'] = pd.to_datetime(df['checkin'])
df['year'] = df['checkin'].dt.year
df['month'] = df['checkin'].dt.month
df['week'] = df['checkin'].dt.isocalendar().week
df_test = df[((df['checkin'].dt.month > 6) & (df['checkin'].dt.year == 2025))]
df = df[~((df['checkin'].dt.month > 6) & (df['checkin'].dt.year == 2025))]
# df[df['month'] == 6].head()
df_test.head()


Unnamed: 0,hotel,cidade,preço,checkin,checkout,dist_centro,avaliacao,qtnd_avaliacoes,data_extracao,tem_feriado,nome_feriado,mes_checkin,log_price,price_category,season,norm_price,year,month,week
6622,Aconchegante Apto de 1qt a 60mts do Consulado EUA,Recife,1011,2025-07-01,2025-07-08,1.2,,,2024-6-6,False,,7,3.004751,Econômico,Inverno,0.058357,2025,7,27
6623,Lindo APT Cinza à 50m do mar - Praia de Boa Vi...,Recife,838,2025-07-01,2025-07-08,5.8,,,2024-6-6,False,,7,2.923244,Econômico,Inverno,0.044769,2025,7,27
6624,Lindo APT Azul a 50 metros do mar - Praia de B...,Recife,750,2025-07-01,2025-07-08,5.8,,,2024-6-6,False,,7,2.875061,Econômico,Inverno,0.037857,2025,7,27
6625,Flat aconchegante em Boa Viagem,Recife,803,2025-07-01,2025-07-08,4.7,,,2024-6-6,False,,7,2.904716,Econômico,Inverno,0.04202,2025,7,27
6626,Hotel Fit transamérica,Recife,1021,2025-07-01,2025-07-08,7.0,,,2024-6-6,False,,7,3.009026,Econômico,Inverno,0.059142,2025,7,27


In [140]:
# Divisão dos dados
df = pd.get_dummies(df, columns=['hotel', 'season', 'nome_feriado', 'tem_feriado'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['hotel', 'season', 'nome_feriado', 'tem_feriado'], drop_first=True)

X_train = df.drop(['preço', 'checkin', 'cidade', 'checkout', 'data_extracao', 'log_price', 'norm_price', 'avaliacao', 'qtnd_avaliacoes', 'price_category'], axis=1)
y_train = df['preço']

X_test = df_test.drop(['preço', 'checkin', 'cidade', 'checkout', 'data_extracao', 'log_price', 'norm_price', 'avaliacao', 'qtnd_avaliacoes', 'price_category'], axis=1)
y_test = df_test['preço']


In [147]:
# Configuração do MLflow
experiment_name = "Price_Prediction_Experiment"
mlflow.set_experiment(experiment_name)

X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

with mlflow.start_run():
    # Definir o modelo e hiperparâmetros
    model = RandomForestRegressor(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        # 'max_depth': [10, 20, 30],
        # 'min_samples_split': [2, 5, 10],
        # 'min_samples_leaf': [1, 2, 4]
    }
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    # Avaliação do modelo
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Best Parameters: {grid_search.best_params_}')
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')
    print(f'R2: {r2}')
    
    # Logando métricas no MLflow
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    
    # Logando o melhor modelo no MLflow
    mlflow.sklearn.log_model(best_model, "RandomForestRegressor")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'n_estimators': 200}
RMSE: 307.57243327930155
MAE: 143.40695071793624
R2: 0.8884753844464899


In [126]:
model.fit(X_train, y_train)

In [142]:
# Treinamento do modelo
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [143]:
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)
y_pred = model.predict(X_test)
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

RMSE: 308.17330622055186
MAE: 143.6721822422325
R2: 0.8880392102390989


In [146]:
# Previsão de novos preços
nova_data = pd.DataFrame({
    'hotel': ['Aconchegante Apto de 1qt a 60mts do Consulado EUA'],
    'year': [2025],
    'month': [1],
    'week': [1],
    'season': 'Verão',
    'dist_centro': 1,
    'mes_checkin': 1,
    'tem_feriado': True,
    'nome_feriado': 'Ano Novo',
    'mediana_preco': 1000
})
nova_data = pd.get_dummies(nova_data, columns=['hotel'], drop_first=True)
nova_data = nova_data.reindex(columns=X_train.columns, fill_value=0)
precos_previstos = best_model.predict(nova_data)
print(precos_previstos)

[963.55891112]
