In [1]:
import pandas as pd
import numpy as np
import joblib
import logging
import time

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

import mlflow

In [4]:
# One-time setup: uncomment to install xgboost into the kernel's environment.
# %pip install xgboost

# 1.- Configuración de Modelos

In [16]:
# Candidate regressors to compare. The key is the display name that appears in
# the training log lines and later becomes the name of the final pipeline step.
# Insertion order matters: it is the evaluation order of the training loop.
model_configurations = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(
        n_estimators=200,
        random_state=2026,
    ),
    'GradientBoosting': GradientBoostingRegressor(
        random_state=2024,
    ),
    'SVR': SVR(
        kernel='rbf',
        C=10,
        epsilon=0.1,
    ),
    'XGBoost': XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        random_state=2026,
    ),
}

## 1.1 Agregamos configuración de conexión con MLflow

In [2]:
# Point the MLflow client at the tracking server and select the experiment to
# work under. set_experiment() stays the last expression so the cell still
# displays the resulting Experiment object.
tracking_server_uri = "http://127.0.0.1:5000"
experiment_name = "Regression Technics for House Pricing"

mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_experiment(experiment_name)

2025/11/22 15:05:16 INFO mlflow.tracking.fluent: Experiment with name 'Regression Technics for House Pricing' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/816166779378113440', creation_time=1763845516296, experiment_id='816166779378113440', last_update_time=1763845516296, lifecycle_stage='active', name='Regression Technics for House Pricing', tags={}>

# 2.- Configuración para Logging

In [17]:
# Route INFO-and-above records from the root logger to ml_system.log (append
# mode, UTF-8), formatted as "<timestamp>-<level>-<message>".
#
# force=True removes any handlers already attached to the root logger before
# installing the file handler. Without it, basicConfig() silently does nothing
# when the root logger already has a handler (for example one installed by a
# library or by an earlier configuration in the same kernel), and the training
# messages would never reach the log file.
logging.basicConfig(
    filename="ml_system.log",
    filemode="a",
    encoding="utf-8",
    level=logging.INFO,
    format="{asctime}-{levelname}-{message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    force=True,
)

# 3.- Entrenamiento y selección de Modelo Ganador

In [18]:
# Model selection: score every candidate with 10-fold cross-validation and
# keep its mean RMSE in `results` (model name -> RMSE).
# NOTE(review): the input is presumably the already pre-processed training set
# (file name proc_data_train.csv) - confirm against the notebook that writes it.
dataset = pd.read_csv('../data/interim/proc_data_train.csv')
y = dataset['SalePrice']
X = dataset.drop(columns='SalePrice')

results = {}
logging.info("----Iniciando Entrenamiento, ENTRENAMIENTO")
start = time.time()

for model_name, model in model_configurations.items():
    # The scorer returns RMSE negated (so that higher is better); abs() turns
    # the averaged score back into a positive error value.
    fold_scores = cross_val_score(
        model,
        X,
        y,
        scoring='neg_root_mean_squared_error',
        cv=10,
    )
    rmse_mean = np.round(np.abs(fold_scores.mean()), 2)
    results[model_name] = rmse_mean

    # Same line goes to the notebook output and to the log file.
    summary_line = f"RMSE Promedio de modelo {model_name}: {rmse_mean}, ENTRENAMIENTO"
    print(summary_line)
    logging.info(summary_line)

finish = time.time()
elapsed_seconds = finish - start
logging.info(f"Tiempo de Entrenamiento: {elapsed_seconds}, ENTRENAMIENTO")
    

RMSE Promedio de modelo LinearRegression: 33179.96, ENTRENAMIENTO
RMSE Promedio de modelo RandomForestRegressor: 32847.34, ENTRENAMIENTO
RMSE Promedio de modelo GradientBoosting: 33296.7, ENTRENAMIENTO
RMSE Promedio de modelo SVR: 77786.61, ENTRENAMIENTO
RMSE Promedio de modelo XGBoost: 33655.16, ENTRENAMIENTO


In [19]:
# The winner is the candidate with the smallest mean cross-validated RMSE.
# On a tie, the candidate that was evaluated first is kept.
best_model_name = min(results.items(), key=lambda name_and_rmse: name_and_rmse[1])[0]
best_model_config = model_configurations[best_model_name]

# 4.- Agregamos Modelo a Pipeline

In [20]:
# Load the persisted pre-processing pipeline and attach the winning estimator
# as its last step, named after the model. Reloading from disk on every run
# keeps this cell from appending a second estimator when it is re-executed.
preprocessing_pipeline_path = '../models/house_prices_pre_proc_pipeline.pkl'
house_prices_pipeline = joblib.load(preprocessing_pipeline_path)

estimator_step = (best_model_name, best_model_config)
house_prices_pipeline.steps.append(estimator_step)

### Reentrenamos el Pipeline con el Modelo Ganador

In [28]:
# Refit the complete pipeline (pre-processing steps + winning model) starting
# from the raw training file.
raw_train_path = '../data/raw/train.csv'
object_typed_columns = ['MSSubClass', 'GarageCars', 'BsmtFullBath']

dataset = pd.read_csv(raw_train_path)
# Cast these columns to object dtype before they enter the pipeline.
dataset = dataset.astype({column: 'O' for column in object_typed_columns})

y = dataset['SalePrice']
X = dataset.drop(columns=["Id", "SalePrice"])

# 70/30 shuffled split with a fixed seed; only the training part is used to
# fit the pipeline in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2028,
)
house_prices_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('drop_features', ...), ('cat_missing_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features_to_drop,"['GarageType', 'Street', ...]"

0,1,2
,imputation_method,'missing'
,fill_value,'Missing'
,variables,['FireplaceQu']
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'frequent'
,fill_value,'Missing'
,variables,"['BsmtQual', 'BsmtExposure', ...]"
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'mean'
,variables,"['LotFrontage', 'GarageArea']"

0,1,2
,variables,"['ExterQual', 'BsmtQual', ...]"
,mappins,"{'Ex': 5, 'Fa': 2, 'Gd': 4, 'Missing': 0, ...}"

0,1,2
,variables,['BsmtExposure']
,mappins,"{'Av': 3, 'Gd': 4, 'Mn': 2, 'No': 1}"

0,1,2
,variables,['GarageFinish']
,mappins,"{'Fin': 3, 'Missing': 0, 'NA': 0, 'RFn': 2, ...}"

0,1,2
,variables,['BsmtFinType1']
,mappins,"{'ALQ': 5, 'BLQ': 4, 'GLQ': 6, 'LwQ': 2, ...}"

0,1,2
,encoding_method,'count'
,variables,"['MSZoning', 'LotShape', ...]"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,variables,"['LotFrontage', '1stFlrSF', ...]"
,base,'e'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Persist the fitted end-to-end pipeline. dump() stays the last expression so
# the cell displays the list of written file names.
fitted_pipeline_path = '../models/house_prices_pipeline.pkl'
joblib.dump(house_prices_pipeline, fitted_pipeline_path)

['../models/house_prices_pipeline.pkl']