# Notebook de mejora

Este notebook es un ejemplo de cómo se pueden mejorar los hiperparámetros de un algoritmo y medir la importancia de los atributos en la competición https://www.kaggle.com/c/house-prices-advanced-regression-techniques/ de la práctica 3 de Inteligencia de Negocio.

Incluyo el código del notebook anterior, que no se explicará de nuevo.

Es un ejemplo claramente mejorable; se deja al/a la estudiante la tarea de mejorarlo para obtener mejores resultados.

## Código del Notebook anterior

Ahora incluyo todo el código del otro Notebook para poder ejecutarlo fácilmente (muy mejorable).

In [1]:
import pandas as pd
import numpy as np

# Load the training set. na_values is set so that missing values are
# identified correctly; the 'Id' column is removed when it is present.
train = pd.read_csv("train.csv", na_values="NaN").drop(columns="Id", errors="ignore")

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Load the test set and keep its Id column aside in test_ids before dropping it.
test = pd.read_csv("test.csv", na_values="NaN")
test_ids = test["Id"]
test = test.drop(columns="Id")

# Predictor columns of train (target removed) and test stacked into one frame.
input_all = pd.concat([train.drop(columns="SalePrice"), test])

# Names of all non-numeric columns of the combined frame.
col_cat = input_all.select_dtypes(exclude=np.number).columns.tolist()

from sklearn.impute import SimpleImputer

# Categorical columns: fill missing values with the most frequent value,
# learned from the combined train+test frame (input_all).
imputer_cat = SimpleImputer(strategy="most_frequent").fit(input_all[col_cat])
for frame in (train, test):
    frame[col_cat] = imputer_cat.transform(frame[col_cat])

# Numeric columns (target excluded): fill missing values with the median
# (strategy="median", not the mean), learned from input_all.
col_num = train.drop(columns="SalePrice").select_dtypes(include=np.number).columns.tolist()
imputer_num = SimpleImputer(strategy="median").fit(input_all[col_num])
train[col_num] = imputer_num.transform(train[col_num])
test[col_num] = imputer_num.transform(test[col_num])

from sklearn.preprocessing import LabelEncoder
# Integer-encode every categorical column, keeping one fitted encoder per
# column in `labelers` and writing the codes into copies of train/test.
#
# Each encoder is fitted on the already-imputed train and test values taken
# together, so both sets share a single mapping. Fitting on the un-imputed
# input_all would expose the encoder to NaN mixed with strings, which
# (depending on the scikit-learn version) either raises a TypeError or adds
# a spurious NaN class. Imputation only replaces NaN, so the set of real
# categories — and therefore the resulting codes — is the same.
labelers = {}
test_l = test.copy()
train_l = train.copy()

for col in col_cat:
    labelers[col] = LabelEncoder().fit(pd.concat([train[col], test[col]]))
    test_l[col] = labelers[col].transform(test[col])
    train_l[col] = labelers[col].transform(train[col])


# Separate predictors and target of the encoded training frame.
X_train = train_l.drop(columns="SalePrice")
y_train = train_l["SalePrice"]

# Defensive: remove an 'Id' column from the encoded test frame if one is present.
test_l = test_l.drop(columns="Id", errors="ignore")
X_test = test_l

## Aplico modelo

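Voy a aplicar un modelo más completo: un ensemble de tipo *stacking* que combina Random Forest, XGBoost, KNN y Gradient Boosting como modelos base, con un Random Forest como metamodelo.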

In [8]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import make_regression


# Final estimator (meta-model) that is given to the stacking regressor.
meta_model = RandomForestRegressor(
    criterion="squared_error",
    max_depth=20,
    random_state=42,
)

# Named base estimators of the stack.
base_models = [
    (
        "random_forest",
        RandomForestRegressor(criterion="squared_error", max_depth=20, random_state=42),
    ),
    ("xgboost", XGBRegressor(objective="reg:squarederror", random_state=42)),
    ("knn", KNeighborsRegressor()),
    ("gradient_boosting", GradientBoostingRegressor(random_state=42)),
]

# Stacking ensemble: base estimators plus the meta-model on top.
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
)

# 10-fold cross-validation, shuffled with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=124345)

# Negative mean squared log error of the stacking regressor on each fold.
values = cross_val_score(
    estimator=stacking_regressor,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_log_error",
    cv=cv,
)

# Per-fold scores on one line, their mean on the next.
print(values, values.mean(), sep="\n")



[-0.01228728 -0.02436021 -0.01702909 -0.01561671 -0.01574254 -0.02450135
 -0.02391974 -0.01861379 -0.02071716 -0.01553109]
-0.018831897073152422


In [10]:
# Train the stacking ensemble on the full training predictors and target.
stacking_regressor.fit(X=X_train, y=y_train)

In [11]:
# Predict the target for the prepared test predictors.
pred = stacking_regressor.predict(X=X_test)

In [12]:
# Build the output table (test Ids next to the predicted SalePrice) and
# write it to CSV without the index.
salida = pd.DataFrame({"Id": test_ids}).assign(SalePrice=pred)
salida.to_csv("Prueba3.csv", index=False)