### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

### Carregar dataset detalhado (Ames Housing com mais features)

In [15]:
url = 'AmesHousing.csv'
data = pd.read_csv(url, encoding='latin-1').drop('PID', axis=1)
data.columns = [x.replace(' ', '').replace('/', '') for x in data.columns]

In [14]:
[x for x in data.columns if 'year' in x.lower()]

['YearBuilt', 'YearRemod/Add']

### Engenharia de features avançada

In [16]:
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
data['TotalBath'] = data['FullBath'] + 0.5*data['HalfBath'] + data['BsmtFullBath'] + 0.5*data['BsmtHalfBath']
data['Age'] = data['YrSold'] - data['YearBuilt']
data['RecentRemodel'] = (data['YearRemodAdd'] == data['YrSold']).astype(int)

### Pré-processamento complexo

In [18]:
numeric_features = data.select_dtypes(include=np.number).columns.to_list()
categorical_features = data.select_dtypes(exclude=np.number).columns.to_list()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

### Pipeline completo com seleção de features

In [20]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', SelectFromModel(estimator=rf)),
    ('regressor', rf)
])

### Divisão de dados

In [21]:
X = data.drop('SalePrice', axis=1)
y = np.log(data['SalePrice'])   # Transformação logarítmica
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Hiperparâmetros para tuning

In [25]:
param_grid = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__max_features': ['sqrt', 'log2', 0.3],
    'regressor__bootstrap': [True, False]
}

### Busca de hiperparâmetros com validação cruzada

In [26]:
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


ValueError: 
All the 1080 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\pandas\core\indexes\base.py", line 3790, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 152, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 181, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'SalePrice'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_indexing.py", line 364, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\pandas\core\indexes\base.py", line 3797, in get_loc
    raise KeyError(key) from err
KeyError: 'SalePrice'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\compose\_column_transformer.py", line 992, in fit_transform
    self._validate_column_callables(X)
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\compose\_column_transformer.py", line 551, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xgabr\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_indexing.py", line 372, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


### Melhor modelo

In [None]:
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_rf_model.pkl')

### Avaliação detalhada

In [None]:
y_pred = best_model.predict(X_test)
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}')
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f'R²: {r2_score(y_test, y_pred):.4f}')

### Visualizações

In [None]:
plt.figure(figsize=(12, 8))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('Actual vs Predicted Values')
plt.show()

### Feature Importance

In [None]:
preprocessor.fit(X_train)
feature_names = numeric_features + list(best_model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))
importances = best_model.named_steps['regressor'].feature_importances_
indices = np.argsort(importances)[-20:]
plt.figure(figsize=(10, 6))
plt.title('Top 20 Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### Learning Curve

In [None]:
train_sizes, train_scores, test_scores = learning_curve(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.sqrt(-train_scores.mean(1)), 'o-', label='Training')
plt.plot(train_sizes, np.sqrt(-test_scores.mean(1)), 'o-', label='Validation')
plt.xlabel('Training examples')
plt.ylabel('RMSE')
plt.title('Learning Curves')
plt.legend()
plt.show()

### Permutation importance

In [None]:
from sklearn.inspection import permutation_importance
result = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
sorted_idx = result.importances_mean.argsort()[-20:]
plt.figure(figsize=(10, 6))
plt.boxplot(result.importances[sorted_idx].T, vert=False, labels=np.array(feature_names)[sorted_idx])
plt.title('Permutation Importances (test set)')
plt.show()

### Partial dependence plot

In [None]:
from sklearn.inspection import PartialDependenceDisplay
fig, ax = plt.subplots(figsize=(12, 8))
PartialDependenceDisplay.from_estimator(best_model, X_train, ['TotalSF', 'OverallQual'], kind='average', ax=ax)
plt.show()