Ronald Ortiz y Alejandro Guzman

# Entrenamiento con el Dataset de Train

In [269]:
# some imports
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rc('font', size=12) 
plt.rc('figure', figsize = (12, 5))

# Settings for the visualizations
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2,'font.family': [u'times']})

import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# create output folder
if not os.path.exists('output'):
    os.makedirs('output')
if not os.path.exists('output/session1'):
    os.makedirs('output/session1')

  from IPython.core.display import display, HTML


In [270]:
housing = pd.read_csv('dataset/train_set.csv',index_col=0) 

price_cat = pd.cut(housing["Price"],
                               bins=[0., 500000, 1000000, 1500000, 2000000., np.inf],
                               labels=[1, 2, 3, 4, 5])
housing['Postcode'] = pd.Categorical(housing.Postcode)
housing['Postcode'] = housing['Postcode'].astype(str)

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, price_cat):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
housing = strat_train_set.drop("Price", axis=1) # drop labels for training set
housing_num = housing.select_dtypes(include=[np.number])
housing_labels = strat_train_set["Price"].copy()

In [271]:
from sklearn.base import BaseEstimator, TransformerMixin
values = []
# column index
Rooms_ix, Bedroom2_ix, Bathroom_ix, BuildingArea_ix = 0, 2, 3, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        rooms_per_building_area = X[:, Rooms_ix] / (1.0 +X[:, BuildingArea_ix])# add 1 to avoid 0 division
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, Bedroom2_ix] / (1.0 + X[:, Bathroom_ix]) # add 1 to avoid 0 division
            return np.c_[X, rooms_per_building_area, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_building_area]

In [272]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## create a function to replace 0 by NaN
def replace_0_2_NaN(data):
    data[data == 0] = np.nan
    return data


num0_pipeline = Pipeline([
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ('imputer', SimpleImputer(strategy="median")),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
    ])

num_pipeline = Pipeline([
        #('imputer', SimpleImputer(strategy="median")),
        ('imputer', IterativeImputer(random_state=42)),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

from category_encoders import TargetEncoder, CatBoostEncoder, JamesSteinEncoder


cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant", fill_value='Unknown')),
        ('cat_boost_encoder', CatBoostEncoder()),
    ])


In [273]:
from sklearn.compose import ColumnTransformer

num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
cat_attribs = ['Type', 'Regionname', 'CouncilArea', 'Suburb','Postcode','SellerG']


full_pipeline = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
    ])
housing_prepared = full_pipeline.fit_transform(housing, housing_labels)

In [274]:
from scipy.sparse import csr_matrix
housing_prepared.shape
drop_columns2 = []
columns2 = []
for i in range(housing_prepared.shape[1]):
    data1 = housing_prepared[:,i]
    data2 = housing_labels.values
    mean1 = data1.mean() 
    mean2 = data2.mean()
    std1 = data1.std()
    std2 = data2.std()
    corr = ((data1*data2).mean()-mean1*mean2)/(std1*std2)
    if abs(corr) < 0.1: # 0.1 es el punto de inflexión aprox
        drop_columns2.append(i)
    else:
        columns2.append(i)
housing_prepared =np.delete(housing_prepared, drop_columns2, axis=1)  #Comentar si no se quiere eliminar datos con poca correlación

In [275]:
housing_prepared = csr_matrix(housing_prepared)
housing_prepared

<4345x18 sparse matrix of type '<class 'numpy.float64'>'
	with 78210 stored elements in Compressed Sparse Row format>

In [276]:
## Let's try another model: Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    #{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8], 'max_depth':[3,5,7,10]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [110], 'max_features': [6,7], 'min_samples_split': [3,5], 'min_samples_leaf': [1]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_root_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
print(grid_search.best_params_)

{'bootstrap': False, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 110}


In [277]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("Price", axis=1)
y_test = strat_test_set["Price"].copy()

X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared = np.delete(X_test_prepared, drop_columns2, axis=1) #Comentar si no se quiere eliminar datos con poca correlación
X_test_prepared = csr_matrix(X_test_prepared)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [278]:
full_pipeline = full_pipeline.fit(housing, housing_labels)
housing_prepared = full_pipeline.transform(housing)
X_test_prepared = full_pipeline.transform(X_test)

In [279]:
print(final_rmse)

from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

290821.4477420408


array([257409.41406214, 320771.91661014])

# Prueba con el Dataset de Test

In [280]:
housing = pd.read_csv('dataset/train_set.csv',index_col=0) 
test = pd.read_csv('dataset/test_set.csv/test_set.csv',index_col=0) 

housing['Postcode'] = pd.Categorical(housing.Postcode)
housing['Postcode'] = housing['Postcode'].astype(str)
test['Postcode'] = pd.Categorical(test.Postcode)
test['Postcode'] = test['Postcode'].astype(str)

housing_labels = housing["Price"].copy()
housing = housing.drop("Price", axis=1) 

In [281]:
#cat_attribs = ['Type', 'CouncilArea', 'Suburb', 'Postcode']
cat_attribs = ['Type', 'Regionname', 'CouncilArea', 'Suburb','Postcode','SellerG']
full_pipeline = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
    ])
full_pipeline = full_pipeline.fit(housing, housing_labels)
housing_prepared = full_pipeline.transform(housing)



In [282]:
drop_columns = []
columns = []
for i in range(housing_prepared.shape[1]):
    data1 = housing_prepared[:,i]
    data2 = housing_labels.values
    mean1 = np.mean(data1) 
    mean2 = np.mean(data2)
    std1 = np.std(data1)
    std2 = np.std(data2)
    corr = ((data1*data2).mean()-mean1*mean2)/(std1*std2)
    if abs(corr) < 0.1:
        drop_columns.append(i)

In [283]:
housing_prepared =np.delete(housing_prepared, drop_columns, axis=1)
housing_prepared = csr_matrix(housing_prepared)

In [310]:
param_grid = [
    {'bootstrap': [False], 'n_estimators': [125], 'max_features': [4], 'min_samples_split': [2], 'min_samples_leaf': [2]},
  ]

In [311]:
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_root_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
print(grid_search.best_params_)

{'bootstrap': False, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 125}


In [312]:
test_prepared = full_pipeline.transform(test)
test_prepared = np.delete(test_prepared, drop_columns, axis=1) #Comentar si no se quiere eliminar datos con poca correlación
test_prepared = csr_matrix(test_prepared)
final_model = grid_search.best_estimator_
y_pred = final_model.predict(test_prepared)

df_output = pd.DataFrame(y_pred)
df_output = df_output.reset_index()
df_output.columns = ['index','Price']

df_output.to_csv('output/session1/baseline.csv',index=False)

Explicación de los cambios realizados:

En primera parte se ha realizado diversas mejoras en el postprocesamiento de los datos:

- **CatBoost Encoder**: Se ha definido un CatBoostEncoder para realizar la codificación basada en el objetivo de las variables categóricas. La clase realiza una codificación consciente del tiempo, con regularización y aprendizaje en línea, de cada categoría única en la matriz de entrada X, y reemplaza cada categoría con un valor calculado basado en el objetivo. Finalmente, se eliminan las columnas originales. Este enfoque mejora el one-hot encoder estándar al reducir la dimensionalidad del dataset y al permitir la captura de relaciones entre las categorías y la variable objetivo, especialmente en datasets con un gran número de categorías únicas.

- **Columnas poco correlacionadas**: Se ha añadido un segmento de código que calcula el coeficiente de correlación de Pearson entre cada columna de housing_prepared y housing_labels, identificando las columnas con una correlación débil (menor a 0.1) para posiblemente eliminarlas. Este enfoque puede mejorar la eficiencia del modelo al reducir la dimensionalidad y eliminar características no informativas. En la práctica hemos observado que ha mejorado bastante el rendimiento.

- **Iterative Imputer en variables numéricas**: El Iterative Imputer es generalmente mejor que el Single Imputer para variables numéricas porque, en lugar de imputar valores faltantes utilizando una estadística simple como la media o la mediana, utiliza modelos predictivos para estimar los valores faltantes considerando las relaciones entre todas las variables. Esto puede resultar en imputaciones más precisas y realistas.

Finalmente se ha realizado un reajuste de los hiperparámetros del modelo de Random Forest para obtener un mayor rendimiento:

- En primer lugar se han probado varios hiperparámetros tanto para Bootstrap=False como Boostrap=True pero en el 100% de los casos la función GridSearchCV ha encontrado un menor error cuadrático con el Bootstrap=False.
- Los mejores hiperparámetros encontrados han sido:

    **One Hot Encoder**: {'bootstrap': [False], 'n_estimators': [125], 'max_features': [10], 'min_samples_split': [4], 'min_samples_leaf': [2]}
    
    **Cat Boost Encoder**: {'bootstrap': [False], 'n_estimators': [125], 'max_features': [4], 'min_samples_split': [2], 'min_samples_leaf': [2]}

**PUNTUACIÓN FINAL** Implementando Cat Boost Encoder (297500) se ha obtenido un mejor private score en el Kaggle y con One Hot Encoder se ha obtenido un mejor public score (279111).

- *Puntuación media One Hot Encoder*: 290.000
- *Puntuación media CatBoost Encoder*: 289.500