In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [7]:
df = pd.read_csv('../data/train_data_nominal_encoded_full.csv')
print('shape:', df.shape)
df.head(5)

shape: (39829, 24)


Unnamed: 0,weight,table_percent,price,depth_percent,quality_EN_Fair,quality_EN_Good,quality_EN_Ideal,quality_EN_Premium,quality_EN_Very Good,color_EN_D,...,color_EN_I,color_EN_J,clarity_EN_I1,clarity_EN_IF,clarity_EN_SI1,clarity_EN_SI2,clarity_EN_VS1,clarity_EN_VS2,clarity_EN_VVS1,clarity_EN_VVS2
0,0.3,0.58,1.848927,0.624,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1,1.01,0.56,2.217354,0.627,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.72,0.59,2.077314,0.618,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.08,0.57,2.124773,0.632,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0.36,0.59,1.88525,0.623,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
#split dara train and test
X = df.drop(["price"], axis = 1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

print("Datos de entrenamiento")
print("-----------------------")
print(y_train.describe())
print('\n')
print("Datos de testeo")
print("-----------------------")
print(y_test.describe())

Datos de entrenamiento
-----------------------
count    31863.000000
mean         2.042597
std          0.131438
min          1.755614
25%          1.923080
50%          2.050270
75%          2.149201
max          2.286659
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7966.000000
mean        2.042922
std         0.131004
min         1.762331
25%         1.924687
50%         2.051106
75%         2.148734
max         2.286659
Name: price, dtype: float64


In [9]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [10]:
param = {"max_depth": [18],
        "max_features": [15],
        "min_samples_split": [10]}

random_forest = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param,
            cv=10,
            n_jobs = -1,
            verbose= 2,
            return_train_score = True,
            scoring="neg_mean_squared_error")

random_forest.fit(X_train, y_train)

best_rf = random_forest.best_estimator_
print(best_rf)

y_pred_test_rf = random_forest.predict(X_test)
y_pred_train_rf = random_forest.predict(X_train)
rf_results_1 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest 1")
rf_results_1.style.background_gradient(cmap='coolwarm')

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.0min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.1min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.1min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.1min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.1min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.1min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.2min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 4.2min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 2.6min
[CV] END max_depth=18, max_features=15, min_samples_split=10; total time= 2.6min
RandomForestRegressor(max_depth=18, max_features=15, min_samples_split=10)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.010055,0.000181,0.013442,0.989471,test,Random Forest 1
1,0.00791,0.000113,0.010611,0.993483,train,Random Forest 1


In [11]:
param = {"max_depth": [15],
        "max_features": [12],
        "min_samples_split": [5]}

random_forest = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param,
            cv=10,
            n_jobs = -1,
            verbose= 1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

random_forest.fit(X_train, y_train)

best_rf = random_forest.best_estimator_
print(best_rf)

y_pred_test_rf = random_forest.predict(X_test)
y_pred_train_rf = random_forest.predict(X_train)
rf_results_2 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest 2")
rf_results_2.style.background_gradient(cmap='coolwarm')

Fitting 10 folds for each of 1 candidates, totalling 10 fits
RandomForestRegressor(max_depth=15, max_features=12, min_samples_split=5)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.010374,0.00019,0.013773,0.988946,test,Random Forest 2
1,0.008553,0.000128,0.011323,0.992578,train,Random Forest 2


In [12]:
param = {"max_depth": [20],
        "max_features": [17],
        "min_samples_split": [12]}

random_forest = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param,
            cv=10,
            n_jobs = -1,
            verbose= 1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

random_forest.fit(X_train, y_train)

best_rf = random_forest.best_estimator_
print(best_rf)

y_pred_test_rf = random_forest.predict(X_test)
y_pred_train_rf = random_forest.predict(X_train)
rf_results_3 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest 3")
rf_results_3.style.background_gradient(cmap='coolwarm')

Fitting 10 folds for each of 1 candidates, totalling 10 fits
RandomForestRegressor(max_depth=20, max_features=17, min_samples_split=12)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.010061,0.000181,0.013464,0.989436,test,Random Forest 3
1,0.007861,0.000111,0.010558,0.993548,train,Random Forest 3


In [13]:
df_results = pd.concat([rf_results_1, rf_results_2, rf_results_3], axis = 0)
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.010055,0.000181,0.013442,0.989471,test,Random Forest 1
1,0.00791,0.000113,0.010611,0.993483,train,Random Forest 1
0,0.010374,0.00019,0.013773,0.988946,test,Random Forest 2
1,0.008553,0.000128,0.011323,0.992578,train,Random Forest 2
0,0.010061,0.000181,0.013464,0.989436,test,Random Forest 3
1,0.007861,0.000111,0.010558,0.993548,train,Random Forest 3
