In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [11]:
# Load the encoded training data (features plus the `price` target) and
# preview its dimensions and first rows.
train_csv = '../data/train_data_encoded_full.csv'
df = pd.read_csv(train_csv)
print('shape:', df.shape)
df.head()

shape: (39829, 7)


Unnamed: 0,weight,quality,color,clarity,table_percent,price,depth_percent
0,0.3,3,6,1,0.58,1.848927,0.624
1,1.01,4,5,5,0.56,2.217354,0.627
2,0.72,4,4,3,0.59,2.077314,0.618
3,1.08,2,3,1,0.57,2.124773,0.632
4,0.36,3,3,4,0.59,1.88525,0.623


In [12]:
# Split the data into train and test sets: `price` is the target, every
# other column is a feature.
y = df['price']
X = df.drop(columns='price')

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Compare the target's summary statistics across the two partitions.
rule = "-----------------------"
print("Datos de entrenamiento", rule, sep="\n")
print(y_train.describe())
print('\n')
print("Datos de testeo", rule, sep="\n")
print(y_test.describe())

Datos de entrenamiento
-----------------------
count    31863.000000
mean         2.042597
std          0.131438
min          1.755614
25%          1.923080
50%          2.050270
75%          2.149201
max          2.286659
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7966.000000
mean        2.042922
std         0.131004
min         1.762331
25%         1.924687
50%         2.051106
75%         2.148734
max         2.286659
Name: price, dtype: float64


In [13]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Summarise regression errors on the test and train splits.

    Parameters
    ----------
    y_test, y_train
        True target values of the test and train splits.
    y_test_pred, y_train_pred
        Predictions for the corresponding split.
    tipo_modelo
        Label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Two rows (test first, then train) with the columns
        ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
    """
    rows = []
    for split_name, actual, predicted in (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    ):
        # RMSE is the square root of the MSE, so the MSE is computed once.
        mse = metrics.mean_squared_error(actual, predicted)
        rows.append({
            'MAE': metrics.mean_absolute_error(actual, predicted),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': metrics.r2_score(actual, predicted),
            "set": split_name,
        })

    summary = pd.DataFrame(rows)
    summary["modelo"] = tipo_modelo
    return summary

In [14]:
# Gradient-boosted regressor ("Gradient 3" configuration), fitted on the
# training partition only. `fit` returns the estimator itself, so the
# call is chained onto the constructor.
gb = GradientBoostingRegressor(
    loss='squared_error',
    criterion='friedman_mse',
    n_estimators=125,
    learning_rate=0.15,
    max_depth=6,
    random_state=0,
).fit(X_train, y_train)

# Predict on both partitions so train and test errors can be compared.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_3 = metricas(
    y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient 3"
)
results_gradient_3.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009426,0.000159,0.012613,0.990729,test,Gradient 3
1,0.008798,0.000137,0.01172,0.992049,train,Gradient 3


In [15]:
# Load the encoded competition test set and display it.
test_csv = '../data/test_data_encoded.csv'
df_test = pd.read_csv(test_csv)
df_test

Unnamed: 0,weight,quality,color,clarity,table_percent,depth_percent
0,0.32,4,1,2,0.580,0.605
1,1.24,3,1,2,0.600,0.629
2,1.66,3,6,2,0.590,0.620
3,0.75,3,6,1,0.560,0.606
4,1.50,0,5,1,0.550,0.648
...,...,...,...,...,...,...
13480,1.10,3,3,2,0.600,0.596
13481,0.90,2,6,2,0.600,0.621
13482,0.30,4,4,3,0.533,0.621
13483,1.25,4,1,2,0.590,0.596


In [16]:
# Predict on the competition test set to build the submission.
y_pred_gb_test = gb.predict(df_test)
print('shape:', y_pred_gb_test.shape)

# One row per prediction: the positional index 0..n-1 becomes the `id`
# column and the model output becomes `price`.
submission_3 = (
    pd.Series(y_pred_gb_test, name='price')
    .rename_axis('id')
    .reset_index()
)
submission_3

shape: (13485,)


Unnamed: 0,id,price
0,0,1.809259
1,1,2.150975
2,2,2.253917
3,3,2.063917
4,4,2.192364
...,...,...
13480,13480,2.145197
13481,13481,2.127176
13482,13482,1.863325
13483,13483,2.163392


In [17]:
# Back-transform the predictions with exp, starting from the raw model
# output rather than from the current contents of submission_3['price'].
# Updating the column from itself would apply exp once more on every
# re-execution of this cell; recomputing from the model keeps the cell
# idempotent. np.exp is applied to the whole array at once instead of
# element by element.
# NOTE(review): after a single exp the displayed prices are still small
# (roughly 6-10) - confirm this is the scale the competition expects.
submission_3['price'] = np.exp(gb.predict(df_test))
submission_3

Unnamed: 0,id,price
0,0,6.105923
1,1,8.593234
2,2,9.524976
3,3,7.876766
4,4,8.956363
...,...,...
13480,13480,8.543726
13481,13481,8.391136
13482,13482,6.445129
13483,13483,8.700597


In [18]:
# Write the submission table to disk, leaving out the DataFrame index.
submission_csv = '../output/submission_3_fs.csv'
submission_3.to_csv(submission_csv, index=False)