In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.rcParams["figure.figsize"] = (10,8)


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("../data/two.csv", index_col = 0)
df_test = pd.read_csv("../data/two_test.csv", index_col = 0)
df.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.655738,0.461538,0.333333,-0.754098,-0.78022,-0.75,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,0.508197,0.692308,-0.333333,0.398907,0.417582,0.464286,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.032787,0.0,0.666667,0.010929,0.021978,0.017857,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0.622951,1.076923,0.0,0.464481,0.43956,0.535714,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.557377,0.384615,0.666667,-0.650273,-0.631868,-0.625,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
df.isnull().sum()

id              0
carat           0
depth           0
table           0
x               0
y               0
z               0
price           0
cut_map         0
color_D         0
color_E         0
color_F         0
color_G         0
color_H         0
color_I         0
color_J         0
clarity_I1      0
clarity_IF      0
clarity_SI1     0
clarity_SI2     0
clarity_VS1     0
clarity_VS2     0
clarity_VVS1    0
clarity_VVS2    0
dtype: int64

In [5]:
# Reparto de datos en train y test
# ==============================================================================
# lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X
X = df.drop('price', axis =1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

In [6]:
# Generación del GridSearch
# ==============================================================================

# definimos un diccionario con los hiperparámetros que queremos testear. 
param = {"max_depth": [4, 5, 6, 7,11],
        "min_samples_split": [10, 21,50, 100],
        "max_features": [1,2,3,4,5,6,9]}

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            verbose=3,
            return_train_score = True,
            scoring="neg_mean_squared_error")


In [7]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

In [8]:
# ajustamos el modelo de nuevo

%time
gs.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 10 folds for each of 140 candidates, totalling 1400 fits
[CV 1/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.133, test=-0.130) total time=   0.0s
[CV 2/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.898, test=-0.898) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.339, test=-0.340) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.736, test=-0.760) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.890, test=-0.887) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.267, test=-0.269) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.273, test=-0.276) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=1, min_samples_split=10

In [9]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [10]:
# sacamos cual es el mejor de todos los que hemos testeado usando el método best_estimator_

best_tree = gs.best_estimator_
best_tree

In [11]:
y_pred_test_dt2 = best_tree.predict(X_test)
y_pred_train_dt2 = best_tree.predict(X_train)


In [12]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")

In [13]:
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.115288,0.026445,0.162618,0.974106,test,Decision tree II
1,0.10941,0.021671,0.147209,0.979104,train,Decision tree II


In [14]:
df_test

Unnamed: 0,id,carat,depth,table,x,y,z,cut_map,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.629032,-1.076923,0.333333,-0.707182,-0.683333,-0.741071,4,0,0,...,1,0,0,0,1,0,0,0,0,0
1,1,0.854839,0.769231,1.000000,0.602210,0.566667,0.651786,3,0,0,...,1,0,0,0,1,0,0,0,0,0
2,2,1.532258,0.076923,0.666667,1.016575,1.044444,1.044643,3,1,0,...,0,0,0,0,1,0,0,0,0,0
3,3,0.064516,-1.000000,-0.333333,0.127072,0.100000,0.053571,3,1,0,...,0,0,0,0,0,1,0,0,0,0
4,4,1.274194,0.000000,-0.666667,0.856354,0.794444,1.017857,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.629032,-1.769231,1.000000,0.569061,0.544444,0.419643,3,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,13481,0.306452,0.153846,1.000000,0.237569,0.266667,0.267857,2,1,0,...,0,0,0,0,1,0,0,0,0,0
13482,13482,-0.661290,0.153846,-1.233333,-0.779006,-0.777778,-0.758929,4,0,0,...,0,0,0,0,0,0,0,1,0,0
13483,13483,0.870968,-1.769231,0.666667,0.718232,0.761111,0.598214,4,0,0,...,1,0,0,0,1,0,0,0,0,0


In [15]:
df

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.655738,0.461538,0.333333,-0.754098,-0.780220,-0.750000,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,0.508197,0.692308,-0.333333,0.398907,0.417582,0.464286,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.032787,0.000000,0.666667,0.010929,0.021978,0.017857,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0.622951,1.076923,0.000000,0.464481,0.439560,0.535714,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.557377,0.384615,0.666667,-0.650273,-0.631868,-0.625000,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,-0.459016,0.230769,0.666667,-0.497268,-0.483516,-0.482143,6.551,3,1,...,0,0,0,0,0,1,0,0,0,0
40451,40451,-0.278689,0.153846,0.333333,-0.262295,-0.285714,-0.267857,7.382,3,0,...,0,0,0,0,0,0,0,1,0,0
40452,40452,0.163934,0.769231,0.333333,0.092896,0.109890,0.151786,7.768,1,0,...,0,0,0,0,0,1,0,0,0,0
40453,40453,0.508197,-0.230769,0.000000,0.387978,0.428571,0.392857,8.726,2,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
X_reality_submision = best_tree.predict(df_test)
X_reality_submision

array([6.28280241, 8.56916418, 9.44758559, ..., 6.43198268, 8.64980392,
       7.90077143])

In [17]:
submision1 = pd.DataFrame(X_reality_submision)
submision1.reset_index(inplace=True)
submision1.rename(columns={"index":"id",0:"price"}, inplace =True)
submision1

Unnamed: 0,id,price
0,0,6.282802
1,1,8.569164
2,2,9.447586
3,3,7.888344
4,4,8.935909
...,...,...
13480,13480,8.569164
13481,13481,8.366393
13482,13482,6.431983
13483,13483,8.649804


In [19]:
submision1.to_csv("../data/submission3.csv", index= False)

In [20]:
pd.read_csv("../data/submission3.csv")

Unnamed: 0,id,price
0,0,6.282802
1,1,8.569164
2,2,9.447586
3,3,7.888344
4,4,8.935909
...,...,...
13480,13480,8.569164
13481,13481,8.366393
13482,13482,6.431983
13483,13483,8.649804
