In [41]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.rcParams["figure.figsize"] = (10,8)


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [42]:
df = pd.read_csv("../data/one.csv", index_col = 0)
df_test = pd.read_csv("../data/one_test.csv", index_col = 0)
df.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.625,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,0.484375,0.6,-0.333333,0.398907,0.417582,0.464286,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.03125,0.0,0.666667,0.010929,0.021978,0.017857,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0.59375,0.933333,0.0,0.464481,0.43956,0.535714,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.53125,0.333333,0.666667,-0.650273,-0.631868,-0.625,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0


In [43]:
df.isnull().sum()

id              0
carat           0
depth           0
table           0
x               0
y               0
z               0
price           0
cut_map         0
color_D         0
color_E         0
color_F         0
color_G         0
color_H         0
color_I         0
color_J         0
clarity_I1      0
clarity_IF      0
clarity_SI1     0
clarity_SI2     0
clarity_VS1     0
clarity_VS2     0
clarity_VVS1    0
clarity_VVS2    0
dtype: int64

In [44]:
class Ajuste_modelo_lineal:
    
    
    def __init__(self, dataframe, variable_respuesta):
        """ 
        inicializamos la clase con el dataframe y la variable respuesta
        """
        self.dataframe = dataframe # variable que contiene el dataframe
        self.variable_respuesta = variable_respuesta # variable que contiene la variable respuesta del modelo
        
    def separar_datos(self):

        """
        Esta función separa los datos en train y test y devuelve los 4 datasets listos para ser usados en el modelo
        Returns:
            _type_: X_train, X_test, y_train, y_test son las variables son las variables predicotroas y las variables respuesta necesarias para el entrenamiento del modelo
        """
        # lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X

        X = self.dataframe.drop(self.variable_respuesta, axis =1) # seleccion de variables predictoras
        y = self.dataframe[self.variable_respuesta]               # seleccion de la variable respuesta
        
        # dividimos los datos en train y test con un 80% de train y un 20% de test, Recordamos que el random state es una semilla que nos permite reproducir los resultados 
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)
        
        return X_train, X_test, y_train, y_test
    
    
    def gridsearch(self, tipo_modelo, X_test, X_train, y_test, y_train, modelo = DecisionTreeRegressor()):
        """
        Esta función realiza un gridsearch sobre el modelo que le pasemos y nos devuelve el mejor modelo con los mejores hiperparametros
        Args:
            tipo_modelo (_type_):  modelo que deseamos optimizar 
            X_test (_type_): variables predictoras de test
            X_train (_type_): variables predictoras de train
            y_test (_type_): variable respuesta de test
            y_train (_type_): variable respuesta de train
            modelo (_type_, optional): _description_. Defaults to DecisionTreeRegressor().

        Returns:
            df: df con las metricas del modelo
        """
        profundidad = int(input("Cual es la profundidad máxima que quieres"))
        features = int(input("¿Cual es el nº de features maximo que quieres?"))
        leaf = int(input("¿Cual es el min_sample_leaf que quieres?"))
        split = int(input("¿Cual es el min_samples_split que quieres?"))
        
        param = {"max_depth": range(1, profundidad +1, 2),
                "min_samples_split": range(1, split +1, 2),
                "min_samples_leaf": range(1, leaf +1, 2),
                "max_features": range(1, features + 1, 2)}

        gs = GridSearchCV(
                    estimator = modelo,
                    param_grid= param,
                    cv=10,
                    verbose = 0,
                    return_train_score = True,
                    scoring="neg_mean_squared_error")
        gs.fit(X_train, y_train)
        
        self.best_tree = gs.best_estimator_
        print(f"el mejor arbol es {self.best_tree}")
        
        y_pred_test_dt2 = self.best_tree.predict(X_test)
        y_pred_train_dt2 = self.best_tree.predict(X_train)
        dt_results2 = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, tipo_modelo)
        return dt_results2
    
    def ajuste_modelo(self, X_test, X_train, y_test, y_train):
        """
        Esta función realiza el ajuste del modelo y nos devuelve las metricas del modelo
        Args:
            X_test (_type_): variables predictoras de test
            X_train (_type_): variables predictoras de train
            y_test (_type_): variable respuesta de test
            y_train (_type_): variable respuesta de train
        """
        
        self.X_test = X_test
        self.X_train = X_train
        self.y_test = y_test
        self.y_train = y_train
        
        # iniciamos el método de Linear Regression
        
        tipo_modelo = input("Que modelo quieres hacer? 1: Regresion Lineal, 2: Decision Tree, 3: Random Forest")

        if tipo_modelo == "1":
            
            lr = LinearRegression()
            
            # fiteamos el modelo
            lr.fit(X_train, y_train)

            
            # hacemos las predicciones sobre los dos set de datos el X_test y el X_train
            y_pred_test = lr.predict(X_test)
            y_pred_train = lr.predict(X_train)
            
            lr_results = self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Regresion lineal")
            
            return lr_results
            
            
        elif tipo_modelo == "2":
            # creamos el objeto del árbol
            regressor = DecisionTreeRegressor(random_state = 0) 
            
            # ajustamos el modelo
            regressor.fit(X_train, y_train)
            
            # hacemos las predicciones sobre los dos set de datos el X_test y el X_train
            y_pred_test = regressor.predict(X_test)
            y_pred_train = regressor.predict(X_train)
            
            dt_results = self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Decision Tree")
            print("Las metricas del modelo son: ")
            display(dt_results)
            
            
            nuevo_modelo = input("¿quieres hacer un modelo nuevo: S/N?")
            
            if nuevo_modelo.upper() == "N":
                return dt_results
            else:
                parametros = regressor.get_params()
                claves_deseadas = ['max_depth', 'max_features', 'min_samples_leaf', 'min_samples_split' ]
                valores_deseados = {clave: parametros[clave] for clave in claves_deseadas}
                print(f"Los principales hiperparametros del modelo son: {valores_deseados}")

                nuevo_modelo = self.gridsearch("Decision Tree II",  X_test, X_train, y_test, y_train)
                print("Las nuevas metricas del modelo son: ")
                display(nuevo_modelo)

        elif tipo_modelo == "3":
            random_forest = self.gridsearch("Random Forest", X_test, X_train, y_test, y_train, RandomForestRegressor())
            display(random_forest)
        
    
    def metricas(self, y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
        """
        Esta función nos devuelve las metricas del modelo en un dataframe para poder compararlas con otros modelos

        Args:
            y_test (_type_): variable respuesta de test 
            y_train (_type_): variable respuesta de train
            y_test_pred (_type_): variable respuesta predicha de test
            y_train_pred (_type_): variable respuesta predicha de train
            tipo_modelo (_type_): tipo de modelo que estamos haciendo (regresion lineal, decision tree, random forest)

        Returns:
            _type_: df con las metricas del modelo en test y train
        """
    
    
        resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                    'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                    'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                    'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                    "set": ["test", "train"]}
        df_metricas = pd.DataFrame(resultados)
        df_metricas["modelo"] = tipo_modelo
        return df_metricas
        

In [45]:
modelo = Ajuste_modelo_lineal(df, "price")

In [46]:
X_entrena, X_testear, y_entrena, y_testear = modelo.separar_datos()

In [47]:
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.112884,0.033559,0.183192,0.967139,test,Regresion lineal
1,0.113824,0.033014,0.181697,0.968166,train,Regresion lineal


In [48]:
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal

Las metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.08905067,0.01707911,0.1306871,0.983276,test,Decision Tree
1,2.055511e-17,2.3277780000000002e-32,1.525706e-16,1.0,train,Decision Tree


Los principales hiperparametros del modelo son: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
el mejor arbol es DecisionTreeRegressor(max_depth=7, max_features=7, min_samples_split=5)
Las nuevas metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.159278,0.044477,0.210897,0.956448,test,Decision Tree II
1,0.157819,0.0443,0.210475,0.957283,train,Decision Tree II


In [49]:
# Reparto de datos en train y test
# ==============================================================================
# lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X
X = df.drop('price', axis =1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

In [50]:
# Generación del GridSearch
# ==============================================================================

# definimos un diccionario con los hiperparámetros que queremos testear. 
param = {"max_depth": [4, 5, 6, 7,11],
        "min_samples_split": [10, 21,50, 100],
        "max_features": [1,2,3,4,5,6,9]}

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            verbose=3,
            return_train_score = True,
            scoring="neg_mean_squared_error")


In [51]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

In [52]:
# ajustamos el modelo de nuevo

%time
gs.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 10 folds for each of 140 candidates, totalling 1400 fits
[CV 1/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.266, test=-0.263) total time=   0.0s
[CV 2/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.289, test=-0.290) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.283, test=-0.287) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.945, test=-0.973) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.220, test=-0.216) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.906, test=-0.911) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.833, test=-0.851) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=1, min_samples_split=10

In [53]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [54]:
# sacamos cual es el mejor de todos los que hemos testeado usando el método best_estimator_

best_tree = gs.best_estimator_
best_tree

In [55]:
y_pred_test_dt2 = best_tree.predict(X_test)
y_pred_train_dt2 = best_tree.predict(X_train)


In [56]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")

In [57]:
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.117117,0.025704,0.160325,0.974831,test,Decision tree II
1,0.107479,0.02082,0.144293,0.979923,train,Decision tree II


In [58]:
df_test

Unnamed: 0,id,carat,depth,table,x,y,z,cut_map,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.609375,-1.000000,0.333333,-0.707182,-0.683333,-0.741071,4,0,0,...,1,0,0,0,1,0,0,0,0,0
1,1,0.828125,0.714286,1.000000,0.602210,0.566667,0.651786,3,0,0,...,1,0,0,0,1,0,0,0,0,0
2,2,1.484375,0.071429,0.666667,1.016575,1.044444,1.044643,3,1,0,...,0,0,0,0,1,0,0,0,0,0
3,3,0.062500,-0.928571,-0.333333,0.127072,0.100000,0.053571,3,1,0,...,0,0,0,0,0,1,0,0,0,0
4,4,1.234375,2.071429,-0.666667,0.856354,0.794444,1.017857,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.609375,-1.642857,1.000000,0.569061,0.544444,0.419643,3,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,13481,0.296875,0.142857,1.000000,0.237569,0.266667,0.267857,2,1,0,...,0,0,0,0,1,0,0,0,0,0
13482,13482,-0.640625,0.142857,-1.233333,-0.779006,-0.777778,-0.758929,4,0,0,...,0,0,0,0,0,0,0,1,0,0
13483,13483,0.843750,-1.642857,0.666667,0.718232,0.761111,0.598214,4,0,0,...,1,0,0,0,1,0,0,0,0,0


In [59]:
df

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,-0.625000,0.400000,0.333333,-0.754098,-0.780220,-0.750000,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,0.484375,0.600000,-0.333333,0.398907,0.417582,0.464286,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.031250,0.000000,0.666667,0.010929,0.021978,0.017857,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0.593750,0.933333,0.000000,0.464481,0.439560,0.535714,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.531250,0.333333,0.666667,-0.650273,-0.631868,-0.625000,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,-0.437500,0.200000,0.666667,-0.497268,-0.483516,-0.482143,6.551,3,1,...,0,0,0,0,0,1,0,0,0,0
40451,40451,-0.265625,0.133333,0.333333,-0.262295,-0.285714,-0.267857,7.382,3,0,...,0,0,0,0,0,0,0,1,0,0
40452,40452,0.156250,0.666667,0.333333,0.092896,0.109890,0.151786,7.768,1,0,...,0,0,0,0,0,1,0,0,0,0
40453,40453,0.484375,-0.200000,0.000000,0.387978,0.428571,0.392857,8.726,2,0,...,0,0,0,0,0,0,0,1,0,0


In [60]:
X_reality_submision = best_tree.predict(df_test)
X_reality_submision

array([6.2023    , 8.62548   , 9.1669331 , ..., 6.2889    , 8.51030895,
       7.80408571])

In [61]:
submision1 = pd.DataFrame(X_reality_submision)
submision1.reset_index(inplace=True)
submision1.rename(columns={"index":"id",0:"price"}, inplace =True)
submision1

Unnamed: 0,id,price
0,0,6.202300
1,1,8.625480
2,2,9.166933
3,3,7.783000
4,4,9.057833
...,...,...
13480,13480,8.510309
13481,13481,8.131351
13482,13482,6.288900
13483,13483,8.510309


In [63]:
submision1.to_csv("../data/submission2.csv", index= False)

In [39]:
pd.read_csv("../data/submission1.csv")

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.30,62.4,58.0,4.31,4.28,2.68,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1.01,62.7,56.0,6.42,6.46,4.04,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.72,61.8,59.0,5.71,5.74,3.54,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,1.08,63.2,57.0,6.54,6.50,4.12,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0.36,62.3,59.0,4.50,4.55,2.82,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,62.1,59.0,4.78,4.82,2.98,6.551,3,1,...,0,0,0,0,0,1,0,0,0,0
40451,40451,0.53,62.0,58.0,5.21,5.18,3.22,7.382,3,0,...,0,0,0,0,0,0,0,1,0,0
40452,40452,0.80,62.8,58.0,5.86,5.90,3.69,7.768,1,0,...,0,0,0,0,0,1,0,0,0,0
40453,40453,1.01,61.5,57.0,6.40,6.48,3.96,8.726,2,0,...,0,0,0,0,0,0,0,1,0,0
