In [53]:
import pandas as pd
import numpy as np

# librerías de visualización
import seaborn as sns
import matplotlib.pyplot as plt

# librerías para crear el modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder  

from sklearn import tree

# para calcular las métricas
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('data/train1.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,-1.046965,3,1,4,0.47079,0.246905,-1.264705,-1.302472,-1.238584,-1.406483
1,1,0.449558,0,0,7,0.685818,-0.650074,0.61605,0.654833,0.726456,1.376777
2,2,-0.161698,0,2,5,0.040735,0.695394,-0.016811,0.008384,0.004015,0.196596
3,3,0.597103,2,3,4,1.044197,-0.201585,0.723012,0.690747,0.842047,0.578188
4,4,-0.920498,3,3,2,0.399114,0.695394,-1.095348,-1.060054,-1.0363,-1.175364


In [5]:
X = df.drop(["id",'price'], axis =1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

In [6]:
print("Datos de entrenamiento")
print("-----------------------")
print(y_train.describe())

Datos de entrenamiento
-----------------------
count    32307.000000
mean         0.003529
std          1.001881
min         -1.963135
25%         -0.914741
50%         -0.002067
75%          0.791604
max          2.024893
Name: price, dtype: float64


In [7]:
print("Datos de testeo")
print("-----------------------")
print(y_test.describe())

Datos de testeo
-----------------------
count    8077.000000
mean       -0.014115
std         0.992439
min        -1.885439
25%        -0.925559
50%        -0.011902
75%         0.764067
max         2.023910
Name: price, dtype: float64


In [8]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [11]:
max_features = np.sqrt(len(X_train.columns))
max_features

3.0

In [12]:
print(regressor.tree_.max_depth)

35


In [13]:
y_pred_test_dt = regressor.predict(X_test)
y_pred_train_dt = regressor.predict(X_train)

In [15]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [16]:
dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087351,0.016571,0.128729,0.983173,test,Decission Tree I
1,0.000144,1.2e-05,0.003494,0.999988,train,Decission Tree I


In [17]:
importancia_predictores = pd.DataFrame(
                            {'predictor': X_train.columns,
                             'importancia': regressor.feature_importances_}
                            )
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
0,carat,0.074338
1,cut,0.000956
2,color,0.01477
3,clarity,0.030566
4,depth,0.001535
5,table,0.001024
6,x,0.012297
7,y,0.862164
8,z,0.002349


# decision tree

In [29]:
max_features = np.sqrt(len(X_train.columns))
max_features

3.0

In [34]:
param = {
    "max_depth": list(range(1, 36)),
    "min_samples_split": [2, 5, 10, 20],
    "max_features": [1, 2, 3]
}

arbol2 = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            verbose=0,
            n_jobs = -1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

In [35]:
arbol2.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'max_features': [1, 2, 3],
                         'min_samples_split': [2, 5, 10, 20]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [37]:
best_tree = arbol2.best_estimator_
best_tree

DecisionTreeRegressor(max_depth=26, max_features=3, min_samples_split=20)

In [38]:
y_pred_test_dt2 = best_tree.predict(X_test)
y_pred_train_dt2 = best_tree.predict(X_train)

In [39]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")

In [40]:
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.106173,0.022674,0.150579,0.976976,test,Decision tree II
1,0.074209,0.010977,0.104773,0.989064,train,Decision tree II


## Random Forest

In [42]:
param = {    "max_depth": list(range(1, 36)),
    "min_samples_split": [2, 5, 10, 20],
    "max_features": [1, 2, 3]}

bosque = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param,
            cv=10,
            verbose=0,
            n_jobs = -1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

In [44]:
bosque.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'max_features': [1, 2, 3],
                         'min_samples_split': [2, 5, 10, 20]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [45]:
bos = bosque.best_estimator_

In [47]:
y_pred_test_rf = bos.predict(X_test)
y_pred_train_rf = bos.predict(X_train)

In [55]:
rf_results = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest")
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.073323,0.010291,0.101443,0.989551,test,Random Forest
1,0.027878,0.001528,0.039093,0.998477,train,Random Forest


In [50]:
#primer modelo
import pickle
with open('data/tercer_modelo.pkl', 'wb') as modelo:
        pickle.dump(bos, modelo)

##  Gradient

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from multiprocessing import cpu_count

In [58]:
# Define hyperparameter grid
param_grid = {
    "max_depth": [2,3,4],
    "min_samples_split": [50,100,150],
    "max_features": [1,2,3],
    "min_samples_leaf": [50,100,150]
}

# Create Gradient Boosting Regressor object
gb = GradientBoostingRegressor()

# Create GridSearchCV object with 10-fold cross-validation
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=10, n_jobs=cpu_count())

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)


GridSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_jobs=4,
             param_grid={'max_depth': [2, 3, 4], 'max_features': [1, 2, 3],
                         'min_samples_leaf': [50, 100, 150],
                         'min_samples_split': [50, 100, 150]})

In [62]:
best_g=grid_search.best_params_
print(best_g)

{'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 50, 'min_samples_split': 150}


In [63]:
y_pred_gb_test= grid_search.predict(X_test)
y_pred_gb_train= grid_search.predict(X_train)

In [65]:
rf_results = metricas(y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient")
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.084203,0.012205,0.110475,0.987607,test,Gradient
1,0.081584,0.011484,0.107163,0.988559,train,Gradient


In [66]:
with open('data/gradient_modelo.pkl', 'wb') as modelo:
        pickle.dump(grid_search, modelo)