In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle
import os

import numpy as np

df=pd.read_csv('./data/train.csv')

def limpia_catergoricas(data):

    def limpia_color(df):
        return df.apply(lambda x: abs(ord(x.upper())-ord('Z')))


    def limpia_corte(df):
        dict_cut={'Ideal':5,'Premium':4,  'Very Good':3,'Good':2,  'Fair':1, 'Poor':0}
        return df.map(dict_cut)

    def limpia_claridad(df):
        dict_clarity={'I3':0, 'I2':1, 'I1':2, 'SI2':3, 'SI1':4, 'VS2':5, 
                    'VS1':6, 'VVS2':7, 'VVS1':8, 'IF':9,'F':10}
        return df.map(dict_clarity)

    data.color=limpia_color(data.color)
    data.cut=limpia_corte(data.cut)
    data.clarity=limpia_claridad(data.clarity)
    return data
    

def limpia_dimensiones(data):
    data.columns=['id', 'carat', 'cut', 'color', 'clarity', 'depth_p', 'table_p', 'x', 'y',
       'z', 'price']
    return data[['cut', 'color', 'clarity', 'carat', 'depth_p', 'table_p','price']]

def incluye_ideal_table(data,ideal):
    data['ideal_table']=abs(data.table_p-ideal)
    return data.drop('table_p',axis=1)
df=limpia_catergoricas(df)
df=limpia_dimensiones(df)

df_size = df[df.carat - np.floor(df.carat) == 0]
df_off = df[df.carat - np.floor(df.carat) != 0]
#df=df_size
#df=df_off



In [4]:
from sklearn.model_selection import RandomizedSearchCV
max_depth = [int(x) for x in np.linspace(start = 1000, stop = 5000, num = 500)]
max_features = [6]
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 500)]
min_samples_split = [int(x) for x in np.linspace(2, 101, num = 50 )]
min_samples_leaf = [int(x) for x in np.linspace(1, 20, num = 9 )]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid

param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               }

x=df.drop('price',axis=1)
y=df.price
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print(param)

rf=RandomForestRegressor(min_impurity_decrease=0.001, warm_start=False)
rf_random = RandomizedSearchCV(estimator = rf, n_iter=300, param_distributions= param, scoring='neg_mean_squared_error', cv=5, verbose=3, n_jobs = -1)

rf_random.fit(x_train, y_train)

mse(y_test,rf_random.predict(x_test))

{'min_samples_split': [500, 510, 520, 530, 540, 551, 561, 571, 581, 592, 602, 612, 622, 632, 643, 653, 663, 673, 684, 694, 704, 714, 724, 735, 745, 755, 765, 776, 786, 796, 806, 816, 827, 837, 847, 857, 868, 878, 888, 898, 908, 919, 929, 939, 949, 960, 970, 980, 990, 1001], 'max_features': [6], 'n_estimators': [10, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125, 129, 133, 137, 141, 145, 149, 153, 157, 161, 165, 169, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, 213, 217, 221, 225, 229, 233, 237, 241, 245, 249, 253, 257, 261, 265, 269, 273, 277, 281, 285, 289, 293, 297, 301, 305, 309, 313, 317, 321, 325, 329, 333, 337, 341, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 464, 468, 472, 476, 480, 484, 488, 492, 496, 500, 504, 508, 512, 516, 520, 524, 528, 532, 536, 540, 544, 548, 552, 556, 560, 564, 568, 572, 

0.03645910999074288

In [5]:
print(mse(y_test,rf_random.predict(x_test)))
rf_random.best_estimator_



0.03645910999074288


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2643, max_features=6, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=17,
                      min_samples_split=510, min_weight_fraction_leaf=0.0,
                      n_estimators=691, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [3]:
print(mse(y_test,rf_random.predict(x_test)))
rf_random.best_estimator_


0.03663078558219718


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=4254, max_features=6, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=712, min_weight_fraction_leaf=0.0,
                      n_estimators=1250, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [3]:
print(mse(y_test,rf_random.predict(x_test)))
rf_random.best_estimator_


0.036279051823871186


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=12131, max_features=6, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=472, min_weight_fraction_leaf=0.0,
                      n_estimators=110, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [3]:
rf_random.best_estimator_


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10979, max_features=6, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=5001,
                      min_samples_split=684, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [7]:
rf_random.best_estimator_


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=72000, max_features=6, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=500, min_weight_fraction_leaf=0.0,
                      n_estimators=1919, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [8]:
mse(y_test,rf__random_mejor.predict(x_test))

0.035219735222454855