In [1]:
# Library for loading dataset
import pandas as pd

# Libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import robust_scale

# Library for modeling
from sklearn.ensemble import HistGradientBoostingRegressor

# Library for selection
from sklearn.model_selection import train_test_split

# Library for metrics
from sklearn.metrics import r2_score

# Library for exportation
import pickle

In [2]:
# Loading data
df = pd.read_csv('clean_diamonds.csv')

In [3]:
# Drop unnecessary columns
df.drop(columns=['index', 'table', 'depth'], inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53943 entries, 0 to 53942
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53943 non-null  float64
 1   cut      53943 non-null  object 
 2   color    53943 non-null  object 
 3   clarity  53943 non-null  object 
 4   price    53943 non-null  int64  
 5   x        53943 non-null  float64
 6   y        53943 non-null  float64
 7   z        53943 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 3.3+ MB


In [5]:
# Encode objects
le = LabelEncoder()

df['cut'] = le.fit_transform(df['cut'])
df['color'] = le.fit_transform(df['color'])
df['clarity'] = le.fit_transform(df['clarity'])

features = ['carat','cut','color','clarity','x','y','z']

# Declare X and Y variables
x = df[features]     
y = df['price']

# Train set and Test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)


# Scale X variable (x_train and x_test) with robust_scale
x_train_scale = robust_scale(x_train)
x_test_scale = robust_scale(x_test)

In [6]:
# Create the HistGradientBoostingRegressor model with the following parameters obtained from model_selection
HGBR = HistGradientBoostingRegressor(
    l2_regularization=2.208787572338781e-05,
    learning_rate=0.036087332404571744,
    loss='squared_error', max_iter=512,
    max_leaf_nodes=64, min_samples_leaf=3,
    n_iter_no_change=18, random_state=1,
    validation_fraction=None, warm_start=True)


In [7]:
# Train and test the model
HGBR.fit(x_train_scale, y_train)
y_pred = HGBR.predict(x_test_scale)
y_pred

array([  840.94836554, 14558.10991348,   926.79436059, ...,
         793.96748751,   655.44558954,   534.91581472])

In [8]:
# Shows scores (train and test)
print('Train score: ', HGBR.score(x_train_scale, y_train))
print('Test score: ', r2_score(y_test, y_pred))

Train score:  0.9898104793920454
Test score:  0.9803135594866754


In [9]:
pickle.dump(HGBR, open('model.pkl', 'wb'))