In [2]:
import time
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.inspection import permutation_importance


### Functions

In [43]:
def ordinal_encoding(df):
    #df['cut'] = df['cut'].replace({'Fair': 1
    #                               'Good': 2,
    #                               'Very Good': 3,
    #                               'Premium': 4,
    #                               'Ideal': 5}
    
    #df['color'] = df['color'].replace({'D': 1
    #                                   'E': 2
    #                                   'F': 3
    #                                   'G': 4
    #                                   'H': 5
    #                                   'I': 6
    #                                   'J': 7}
    
    #df['clarity'] = df['clarity'].replace({'VVS1': 1
    #                                       'VVS2': 2
    #                                       'VS1':  3
    #                                       'VS2':  4
    #                                       'SI1':  5
    #                                       'SI2':  6
    #                                       'I1':   7
    #                                       'IF':   8}
    df['cut'] = df['cut'].replace({'Fair': 1,
                                   'Good': 2, 
                                   'Very Good': 3, 
                                   'Premium': 4, 
                                   'Ideal': 5})

    df['color'] = df['color'].replace({'D': 7,
                                       'E': 6,
                                       'F': 5,
                                       'G': 4,
                                       'H': 3,
                                       'I': 2,
                                       'J': 1})

    df['clarity'] = df['clarity'].replace({'I1': 1,
                                           'SI2': 2,
                                           'SI1': 3,
                                           'VS2': 4,
                                           'VS1': 5,
                                           'VVS2': 6,
                                           'VVS1': 7,
                                           'IF': 8})

    return df

In [44]:
def add_features(df):
    df['volume'] = df['x'] * df['y'] * df['z']
        
    return df

In [45]:
def scaling_log(df):
    df['carat'] = df['carat'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['cut'] = df['cut'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['color'] = df['color'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['clarity'] = df['clarity'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['depth'] = df['depth'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['table'] = df['table'].apply(lambda x: np.log(x) if x != 0 else 0)
    df['volume'] = df['volume'].apply(lambda x: np.log(x) if x != 0 else 0)
    
    return df

In [46]:
def X_y_split(df, features, target):
    X = df[features]
    if (target != ''):
        y = df[target]
    else:
        y = None
    return X, y

In [47]:
def scaling(X):
    scaler = RobustScaler()
    #scaler = StandardScaler()
    #scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=cols_features)
    
    return X

In [48]:
def test_predict(df_test, m):
    df_pred = pd.DataFrame(m.predict(df_test), columns=['price']).reset_index().rename(columns={'index': 'id'})
    return df_pred

In [49]:
def save_csv(df, test_id):
    df.to_csv(f'../data/results/diamonds_results_{test_id}.csv', index=False)

### Analysis

In [50]:
pd.set_option('display.max_columns', None)

In [51]:
# load df train
diamonds_train = pd.read_csv('../data/diamonds_train.csv').rename(columns={'index_id': 'id'})
diamonds_train.head(2)

Unnamed: 0,id,price,carat,city,cut,color,clarity,depth,table,x,y,z
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,4268,1.21,Kimberly,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,4839,1.2,Kimberly,Premium,J,VS2,60.8,60.0,6.85,6.89,4.18


### Data clean

In [52]:
diamonds_train_clean = diamonds_train.copy()

In [53]:
# ordinal
diamonds_train_clean = ordinal_encoding(diamonds_train_clean)
diamonds_train_clean.head(2)

Unnamed: 0,id,price,carat,city,cut,color,clarity,depth,table,x,y,z
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,4268,1.21,Kimberly,4,1,4,62.4,58.0,6.83,6.79,4.25
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,4839,1.2,Kimberly,4,1,4,60.8,60.0,6.85,6.89,4.18


In [54]:
# feature engineering
diamonds_train_clean = add_features(diamonds_train_clean)
diamonds_train_clean.head(2)

Unnamed: 0,id,price,carat,city,cut,color,clarity,depth,table,x,y,z,volume
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,4268,1.21,Kimberly,4,1,4,62.4,58.0,6.83,6.79,4.25,197.096725
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,4839,1.2,Kimberly,4,1,4,60.8,60.0,6.85,6.89,4.18,197.28137


In [None]:
# log
##diamonds_train_clean = scaling_log(diamonds_train_clean)
##diamonds_train_clean.head(2)

### Features and target

In [55]:
# features and target
cols_features = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'volume']

col_target = 'price'

X, y = X_y_split(diamonds_train_clean, cols_features, col_target)

print(X.shape, y.shape)

(40455, 7) (40455,)


In [56]:
X.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,volume
0,1.21,4,1,4,62.4,58.0,197.096725
1,1.2,4,1,4,60.8,60.0,197.28137


### Scaling

In [57]:
# scaling
X = scaling(X)
X.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,volume
0,0.796875,0.0,-1.0,0.0,0.4,0.333333,0.777758
1,0.78125,0.0,-1.0,0.0,-0.666667,1.0,0.779501


### Train and hyperparameters

In [None]:
# model HistGradientBoostingRegressor

In [None]:
# robust scaling

start_time = time.time()

# model
model = HistGradientBoostingRegressor(random_state=42)
hyperparameters = model.get_params()

# train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# cross validation
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1)

elapsed_time = time.time() - start_time

print(model)
print(n_scores)
print(f'mean of scores: {np.mean(n_scores)}')
print(f'time: {elapsed_time} seconds')


In [None]:
X.columns

In [None]:
params = {'max_depth': [2, 4, 8, 16, 32],
          'max_iter': [10, 100, 1000, 10000]}

grid = GridSearchCV(estimator=model, 
                    param_grid=params,
                    cv=10,
                    scoring='neg_root_mean_squared_error')

start_time = time.time()
grid.fit(X_train, y_train)
elapsed_time = time.time() - start_time

print(f'time: {elapsed_time} seconds')
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

### Selected model

In [58]:
# robust scaling

start_time = time.time()

# model and best hyperparams
model = HistGradientBoostingRegressor(random_state=42, max_depth=8, max_iter=1000)
hyperparameters = model.get_params()

# train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# cross validation
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1)

elapsed_time = time.time() - start_time

print(model)
print(n_scores)
print(f'mean of scores: {np.mean(n_scores)}')
print(f'time: {elapsed_time} seconds')

# training
start_time = time.time()
model.fit(X, y)
elapsed_time = time.time() - start_time

print(f'time: {elapsed_time} seconds')


(32364, 7) (8091, 7) (32364,) (8091,)
HistGradientBoostingRegressor(max_depth=8, max_iter=1000, random_state=42)
[-530.98068306 -522.81793976 -563.17938749 -546.53206134 -577.34423977
 -551.45209324 -531.05429403 -544.53297304 -519.27107858 -563.49792782]
mean of scores: -545.0662678120515
time: 1.0341787338256836 seconds
time: 0.49555015563964844 seconds


In [None]:
# ordinal encoding (manual order ascending) clarity, color, cut
# RobustScaler
# model selected HistGradientBoostingRegressor
# hyperparams max_depth=8, max_iter=1000
# add volume
# remove city, x, y, z


In [None]:
X.columns

### Features importance

In [59]:
r = permutation_importance(model, 
                           X_test, 
                           y_test,
                           n_repeats=30,
                           random_state=42,
                           scoring='neg_root_mean_squared_error')

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f'{X.columns[i]:<8}'
              f'{r.importances_mean[i]:.3f}'
              f' +/- {r.importances_std[i]:.3f}')


carat   3935.615 +/- 25.347
volume  2481.133 +/- 17.219
clarity 1198.284 +/- 20.178
color   848.998 +/- 14.264
cut     48.063 +/- 3.287
depth   45.420 +/- 2.949
table   25.254 +/- 2.378


### Test

In [60]:
# load df test
diamonds_test = pd.read_csv('../data/diamonds_test.csv').rename(columns={'index_id': 'id'})
diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [61]:
diamonds_test_clean = diamonds_test.copy()
# apply clean
diamonds_test_clean = ordinal_encoding(diamonds_test_clean)
diamonds_test_clean = add_features(diamonds_test_clean)

# get final df
col_target = ''
X_diamonds_test, y_diamonds_test = X_y_split(diamonds_test_clean, cols_features, col_target)

print(X_diamonds_test.shape)

X_diamonds_test = scaling(X_diamonds_test)

(13485, 7)


In [62]:
X_diamonds_test.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,volume
0,0.140625,-0.5,0.333333,-0.5,0.533333,1.0,0.101505
1,0.78125,0.5,-1.0,0.5,-0.6,0.0,0.770379


In [63]:
# predict and save
df_result_diamonds = test_predict(X_diamonds_test, model)
save_csv(df_result_diamonds, 'HGBR_2')

In [64]:
df_result_diamonds['price'].agg(['min', 'max', 'mean'])

min       344.524846
max     18449.916147
mean     3950.444710
Name: price, dtype: float64

In [65]:
diamonds_train['price'].agg(['min', 'max', 'mean'])

min       326.000000
max     18823.000000
mean     3928.444469
Name: price, dtype: float64