In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
from itertools import combinations

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder

# Import libraries for Hypertuning
import optuna
# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool
from hyperopt import STATUS_OK,Trials, fmin,hp,tpe

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [106]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from skopt import gp_minimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from collections import OrderedDict
import optuna

In [107]:
train = pd.read_csv('train.csv',index_col=[0])
test = pd.read_csv('test.csv',index_col=[0])

In [108]:
train['x'] = train['x'].replace(0, 1)
test['x'] = test['x'].replace(0, 1)

In [109]:
train_rel = pd.read_csv('original_dataset.csv')
train_rel['id']=train_rel.index+(train.shape[0]+test.shape[0])
train_rel = train_rel.drop('Unnamed: 0',axis=1).set_index('id')

In [110]:

sp = pd.read_csv("sample_submission.csv")
# depth has missing values


In [112]:
    train['volume'] = train['x'] * train['y'] * train['z']
    train['density'] = train['carat'] / train['volume']
    train['table_percentage'] = (train['table'] / ((train['x'] + train['y']) / 2)) * 100
    train['depth_percentage'] = (train['depth'] / ((train['x'] + train['y']) / 2)) * 100
    train['symmetry'] = (abs(train['x'] - train['z']) + abs(train['y'] - train['z'])) / (train['x'] + train['y'] + train['z'])
    train['surface_area'] = 2 * ((train['x'] * train['y']) + (train['x'] * train['z']) + (train['y'] * train['z']))
    train['depth_to_table_ratio'] = train['depth'] / train['table']

In [113]:
    test['volume'] = test['x'] * test['y'] * test['z']
    test['density'] = test['carat'] / test['volume']
    test['table_percentage'] = (test['table'] / ((test['x'] + test['y']) / 2)) * 100
    test['depth_percentage'] = (test['depth'] / ((test['x'] + test['y']) / 2)) * 100
    test['symmetry'] = (abs(test['x'] - test['z']) + abs(test['y'] - test['z'])) / (test['x'] + test['y'] + test['z'])
    test['surface_area'] = 2 * ((test['x'] * test['y']) + (test['x'] * test['z']) + (test['y'] * test['z']))
    test['depth_to_table_ratio'] = test['depth'] / test['table']

In [114]:
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [115]:
train_all = pd.concat([train,train_rel],axis=0)
train_all = train_all.dropna()

In [116]:
cuts = ['Fair','Good','Very Good','Premium','Ideal']
colors = ['J','I','H','G','F','E','D']
clars = ['I3','I2','I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF','FL']

cut_dict = {cuts[i]:i for i in range(len(cuts))}
color_dict = {colors[i]:i for i in range(len(colors))}
clar_dict = {clars[i]:i for i in range(len(clars))}

def convert_feat(df):
    """
    A function that converts the categorical data to ordinals
    """
    df.cut = df.cut.map(cut_dict)
    df.color=df.color.map(color_dict)
    df.clarity = df.clarity.map(clar_dict)
    return df

In [117]:
train_all = convert_feat(train_all)
test = convert_feat(test)

In [120]:
train_all

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price,volume,density,table_percentage,depth_percentage,symmetry,surface_area,depth_to_table_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.52,3,4,5,62.2,58.0,7.27,7.33,4.55,13619,242.465405,0.006269,794.520548,852.054795,0.287206,239.4382,1.072414
1,2.03,2,0,3,62.0,58.0,8.06,8.12,5.05,13387,330.508360,0.006142,716.934487,766.378245,0.286387,294.3124,1.068966
2,0.70,4,3,6,61.2,57.0,5.69,5.73,3.50,2772,114.112950,0.006134,998.248687,1071.803853,0.296247,145.1474,1.073684
3,0.32,4,3,6,61.6,56.0,4.38,4.41,2.71,666,52.345818,0.006113,1274.175199,1401.592719,0.293043,86.2734,1.100000
4,1.70,3,3,5,62.6,59.0,7.65,7.61,4.77,14453,277.692705,0.006122,773.263434,820.445609,0.285572,262.0134,1.061017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193568,0.31,4,6,7,61.1,56.0,4.35,4.39,2.67,1130,50.987655,0.006080,1281.464531,1398.169336,0.297984,84.8646,1.091071
193569,0.70,3,3,7,60.3,58.0,5.75,5.77,3.47,2874,115.125925,0.006080,1006.944444,1046.875000,0.305537,146.3038,1.039655
193570,0.73,2,4,4,63.1,57.0,5.72,5.75,3.62,3036,119.061800,0.006131,993.897123,1100.261552,0.280318,148.8228,1.107018
193571,0.34,2,6,4,62.9,55.0,4.45,4.49,2.81,681,56.145205,0.006056,1230.425056,1407.158837,0.282553,90.2038,1.143636


In [121]:
test

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,volume,density,table_percentage,depth_percentage,symmetry,surface_area,depth_to_table_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
193573,0.35,4,6,5,62.3,56.0,4.51,4.54,2.82,57.740628,0.006062,1237.569061,1376.795580,0.287279,91.9928,1.112500
193574,0.77,2,4,3,62.8,56.0,5.83,5.87,3.68,125.937328,0.006114,957.264957,1073.504274,0.282185,154.5562,1.121429
193575,0.71,4,1,5,61.9,53.0,5.77,5.74,3.55,117.575290,0.006039,920.938315,1075.586447,0.292829,147.9606,1.167925
193576,0.33,4,3,7,61.6,55.0,4.44,4.42,2.73,53.575704,0.006160,1241.534989,1390.519187,0.293356,87.6252,1.120000
193577,1.20,2,1,5,62.7,56.0,6.75,6.79,4.24,194.329800,0.006175,827.178730,926.144756,0.284589,206.4842,1.119643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322618,0.72,4,6,7,62.0,56.0,5.75,5.78,3.57,118.648950,0.006068,971.379011,1075.455334,0.290728,148.7942,1.107143
322619,0.70,3,6,4,59.6,62.0,5.77,5.74,3.43,113.600914,0.006162,1077.324066,1035.621199,0.311245,145.1982,0.961290
322620,1.01,3,3,7,62.3,58.0,6.44,6.41,4.01,165.534404,0.006101,902.723735,969.649805,0.286477,185.6178,1.074138
322621,1.35,4,6,2,62.0,56.0,7.05,7.08,4.38,218.623320,0.006175,792.639774,877.565464,0.290113,223.6068,1.107143


In [73]:
train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [122]:
train_all.columns, test.columns

(Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z',
        'price', 'volume', 'density', 'table_percentage', 'depth_percentage',
        'symmetry', 'surface_area', 'depth_to_table_ratio'],
       dtype='object'),
 Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z',
        'volume', 'density', 'table_percentage', 'depth_percentage', 'symmetry',
        'surface_area', 'depth_to_table_ratio'],
       dtype='object'))

In [124]:
X_all = train_all.drop('price',axis=1)
y_all = train_all.price

In [126]:
X_all.isna().sum()

carat                   0
cut                     0
color                   0
clarity                 0
depth                   0
table                   0
x                       0
y                       0
z                       0
volume                  0
density                 0
table_percentage        0
depth_percentage        0
symmetry                0
surface_area            0
depth_to_table_ratio    0
dtype: int64

def fit_and_score(model, X, y,fit_params=None,random_state=None,
                  early_stopping=True,early_stopping_rounds=100,gb_model = 'xgb',
                  test_size=0.1,eval_size=0.1,
                 fit_verbose = False,print_es = True,print_score=True):
     if early_stopping:
        X_train, X_val, y_train,y_val = train_test_split(X_train,y_train,
                                                         test_size=eval_size,
                                                         random_state=random_state)
        eval_set = [(X_val,y_val)]
        if gb_model == 'cb':
            model.fit(X_train,y_train,
                     eval_set = eval_set,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose=fit_verbose)
            best_it = model.get_best_iteration()
        else:
            model.fit(X_train,y_train,
                     eval_set = eval_set,
                     eval_metric='rmse',
                     early_stopping_rounds=early_stopping_rounds,
                     verbose=fit_verbose)
            if gb_model == 'xgb':
                best_it = model.best_iteration
            else:
                best_it = model.best_iteration_
            print(f'Eval set RMSE stopped improving after {best_it} iterations.')
        if print_es:
            print(f'Eval set RMSE stopped improving after {best_it} iterations.')
     else:
        model.fit(X_train,y_train,verbose=False)
     y_pred = model.predict(X_test)
     score = mean_squared_error(y_test,y_pred,squared=False)
     if print_score:
        print(f'RMSE on test set:     {score:.4f}\n=====')
     return score
    

In [166]:
def cv_score(model,X,y,n_splits=5,fit_params=None,random_state=None,
             early_stopping=True,early_stopping_rounds=100,gb_model='xgb',
             fit_verbose=False,print_es=True,
             print_scores=True,print_mean_score = True):
    """
    *A function which computes k-fold CV
    *Model is a pipeline or regressor
    *Outputs CV score for each fold and
     mean CV score across folds using RMSE metric
    *Make sure to set early_stopping = True if desired,
     and set gb_model to be xgb, lgbm, or cb due to
     annoying syntax difference involving best_iteration
    """
    kf = KFold(n_splits=n_splits,shuffle=True,
               random_state=random_state)
    if early_stopping:
        scores = []
        X_train, X_val, y_train,y_val = train_test_split(X,y,test_size=0.1,
                                                         random_state=random_state)
        eval_set = [(X_val,y_val)]
        for i,(train_idx,test_idx) in enumerate(kf.split(X_train,y_train)):
            fold_X_train = X_train.iloc[train_idx,:]
            fold_X_test = X_train.iloc[test_idx,:]
            fold_y_train = y_train.iloc[train_idx]
            fold_y_test = y_train.iloc[test_idx]
            if gb_model =='cb':
                model.fit(fold_X_train,fold_y_train,
                         eval_set = eval_set,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=fit_verbose)
                best_it = model.get_best_iteration()
            else:
                 model.fit(fold_X_train,fold_y_train,
                         eval_set = eval_set,
                         eval_metric='rmse',
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=fit_verbose)
                 if gb_model == 'xgb':
                     best_it = model.best_iteration
                 else:
                     best_it = model.best_iteration_
            if print_es:
                print(f'Fold {i+1}: eval set RMSE score stopped improving after {best_it} iterations.')
            fold_y_pred = model.predict(fold_X_test)
            scores.append(mean_squared_error(fold_y_test,fold_y_pred,squared=False))
        mean_score = np.mean(scores)
        if print_scores:
            print(f'=====\nRMSE scores for all folds: {scores} \n=====')
        if print_mean_score:
            print(f'Mean RMSE score across folds: {mean_score:.4f} \n=====')
    else:
        scores = cross_val_score(model,X,y,fit_params=fit_params,cv=kf,
                                 scoring='neg_root_mean_squared_error')
        mean_score = -np.mean(scores)
    if print_scores:
            print(f'RMSE scores for all folds: {-scores} \n=====')
    if print_mean_score:
            print(f'Mean RMSE score across folds: {mean_score:.4f} \n=====')
    return mean_score

In [167]:
def tune_cv(model,X,y,params,bayes=False,random_state=None,
            n_iter=50, verbose_search=0, verbose_model = False,
            early_stopping = True, early_stopping_rounds = 100, gb_model='xgb',
           print_params=True,print_mean_score = True):
    """
    *A function for tuning hyperparameters using either
     GridSearchCV or BayesSearchCV.
    *Outputs the mean CV score of the best parameters and
     the list of best parameters.
    *model is a regressor or pipeline
    *params is a dictionary of parameter options if bayes=False
    *params is a dictionary of parameter distributions if bayes=True
    *Make sure to set early_stopping = True if desired,
     and set gb_model to be xgb, lgbm, or cb due to
     annoying syntax difference involving best_iteration
    """
    kf = KFold(n_splits=5,shuffle=True,
               random_state=random_state)
    if bayes:
        search = BayesSearchCV(model, search_spaces = params,
                               n_iter = n_iter,
                               cv=kf,verbose=verbose_search,
                               random_state=random_state,
                               scoring = 'neg_root_mean_squared_error')
    else:
        search = GridSearchCV(model,param_grid=params,
                              cv=kf, scoring='neg_root_mean_squared_error')
    if early_stopping:
        X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1,random_state=42)
        if gb_model == 'cb':
            fit_params = {'early_stopping_rounds':early_stopping_rounds,
                          'eval_set':[(X_val,y_val)],
                          'verbose':verbose_model}
        else:
            fit_params = {'early_stopping_rounds':early_stopping_rounds,
                          'eval_set':[(X_val,y_val)],
                          'eval_metric':'rmse',
                          'verbose':verbose_model}
        search.fit(X_train,y_train,**fit_params)
    else:
        search.fit(X,y)
    if print_mean_score:
        print(f'Best mean RMSE score across folds:   {-search.best_score_:.4f}')
    if print_params:
        print(f'Best params:  {search.best_params_}')    
        

In [168]:
model = LGBMRegressor(device_type='gpu',
                    n_estimators=1000)

mean_score = cv_score(model,X_all,y_all,early_stopping=True,
                      gb_model='lgbm',random_state=42)

Fold 1: eval set RMSE score stopped improving after 85 iterations.
Fold 2: eval set RMSE score stopped improving after 87 iterations.
Fold 3: eval set RMSE score stopped improving after 101 iterations.
Fold 4: eval set RMSE score stopped improving after 61 iterations.
Fold 5: eval set RMSE score stopped improving after 78 iterations.
=====
RMSE scores for all folds: [578.0122874825205, 580.2450429004989, 574.7155885039749, 592.338643814733, 569.7819561570776] 
=====
Mean RMSE score across folds: 579.0187 
=====


TypeError: bad operand type for unary -: 'list'

In [169]:
model = CatBoostRegressor(n_estimators=2000,eval_metric='RMSE',bootstrap_type='Bernoulli',
                         task_type='GPU')

mean_score = cv_score(model,X_all,y_all,early_stopping=True,
                      gb_model='cb',early_stopping_rounds=500,random_state=42)

Fold 1: eval set RMSE score stopped improving after 411 iterations.
Fold 2: eval set RMSE score stopped improving after 481 iterations.
Fold 3: eval set RMSE score stopped improving after 365 iterations.
Fold 4: eval set RMSE score stopped improving after 400 iterations.
Fold 5: eval set RMSE score stopped improving after 426 iterations.
=====
RMSE scores for all folds: [578.6910270396307, 579.9683282299878, 576.4394839989033, 588.140645508962, 567.6678789594745] 
=====
Mean RMSE score across folds: 578.1815 
=====


TypeError: bad operand type for unary -: 'list'

In [152]:
model = XGBRegressor(tree_method='gpu_hist',
                    n_estimators=1000)

mean_score = cv_score(model,X_all,y_all,early_stopping=True,
                      gb_model='xgb',random_state=42)

Fold 1: eval set RMSE score stopped improving after 21 iterations.
Fold 2: eval set RMSE score stopped improving after 18 iterations.
Fold 3: eval set RMSE score stopped improving after 17 iterations.
Fold 4: eval set RMSE score stopped improving after 18 iterations.
Fold 5: eval set RMSE score stopped improving after 17 iterations.
=====
RMSE scores for all folds: [584.8269884721167, 588.2696505945969, 586.9338738327128, 599.7638433504226, 573.6177828027759] 
=====
Mean RMSE score across folds: 586.6824 
=====


TypeError: bad operand type for unary -: 'list'

In [170]:
best_lgb_params = dict(OrderedDict([('colsample_bynode', 0.5129815662157305),
                                    ('colsample_bytree', 1.0),
                                    ('learning_rate', 0.01),
                                    ('max_depth', 10),
                                    ('min_child_samples', 37),
                                    ('min_child_weight', 7.797688984074546),
                                    ('num_leaves', 382),
                                    ('reg_lambda', 6.107460424694761),
                                    ('subsample', 1.0)]))
best_xgb_params = dict(OrderedDict([('colsample_bylevel', 0.9675443254513403),
                                    ('colsample_bynode', 0.75283924165703),
                                    ('colsample_bytree', 0.987879432406498),
                                    ('learning_rate', 0.010732870194337446),
                                    ('max_depth', 9),
                                    ('max_leaves', 138),
                                    ('min_child_weight', 0.001),
                                    ('reg_lambda', 0.9113562334738354),
                                    ('subsample', 1.0)]))
best_cb_params = {'max_depth': 8, 'subsample': 1.0}

In [171]:
def make_prediction(model,X,y,test,early_stopping = True,early_stopping_rounds = 100,gb_model = 'xgb',random_state=42,verbose=False,pred_verbose=True):
    if early_stopping:
        X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.1,
                                                           random_state=random_state)
        if gb_model =='cb':
            model.fit(X_train,y_train,verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds,
                      eval_set=[(X_val,y_val)])
            best_it = model.get_best_iteration()
            best_score = model.get_best_score()['validation']['RMSE']
        else:
            model.fit(X_train,y_train,verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds,
                      eval_set=[(X_val,y_val)],
                      eval_metric='rmse')
            if gb_model == 'xgb':
                best_it = model.best_iteration
                best_score = model.best_score
            else:
                best_it = model.best_iteration_
                best_score = model.best_score_['valid_0']['rmse']
        if pred_verbose:
            print(f'Eval set RMSE stopped improving after {best_it} iterations.')
            print(f'\nThe best eval set RMSE was {best_score:.6f}\n')
    else:
        model.fit(X,y)
    if pred_verbose:
        print('Done making prediction!')
    return model.predict(test)    

In [172]:
print('=====\nFitting and predicting with XGB:\n=====')
model = XGBRegressor(n_estimators=2000,**best_xgb_params,
                    tree_method='gpu_hist',random_state=42)
xgb_pred = make_prediction(model, X_all,y_all,test,
                           early_stopping=True)
print('=====\nFitting and predicting with LGB:\n=====')
model = LGBMRegressor(n_estimators=2000,**best_lgb_params,
                    device_type='gpu',random_state=42)
lgb_pred = make_prediction(model, X_all,y_all,test,
                           early_stopping=True,gb_model='lgb')
print('=====\nFitting and predicting with CB:\n=====')
model = CatBoostRegressor(n_estimators=5000,**best_cb_params,
                          task_type='GPU',bootstrap_type = 'Bernoulli',
                          eval_metric='RMSE',random_state=42)
cb_pred = make_prediction(model, X_all,y_all,test,
                           early_stopping=True,gb_model='cb')


=====
Fitting and predicting with XGB:
=====
Eval set RMSE stopped improving after 613 iterations.

The best eval set RMSE was 576.580012

Done making prediction!
=====
Fitting and predicting with LGB:
=====
Eval set RMSE stopped improving after 597 iterations.

The best eval set RMSE was 575.929373

Done making prediction!
=====
Fitting and predicting with CB:
=====
Eval set RMSE stopped improving after 325 iterations.

The best eval set RMSE was 583.958723

Done making prediction!


In [None]:
sp.price = (cb_pred + xgb_pred + lgb_pred)/3
a = sp.to_csv("ilkhali.csv")