# Rolex Listing Price Prediction based on model and complications

In [2]:
import pandas as pd
import numpy as np
import glob
import janitor
import altair as alt
import matplotlib as plt
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [3]:
import warnings

# Specifically suppress the UserWarnings related to unknown categories in OneHotEncoder
warnings.filterwarnings("ignore")

## Data Cleaning

In [4]:
files = glob.glob('data/result_df/*.csv')
dirty_df = pd.concat((pd.read_csv(file, index_col=0)
                for file in files)
              )

dirty_df = dirty_df.clean_names()
dirty_df.drop_duplicates(subset=['listing_code', 'reference_number'], inplace=True)
dirty_df.dropna(subset=['brand', 'model', 'listing_code', 'price', 'title', 'subtitle', 'case_diameter'], inplace=True)
dirty_df.reset_index(drop=True, inplace=True)


dirty_df.head()

Unnamed: 0,listing_code,brand,model,reference_number,movement,case_material,bracelet_material,year_of_production,condition,scope_of_delivery,...,frequency,buckle_width,bracelet_thickness,model_reference_number,day_date_ref_,datejust_reference_number,rolex_lady_datejust_reference_number,submariner_kermit_ref_,reference,submariner_date_reference
0,JB6751,Rolex,Datejust 36,126234,Quartz,Gold/Steel,Steel,2023,"Unworn\n(Mint condition, without signs of wear)","Original box, original papers",...,,,,,,,,,,
1,IK45G9,Rolex,Day-Date 36,18239A,Automatic,White gold,White gold,Unknown,Very good\n(Worn with little to no signs of wear),"Original papers, no original box",...,,,,,,,,,,
2,IW74W1,Rolex,Daytona,16523G,Automatic,Steel,Steel,Unknown,Very good\n(Worn with little to no signs of wear),"Original papers, no original box",...,,,,,,,,,,
3,JPV699,Rolex,Submariner Date,116610LV,Automatic,Steel,Steel,Unknown,Very good\n(Worn with little to no signs of wear),"Original box, original papers",...,,,,,,,,,,
4,JG57B0,Rolex,Sea-Dweller 4000,16600,Automatic,Steel,Steel,1993,Very good\n(Worn with little to no signs of wear),"Original box, original papers",...,,,,,,,,,,


In [5]:
# clean case_diameter
def is_convertible_to_int(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

convertible_mask = dirty_df['case_diameter'].str[:2].apply(is_convertible_to_int)

dirty_df = dirty_df[convertible_mask]

dirty_df['case_diameter'] = dirty_df['case_diameter'].str[:2].astype('int')


In [6]:
# add column of whether the price is negotiable
dirty_df.insert(loc=13, column='is_negotiable', value=dirty_df['price'].str.contains('Negotiable', case=False).astype(int))

In [7]:
# keep only CA$ in the `price` column
dirty_df['price'] = dirty_df['price'].str.extract('C\$([0-9,]+)')[0].str.replace(',', '')
dirty_df['price'] = pd.to_numeric(dirty_df['price'], errors='coerce')
dirty_df['price'].fillna(0, inplace=True)
dirty_df['price'] = dirty_df['price'].astype(int)

dirty_df = dirty_df.query('price != 0')

In [8]:
# add column of whether the year of production is approximated
dirty_df.insert(loc=8, column='year_is_approximated', value=dirty_df['year_of_production'].str.contains('Approximation', case=False).astype(int))

# Clean year of production
dirty_df['year_of_production'] = dirty_df['year_of_production'].apply(lambda x: x[:4] if x != 'Unknown' else x)

In [9]:
# simplify the location to country only
dirty_df['country'] = dirty_df['location'].str.split(',').str[0]

Save the cleaned data locally

In [10]:
rolex_df = dirty_df
rolex_df.to_csv('data/rolex_df.csv')

## EDA

In [11]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

In [12]:
display(rolex_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 62495 entries, 0 to 66279
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   listing_code                          62495 non-null  object 
 1   brand                                 62495 non-null  object 
 2   model                                 62495 non-null  object 
 3   reference_number                      61846 non-null  object 
 4   movement                              61443 non-null  object 
 5   case_material                         60862 non-null  object 
 6   bracelet_material                     56783 non-null  object 
 7   year_of_production                    62495 non-null  object 
 8   year_is_approximated                  62495 non-null  int64  
 9   condition                             61537 non-null  object 
 10  scope_of_delivery                     62495 non-null  object 
 11  gender              

None

In [13]:
df = rolex_df[['model', 'movement', 'case_material', 'bracelet_material',
               'year_of_production', 'year_is_approximated', 'condition', 'scope_of_delivery',
               'country', 'availability', 'case_diameter', 'bezel_material',
               'crystal', 'dial', 'bracelet_color', 'clasp', 'clasp_material',
               'rating', 'reviews', 'price', 'is_negotiable']]
df.head(1)

Unnamed: 0,model,movement,case_material,bracelet_material,year_of_production,year_is_approximated,condition,scope_of_delivery,country,availability,...,bezel_material,crystal,dial,bracelet_color,clasp,clasp_material,rating,reviews,price,is_negotiable
0,Datejust 36,Quartz,Gold/Steel,Steel,2023,0,"Unworn\n(Mint condition, without signs of wear)","Original box, original papers",Malaysia,Item is in stock,...,White gold,Sapphire crystal,Grey,Steel,Fold clasp,Steel,,0,14959,1


In [14]:
df.shape

(62495, 21)

We will use only the following columns since they have fewer missing values and have more variation even for the same model. Features that are unrelated to the watch model is especially interesting, such as `condition` and `scope_of_delivery`, as they provide insights on how these factor in to the listing price.

In [15]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
print(train_df.shape)
print(test_df.shape)

(43746, 21)
(18749, 21)


In [16]:
X_train, y_train = train_df.drop(
    columns=["price"]), train_df["price"]
y_train = pd.DataFrame(y_train)
X_test, y_test = test_df.drop(
    columns=["price"]), test_df["price"]
y_test = pd.DataFrame(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(43746, 20)
(43746, 1)
(18749, 20)
(18749, 1)


In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43746 entries, 63223 to 56395
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   model                 43746 non-null  object 
 1   movement              43024 non-null  object 
 2   case_material         42591 non-null  object 
 3   bracelet_material     39779 non-null  object 
 4   year_of_production    43746 non-null  object 
 5   year_is_approximated  43746 non-null  int64  
 6   condition             43081 non-null  object 
 7   scope_of_delivery     43746 non-null  object 
 8   country               43746 non-null  object 
 9   availability          43746 non-null  object 
 10  case_diameter         43746 non-null  int64  
 11  bezel_material        32242 non-null  object 
 12  crystal               35680 non-null  object 
 13  dial                  40358 non-null  object 
 14  bracelet_color        33701 non-null  object 
 15  clasp               

In [18]:
plot_columns = X_train.columns.to_list()

for column in plot_columns:
    top_categories = X_train[column].value_counts().head(10).index
    filtered_X_train = X_train[X_train[column].isin(top_categories)]

    chart = alt.Chart(filtered_X_train).mark_bar().encode(
        y=alt.Y(f"{column}:N", sort='-x'),
        x=alt.X('count()', title='Count')
    ).properties(
        title=f"Top 10 Categories in {column}"
    )
    
    chart.display()

In [19]:
y_train.describe(percentiles=[.25, .5, .75, 0.975]).apply(lambda s: s.apply('{0:.0f}'.format))

Unnamed: 0,price
count,43746
mean,31600
std,42708
min,88
25%,13125
50%,20709
75%,33713
97.5%,118300
max,1506426


In [20]:
alt.Chart(y_train.query('price <= 120000'),
          title='Histogram of Rolex price').mark_bar().encode(
    alt.X('price:Q').bin(maxbins=40),
    y='count()'
)

The above histogram is showing at least 97.5% of the price data. It is difficult to interpret the distribution with the outliers so they are disregarded for the purpose of this visualization.

In [21]:
train_df.corr(numeric_only=True).round(
    decimals=3).style.background_gradient()

Unnamed: 0,year_is_approximated,case_diameter,rating,reviews,price,is_negotiable
year_is_approximated,1.0,-0.074,0.013,0.26,-0.028,0.071
case_diameter,-0.074,1.0,0.015,-0.089,0.226,0.049
rating,0.013,0.015,1.0,0.099,-0.012,0.037
reviews,0.26,-0.089,0.099,1.0,-0.065,-0.118
price,-0.028,0.226,-0.012,-0.065,1.0,0.023
is_negotiable,0.071,0.049,0.037,-0.118,0.023,1.0


The price seems to be slightly positively correlated with case diameter, which is expected as larger models are usually equipped with more complications that drive up the price.

## Models

### Preprocessing

In [22]:
# imports
import sys, os
import time

import numpy as np
import pandas as pd
import altair as alt
from IPython.display import HTML

sys.path.append(os.path.join(os.path.abspath("."), "code"))

from IPython.display import display

# Classifiers and regressors
from sklearn.dummy import DummyClassifier, DummyRegressor

# Preprocessing and pipeline
from sklearn.impute import SimpleImputer

# train test split and cross validation
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.svm import *
from lightgbm.sklearn import *
from sklearn.model_selection import *
from xgboost import XGBRegressor

In [23]:
# adapted from 571 lecture notes
# code from lecture
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, n_jobs=-1, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" %
                       (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

One-hot encoding is applied on categorical features and scaling on numerical features. The preprocesspr with scaler is used only for distance-based models that are sensitive to units in features. It is easier to interpret the feature importance with unscaled data when using models that are robust to such data.

In [24]:
categorcial_feats = [col for col in X_train.columns if col not in ['case_diameter', 'rating', 'reviews']]
numerical_feats = ['case_diameter', 'rating', 'reviews']

categorical_pipe = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
numerical_pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='median'))

preprocessor = make_column_transformer((categorical_pipe, categorcial_feats))
preprocessor_with_scaler = make_column_transformer((categorical_pipe, categorcial_feats),
                                                    (numerical_pipe, numerical_feats))
preprocessor

### Model Fitting

In [25]:
# create a dictionary for storing model scores
results_dict = {}

#### Baseline - Simple Linear Regression

In [26]:
import os
os.cpu_count()

import multiprocessing
multiprocessing.cpu_count()


32

In [27]:
linear_reg = make_pipeline(preprocessor,
                           LinearRegression(n_jobs=3))
results_dict["linear regression"] = mean_std_cross_val_scores(
    linear_reg, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.244 (+/- 0.049),0.087 (+/- 0.004),0.440 (+/- 0.022),0.457 (+/- 0.005)


#### Classical Linear Regression Models: Ridge and Lasso

In [28]:
ridge = make_pipeline(preprocessor,
                      Ridge())
results_dict["ridge"] = mean_std_cross_val_scores(
    ridge, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.244 (+/- 0.049),0.087 (+/- 0.004),0.440 (+/- 0.022),0.457 (+/- 0.005)
ridge,0.790 (+/- 0.013),0.085 (+/- 0.006),0.441 (+/- 0.022),0.457 (+/- 0.005)


In [29]:
# lasso = make_pipeline(preprocessor,
#                       Lasso())
# results_dict["lasso"] = mean_std_cross_val_scores(
#     lasso, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [30]:
# elasticnet = make_pipeline(preprocessor,
#                            ElasticNet())
# results_dict["elastic net"] = mean_std_cross_val_scores(
#     elasticnet, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

#### Tree-based Models

In [31]:
# dt = make_pipeline(preprocessor, DecisionTreeRegressor())
# results_dict["decision tree"] = mean_std_cross_val_scores(
#     dt, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [32]:
# rf = make_pipeline(preprocessor, RandomForestRegressor(random_state=123))
# results_dict["random forest"] = mean_std_cross_val_scores(
#     rf, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [33]:
os.cpu_count()

32

In [34]:
xgboost = make_pipeline(preprocessor, XGBRegressor(random_state=123, n_jobs=-1, verbosity=0))
results_dict["xgboost"] = mean_std_cross_val_scores(
    xgboost, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.244 (+/- 0.049),0.087 (+/- 0.004),0.440 (+/- 0.022),0.457 (+/- 0.005)
ridge,0.790 (+/- 0.013),0.085 (+/- 0.006),0.441 (+/- 0.022),0.457 (+/- 0.005)
xgboost,0.909 (+/- 0.073),0.118 (+/- 0.005),0.596 (+/- 0.073),0.878 (+/- 0.006)


#### Distance-based Models

In [35]:
# %%time
# knn = make_pipeline(preprocessor_with_scaler, KNeighborsRegressor())
# results_dict["knn"] = mean_std_cross_val_scores(
#     knn, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [36]:
# svr = make_pipeline(preprocessor_with_scaler, LinearSVR())
# results_dict["SVR"] = mean_std_cross_val_scores(
#     svr, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

It appears that gradient boosted tree model is outperforming the other models, with short training time.

### Hyperparameter Optimization

In [39]:
# param_grid = {
#     "lgbmregressor__num_leaves": np.arange(100, 301, 10),
#     "lgbmregressor__learning_rate": np.arange(0.0001, 0.011, 0.001),
#     "lgbmregressor__n_estimators": np.arange(100, 301, 10)
# }

param_grid = {
    "xgbregressor__learning_rate": np.arange(0.0001, 1, 0.01),
    "xgbregressor__max_depth": np.arange(6, 100, 1),
    "xgbregressor__max_leaves": np.arange(0, 1501, 100),
    "xgbregressor__reg_gamma": np.arange(0, 20, 0.5),
    "xgbregressor__reg_lambda": np.arange(0, 10, 0.5),
    "xgbregressor__reg_alpha": np.arange(0, 10, 0.5)

}

In [41]:
random_search = RandomizedSearchCV(
    xgboost,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    return_train_score=True,
    random_state=123
)

random_search.fit(X_train, y_train)



In [44]:
cv_result_df = pd.DataFrame(random_search.cv_results_)[
    [
        "mean_test_score",
        "param_xgbregressor__learning_rate",
        "param_xgbregressor__max_depth",
        "param_xgbregressor__max_leaves",
        "param_xgbregressor__reg_gamma",
        "param_xgbregressor__reg_lambda",
        "param_xgbregressor__reg_alpha",
        "mean_fit_time",
        "rank_test_score",
    ]
].set_index("rank_test_score").sort_index().T

cv_result_df.to_csv('xgboost_cv_result.csv')
cv_result_df

rank_test_score,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
mean_test_score,0.613351,0.610349,0.607352,0.6063,0.599473,0.596093,0.593441,0.589416,0.588831,0.588139,...,0.534559,0.533615,0.533379,0.526003,0.519029,0.518005,0.512624,0.491443,0.445149,0.419493
param_xgbregressor__learning_rate,0.2701,0.3101,0.7401,0.2701,0.1301,0.1001,0.9101,0.5601,0.3101,0.4101,...,0.8901,0.5301,0.9101,0.9801,0.4701,0.8501,0.7101,0.0101,0.9701,0.9401
param_xgbregressor__max_depth,53.0,9.0,9.0,36.0,77.0,67.0,36.0,65.0,34.0,58.0,...,59.0,19.0,37.0,95.0,81.0,19.0,59.0,75.0,32.0,92.0
param_xgbregressor__max_leaves,100.0,1000.0,1000.0,100.0,400.0,300.0,200.0,200.0,400.0,500.0,...,600.0,1400.0,1500.0,0.0,0.0,800.0,1500.0,1100.0,1100.0,900.0
param_xgbregressor__reg_gamma,14.5,10.0,5.0,1.0,18.0,5.5,6.5,9.0,16.0,7.5,...,2.5,17.5,14.0,8.5,16.5,11.5,12.0,7.5,11.5,12.5
param_xgbregressor__reg_lambda,9.0,5.0,6.0,3.5,1.0,0.5,6.5,3.0,4.5,5.5,...,1.0,0.5,2.0,4.5,0.5,0.5,0.0,2.0,0.0,0.0
param_xgbregressor__reg_alpha,1.0,6.5,0.5,5.5,7.0,6.5,2.5,6.5,2.0,4.0,...,7.5,8.0,4.0,1.0,0.0,4.5,3.0,2.0,5.5,3.5
mean_fit_time,2.419608,2.465455,2.553155,2.513701,5.735049,4.706646,3.752555,3.538437,5.735895,6.630708,...,7.583348,11.330504,16.218141,63.447745,126.487538,9.21786,15.96618,13.134716,12.356324,10.049223


In [65]:
random_search.best_estimator_.named_steps['xgbregressor'].save_model('model/xgboost_opt.model')
random_search.best_estimator_.named_steps['xgbregressor'].best_params_


AttributeError: 'XGBRegressor' object has no attribute 'best_params_'

In [63]:
xgboost_test = XGBRegressor()
xgboost_test.load_model('model/xgboost_opt.model')
xgboost_test

In [66]:
xgboost_opt = make_pipeline(random_search.best_estimator_)
results_dict["xgboost optimized"] = mean_std_cross_val_scores(
    xgboost_opt, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.244 (+/- 0.049),0.087 (+/- 0.004),0.440 (+/- 0.022),0.457 (+/- 0.005)
ridge,0.790 (+/- 0.013),0.085 (+/- 0.006),0.441 (+/- 0.022),0.457 (+/- 0.005)
xgboost,0.909 (+/- 0.073),0.118 (+/- 0.005),0.596 (+/- 0.073),0.878 (+/- 0.006)
xgboost optimized,1.282 (+/- 0.009),0.143 (+/- 0.005),0.613 (+/- 0.055),0.858 (+/- 0.012)


In [67]:
xgboost_opt.fit(X_train, y_train)

In [101]:
xgboost_opt.__sklearn_is_fitted__()

True

In [103]:
xgboost_opt.named_steps['pipeline'].named_steps['xgbregressor'].save_model('xgboost_opt.json')
display(xgboost_opt.named_steps['pipeline'].named_steps['xgbregressor'])
xgboost_test = XGBRegressor()
xgboost_test.load_model('xgboost_opt.json')
xgboost_test

In [106]:
random_search.best_estimator_.named_steps['xgbregressor']

In [107]:
import pickle

# Save the entire pipeline model to a file using pickle
with open('xgboost_opt.pkl', 'wb') as file:
    pickle.dump(random_search.best_estimator_.named_steps['xgbregressor'], file)

# Load the entire pipeline model from a file using pickle
with open('xgboost_opt.pkl', 'rb') as file:
    xgboost_test = pickle.load(file)


In [108]:
xgboost_test

In [109]:
xgboost_test = make_pipeline(preprocessor, xgboost_test)
results_dict["xgboost test"] = mean_std_cross_val_scores(
    xgboost_test, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.244 (+/- 0.049),0.087 (+/- 0.004),0.440 (+/- 0.022),0.457 (+/- 0.005)
ridge,0.790 (+/- 0.013),0.085 (+/- 0.006),0.441 (+/- 0.022),0.457 (+/- 0.005)
xgboost,0.909 (+/- 0.073),0.118 (+/- 0.005),0.596 (+/- 0.073),0.878 (+/- 0.006)
xgboost optimized,1.282 (+/- 0.009),0.143 (+/- 0.005),0.613 (+/- 0.055),0.858 (+/- 0.012)
xgboost test,1.250 (+/- 0.006),0.141 (+/- 0.006),0.613 (+/- 0.055),0.858 (+/- 0.012)
