In [40]:
import pandas as pd
import numpy as np
import glob
import janitor
import altair as alt
import matplotlib as plt
alt.data_transformers.enable("vegafusion")
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

In [41]:
# Load the Rolex listings from a CSV file at a path relative to the working directory.
rolex_df = pd.read_csv('data/rolex_df.csv')

In [42]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# bare; wrapping it in display() only appends a stray "None" to the cell output.
rolex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62495 entries, 0 to 62494
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   model                 62495 non-null  object 
 1   movement              61443 non-null  object 
 2   case_material         60862 non-null  object 
 3   bracelet_material     56783 non-null  object 
 4   year_of_production    46712 non-null  float64
 5   year_is_approximated  62495 non-null  int64  
 6   condition             61537 non-null  object 
 7   scope_of_delivery     62495 non-null  object 
 8   country               62495 non-null  object 
 9   availability          62495 non-null  object 
 10  case_diameter         62495 non-null  int64  
 11  bezel_material        46054 non-null  object 
 12  crystal               50999 non-null  object 
 13  dial                  57611 non-null  object 
 14  bracelet_color        48105 non-null  object 
 15  clasp              

None

We will use only the following columns, since they have fewer missing values and show more variation even within the same model. Features that are unrelated to the watch model, such as `condition` and `scope_of_delivery`, are especially interesting, as they provide insight into how these factors affect the listing price.

In [44]:
# Columns kept for the analysis.
selected_columns = [
    'model', 'movement', 'case_material', 'bracelet_material',
    'year_of_production', 'year_is_approximated', 'condition',
    'scope_of_delivery', 'country', 'availability', 'case_diameter',
    'bezel_material', 'crystal', 'dial', 'bracelet_color', 'clasp',
    'clasp_material', 'rating', 'reviews', 'price', 'is_negotiable',
]

rolex_df = rolex_df[selected_columns]
print(rolex_df.shape)
rolex_df.head()

(62495, 21)


Unnamed: 0,model,movement,case_material,bracelet_material,year_of_production,year_is_approximated,condition,scope_of_delivery,country,availability,...,bezel_material,crystal,dial,bracelet_color,clasp,clasp_material,rating,reviews,price,is_negotiable
0,Datejust 41,Automatic,Gold/Steel,Gold/Steel,2023.0,0,"New\n(Brand new, without any signs of wear)","Original box, original papers",United States of America,Item is in stock,...,Rose gold,Sapphire crystal,Silver,Gold/Steel,Fold clasp,Gold/Steel,4.2,11,23421,1
1,Datejust 31,Automatic,Gold/Steel,Gold/Steel,2023.0,0,"New\n(Brand new, without any signs of wear)","Original box, original papers",United States of America,Item is in stock,...,Rose gold,Sapphire crystal,Mother of pearl,Gold/Steel,Fold clasp,Gold/Steel,5.0,398,25556,0
2,Datejust 36,Automatic,Gold/Steel,Gold/Steel,2023.0,0,"New\n(Brand new, without any signs of wear)","Original box, original papers",United States of America,Item is in stock,...,Rose gold,Sapphire crystal,Grey,Gold/Steel,Fold clasp,Gold/Steel,5.0,398,25556,0
3,GMT-Master II,Automatic,Steel,Steel,2023.0,0,"New\n(Brand new, without any signs of wear)","Original box, original papers",United States of America,Item is in stock,...,Ceramic,Sapphire crystal,Black,Steel,Fold clasp,Steel,5.0,398,25556,0
4,Explorer,Automatic,Steel,Steel,2021.0,0,Very good\n(Worn with little to no signs of wear),"Original box, original papers",United States of America,Item is in stock,...,Steel,Sapphire crystal,Black,Steel,Fold clasp,Steel,4.9,797,11711,0


In [45]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
train_df, test_df = train_test_split(
    rolex_df,
    test_size=0.2,
    random_state=123,
)

for split_df in (train_df, test_df):
    print(split_df.shape)

(49996, 21)
(12499, 21)


In [46]:
# Separate the predictors from the target for both splits. The target is kept
# as a one-column DataFrame (not a Series).
target_column = "price"

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column].to_frame()

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column].to_frame()

for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(49996, 20)
(49996, 1)
(12499, 20)
(12499, 1)


In [27]:
# Print the per-column non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49996 entries, 698 to 52734
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   model                 49996 non-null  object 
 1   movement              49159 non-null  object 
 2   case_material         48678 non-null  object 
 3   bracelet_material     45462 non-null  object 
 4   year_of_production    37355 non-null  float64
 5   year_is_approximated  49996 non-null  int64  
 6   condition             49234 non-null  object 
 7   scope_of_delivery     49996 non-null  object 
 8   country               49996 non-null  object 
 9   availability          49996 non-null  object 
 10  case_diameter         49996 non-null  int64  
 11  bezel_material        36838 non-null  object 
 12  crystal               40790 non-null  object 
 13  dial                  46103 non-null  object 
 14  bracelet_color        38567 non-null  object 
 15  clasp                 

In [28]:
# Draw one horizontal bar chart per feature showing its ten most frequent values.
#
# The counts are computed once with pandas and only that small table (at most
# ten rows) is handed to Altair, instead of a filtered copy of the whole
# training frame for every chart.
plot_columns = X_train.columns.to_list()

for column in plot_columns:
    # value_counts() drops NaN and sorts by descending frequency, so head(10)
    # yields the ten most common values. rename_axis/reset_index(name=...) fix
    # the resulting column names regardless of the pandas version's defaults.
    top_counts = (
        X_train[column]
        .value_counts()
        .head(10)
        .rename_axis(column)
        .reset_index(name="count")
    )

    chart = alt.Chart(top_counts).mark_bar().encode(
        # Every value is treated as a category (":N"), including numeric ones.
        y=alt.Y(f"{column}:N", sort='-x'),
        x=alt.X('count:Q', title='Count')
    ).properties(
        title=f"Top 10 Categories in {column}"
    )

    chart.display()

In [29]:
# Histogram of the training-set prices, binned into at most 40 bins.
price_axis = alt.X('price:Q', bin=alt.Bin(maxbins=40))

alt.Chart(y_train, title='Histogram of Rolex price').mark_bar().encode(
    x=price_axis,
    y='count()',
)

Manually inspect whether there are any outliers that can be removed simply by clipping the data.

In [30]:
# Summary statistics of the training prices, including the 97.5th percentile,
# rendered as strings with no decimal places for readability.
price_summary = y_train.describe(percentiles=[.25, .5, .75, 0.975])
price_summary.apply(lambda stats: stats.map('{0:.0f}'.format))

Unnamed: 0,price
count,49996
mean,31724
std,43328
min,198
25%,13125
50%,20724
75%,33848
97.5%,119729
max,1506426


In [31]:
# Same histogram, restricted to rows with price at or below 120000 so the
# bulk of the distribution is visible.
prices_clipped = y_train.query('price <= 120000')

alt.Chart(prices_clipped, title='Histogram of Rolex price').mark_bar().encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=40)),
    y='count()',
)

The histogram above shows at least 97.5% of the price data. The outliers make the distribution difficult to interpret, so they are disregarded for the purpose of this visualization.  
  
The distribution of price resembles a Gamma distribution.

In [32]:
# Pairwise correlations between the numeric columns of the training set,
# rounded to three decimals and rendered with a colour gradient.
numeric_corr = train_df.corr(numeric_only=True)
numeric_corr.round(3).style.background_gradient()

Unnamed: 0,year_of_production,year_is_approximated,case_diameter,rating,reviews,price,is_negotiable
year_of_production,1.0,-0.161,0.318,0.017,0.009,0.127,-0.012
year_is_approximated,-0.161,1.0,-0.076,0.016,0.262,-0.028,0.067
case_diameter,0.318,-0.076,1.0,0.013,-0.089,0.224,0.046
rating,0.017,0.016,0.013,1.0,0.099,-0.008,0.041
reviews,0.009,0.262,-0.089,0.099,1.0,-0.059,-0.115
price,0.127,-0.028,0.224,-0.008,-0.059,1.0,0.019
is_negotiable,-0.012,0.067,0.046,0.041,-0.115,0.019,1.0


The price seems to be slightly positively correlated with case diameter, which is expected as larger models are usually equipped with more complications that drive up the price.