# Rolex Listing Price Prediction based on model and complications

In [3]:
import pandas as pd
import numpy as np
import glob
import janitor
import altair as alt
import matplotlib as plt
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [4]:
# def warn(*args, **kwargs):
#     pass
import warnings
# warnings.warn = warn

# Specifically suppress the UserWarnings related to unknown categories in OneHotEncoder
warnings.filterwarnings("ignore")

## Data Cleaning

In [5]:
files = glob.glob('data/result_df/*.csv')
dirty_df = pd.concat((pd.read_csv(file, index_col=0)
                for file in files)
              )

dirty_df = dirty_df.clean_names()
dirty_df.drop_duplicates(subset=['listing_code', 'reference_number'], inplace=True)
dirty_df.dropna(subset=['brand', 'model', 'listing_code', 'price', 'title', 'subtitle', 'case_diameter'], inplace=True)
dirty_df.reset_index(drop=True, inplace=True)


dirty_df.head()

Unnamed: 0,listing_code,brand,model,reference_number,movement,case_material,bracelet_material,year_of_production,condition,scope_of_delivery,...,thickness,lug_width,buckle_width,frequency,bracelet_thickness,submariner_kermit_ref_,day_date_ref_,datejust_reference_number,submariner_date_reference,reference
0,IJD7R3,Rolex,Datejust 41,126331 NEW UNWORN 2023 Wimbledon 41mm Jubilee,Automatic,Gold/Steel,Gold/Steel,2023,"New\n(Brand new, without any signs of wear)","Original box, original papers",...,,,,,,,,,,
1,HOAOQ5,Rolex,Datejust 31,278271,Automatic,Gold/Steel,Gold/Steel,2023,"New\n(Brand new, without any signs of wear)","Original box, original papers",...,,,,,,,,,,
2,IJ9RY8,Rolex,Datejust 36,126231,Automatic,Gold/Steel,Gold/Steel,2023,"New\n(Brand new, without any signs of wear)","Original box, original papers",...,,,,,,,,,,
3,FDHJM3,Rolex,GMT-Master II,126710BLNR,Automatic,Steel,Steel,2023,"New\n(Brand new, without any signs of wear)","Original box, original papers",...,,,,,,,,,,
4,FFF9D3,Rolex,Explorer,124270,Automatic,Steel,Steel,2021,Very good\n(Worn with little to no signs of wear),"Original box, original papers",...,,,,,,,,,,


In [6]:
# clean case_diameter
def is_convertible_to_int(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

convertible_mask = dirty_df['case_diameter'].str[:2].apply(is_convertible_to_int)

dirty_df = dirty_df[convertible_mask]

dirty_df['case_diameter'] = dirty_df['case_diameter'].str[:2].astype('int')


In [7]:
# add column of whether the price is negotiable
dirty_df.insert(loc=13, column='is_negotiable', value=dirty_df['price'].str.contains('Negotiable', case=False).astype(int))

In [8]:
# keep only CA$ in the `price` column
dirty_df['price'] = dirty_df['price'].str.extract('C\$([0-9,]+)')[0].str.replace(',', '')
dirty_df['price'] = pd.to_numeric(dirty_df['price'], errors='coerce')
dirty_df['price'].fillna(0, inplace=True)
dirty_df['price'] = dirty_df['price'].astype(int)

dirty_df = dirty_df.query('price != 0')

In [9]:
# add column of whether the year of production is approximated
dirty_df.insert(loc=8, column='year_is_approximated', value=dirty_df['year_of_production'].str.contains('Approximation', case=False).astype(int))

# Clean year of production
dirty_df['year_of_production'] = dirty_df['year_of_production'].apply(lambda x: x[:4] if x != 'Unknown' else x)
dirty_df['year_of_production'] = dirty_df['year_of_production'].replace('Unknown', np.nan)
dirty_df['year_of_production'] = pd.to_numeric(dirty_df['year_of_production'], errors='coerce')

In [10]:
# convert scope of delivery to string
# dirty_df['scope_of_delivery'] = dirty_df['scope_of_delivery'].astype('category')

In [11]:
# simplify the location to country only
dirty_df['country'] = dirty_df['location'].str.split(',').str[0]

Save the cleaned data locally

In [12]:
rolex_df = dirty_df
rolex_df.to_csv('data/rolex_df.csv')

## EDA

In [13]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

In [14]:
display(rolex_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 62495 entries, 0 to 66279
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   listing_code                          62495 non-null  object 
 1   brand                                 62495 non-null  object 
 2   model                                 62495 non-null  object 
 3   reference_number                      61846 non-null  object 
 4   movement                              61443 non-null  object 
 5   case_material                         60862 non-null  object 
 6   bracelet_material                     56783 non-null  object 
 7   year_of_production                    46712 non-null  float64
 8   year_is_approximated                  62495 non-null  int32  
 9   condition                             61537 non-null  object 
 10  scope_of_delivery                     62495 non-null  object 
 11  gender              

None

In [15]:
df = rolex_df[['model', 'movement', 'case_material', 'bracelet_material',
               'year_of_production', 'year_is_approximated', 'condition', 'scope_of_delivery',
               'country', 'availability', 'case_diameter', 'bezel_material',
               'crystal', 'dial', 'bracelet_color', 'clasp', 'clasp_material',
               'rating', 'reviews', 'price', 'is_negotiable']]
df.head(1)

Unnamed: 0,model,movement,case_material,bracelet_material,year_of_production,year_is_approximated,condition,scope_of_delivery,country,availability,...,bezel_material,crystal,dial,bracelet_color,clasp,clasp_material,rating,reviews,price,is_negotiable
0,Datejust 41,Automatic,Gold/Steel,Gold/Steel,2023.0,0,"New\n(Brand new, without any signs of wear)","Original box, original papers",United States of America,Item is in stock,...,Rose gold,Sapphire crystal,Silver,Gold/Steel,Fold clasp,Gold/Steel,4.2,11,23421,1


In [16]:
df.shape

(62495, 21)

We will use only the following columns since they have fewer missing values and have more variation even for the same model. Features that are unrelated to the watch model is especially interesting, such as `condition` and `scope_of_delivery`, as they provide insights on how these factor in to the listing price.

In [17]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
print(train_df.shape)
print(test_df.shape)

(49996, 21)
(12499, 21)


In [18]:
X_train, y_train = train_df.drop(
    columns=["price"]), train_df["price"]
y_train = pd.DataFrame(y_train)
X_test, y_test = test_df.drop(
    columns=["price"]), test_df["price"]
y_test = pd.DataFrame(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(49996, 20)
(49996, 1)
(12499, 20)
(12499, 1)


In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49996 entries, 749 to 56465
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   model                 49996 non-null  object 
 1   movement              49159 non-null  object 
 2   case_material         48678 non-null  object 
 3   bracelet_material     45462 non-null  object 
 4   year_of_production    37355 non-null  float64
 5   year_is_approximated  49996 non-null  int32  
 6   condition             49234 non-null  object 
 7   scope_of_delivery     49996 non-null  object 
 8   country               49996 non-null  object 
 9   availability          49996 non-null  object 
 10  case_diameter         49996 non-null  int32  
 11  bezel_material        36838 non-null  object 
 12  crystal               40790 non-null  object 
 13  dial                  46103 non-null  object 
 14  bracelet_color        38567 non-null  object 
 15  clasp                 

In [None]:
plot_columns = X_train.columns.to_list()

for column in plot_columns:
    top_categories = X_train[column].value_counts().head(10).index
    filtered_X_train = X_train[X_train[column].isin(top_categories)]

    chart = alt.Chart(filtered_X_train).mark_bar().encode(
        y=alt.Y(f"{column}:N", sort='-x'),
        x=alt.X('count()', title='Count')
    ).properties(
        title=f"Top 10 Categories in {column}"
    )
    
    chart.display()

In [None]:
y_train.describe(percentiles=[.25, .5, .75, 0.975]).apply(lambda s: s.apply('{0:.0f}'.format))

Unnamed: 0,price
count,49996
mean,31688
std,43378
min,88
25%,13116
50%,20680
75%,33713
97.5%,118977
max,1506426


In [None]:
alt.Chart(y_train.query('price <= 120000'),
          title='Histogram of Rolex price').mark_bar().encode(
    alt.X('price:Q').bin(maxbins=40),
    y='count()'
)

The above histogram is showing at least 97.5% of the price data. It is difficult to interpret the distribution with the outliers so they are disregarded for the purpose of this visualization.

In [None]:
train_df.corr(numeric_only=True).round(
    decimals=3).style.background_gradient()

Unnamed: 0,year_of_production,year_is_approximated,case_diameter,rating,reviews,price,is_negotiable
year_of_production,1.0,-0.161,0.319,0.022,0.014,0.133,-0.015
year_is_approximated,-0.161,1.0,-0.074,0.013,0.264,-0.029,0.068
case_diameter,0.319,-0.074,1.0,0.014,-0.091,0.224,0.047
rating,0.022,0.013,0.014,1.0,0.099,-0.012,0.038
reviews,0.014,0.264,-0.091,0.099,1.0,-0.063,-0.118
price,0.133,-0.029,0.224,-0.012,-0.063,1.0,0.022
is_negotiable,-0.015,0.068,0.047,0.038,-0.118,0.022,1.0


The price seems to be slightly positively correlated with case diameter, which is expected as larger models are usually equipped with more complications that drive up the price.

## Models

### Preprocessing

In [None]:
# imports
import sys, os
import time

import numpy as np
import pandas as pd
import altair as alt
from IPython.display import HTML

sys.path.append(os.path.join(os.path.abspath("."), "code"))

from IPython.display import display

# Classifiers and regressors
from sklearn.dummy import DummyClassifier, DummyRegressor

# Preprocessing and pipeline
from sklearn.impute import SimpleImputer

# train test split and cross validation
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.svm import *
from lightgbm.sklearn import *
from sklearn.model_selection import *
from xgboost import XGBRegressor

In [None]:
# adapted from 571 lecture notes
# code from lecture
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, n_jobs=-1, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" %
                       (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

One-hot encoding is applied on categorical features and scaling on numerical features. The preprocesspr with scaler is used only for distance-based models that are sensitive to units in features. It is easier to interpret the feature importance with unscaled data when using models that are robust to such data.

In [None]:
categorcial_feats = [col for col in X_train.columns if col not in ['case_diameter', 'rating', 'reviews', 'year_of_production']]
numerical_feats = ['case_diameter', 'rating', 'reviews', 'year_of_production']

categorical_pipe = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
numerical_pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='median'))

preprocessor = make_column_transformer((categorical_pipe, categorcial_feats))
preprocessor_with_scaler = make_column_transformer((categorical_pipe, categorcial_feats),
                                                    (numerical_pipe, numerical_feats))
preprocessor_with_scaler

### Model Fitting

In [None]:
# create a dictionary for storing model scores
results_dict = {}

#### Baseline - Simple Linear Regression

In [None]:
linear_reg = make_pipeline(preprocessor,
                           LinearRegression(n_jobs=-1))
results_dict["linear regression"] = mean_std_cross_val_scores(
    linear_reg, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.126 (+/- 0.088),0.106 (+/- 0.024),0.436 (+/- 0.030),0.443 (+/- 0.007)


#### Classical Linear Regression Models: Ridge and Lasso

In [None]:
ridge = make_pipeline(preprocessor,
                      Ridge())
results_dict["ridge"] = mean_std_cross_val_scores(
    ridge, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.126 (+/- 0.088),0.106 (+/- 0.024),0.436 (+/- 0.030),0.443 (+/- 0.007)
ridge,0.743 (+/- 0.024),0.085 (+/- 0.004),0.437 (+/- 0.030),0.443 (+/- 0.007)


In [None]:
# lasso = make_pipeline(preprocessor,
#                       Lasso())
# results_dict["lasso"] = mean_std_cross_val_scores(
#     lasso, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [None]:
# elasticnet = make_pipeline(preprocessor,
#                            ElasticNet())
# results_dict["elastic net"] = mean_std_cross_val_scores(
#     elasticnet, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

#### Tree-based Models

In [None]:
# dt = make_pipeline(preprocessor, DecisionTreeRegressor())
# results_dict["decision tree"] = mean_std_cross_val_scores(
#     dt, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [None]:
# rf = make_pipeline(preprocessor, RandomForestRegressor(random_state=123))
# results_dict["random forest"] = mean_std_cross_val_scores(
#     rf, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [None]:
xgboost = make_pipeline(preprocessor_with_scaler, XGBRegressor(objective='reg:gamma', random_state=123, n_jobs=-1, verbosity=0))
results_dict["xgboost"] = mean_std_cross_val_scores(
    xgboost, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.126 (+/- 0.088),0.106 (+/- 0.024),0.436 (+/- 0.030),0.443 (+/- 0.007)
ridge,0.743 (+/- 0.024),0.085 (+/- 0.004),0.437 (+/- 0.030),0.443 (+/- 0.007)
xgboost,1.230 (+/- 0.009),0.143 (+/- 0.007),0.585 (+/- 0.052),0.680 (+/- 0.008)
xgboost optimized,1.396 (+/- 0.014),0.179 (+/- 0.006),0.646 (+/- 0.072),0.845 (+/- 0.008)


In [None]:
# from sklearn.feature_selection import SelectKBest

# xgboost_feat_select = make_pipeline(preprocessor_with_scaler,
#                                     SelectKBest(k=230),
#                                     XGBRegressor(random_state=123, verbosity=0))
# results_dict["xgboost feat select"] = mean_std_cross_val_scores(
#     xgboost_feat_select, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

#### Distance-based Models

In [None]:
# %%time
# knn = make_pipeline(preprocessor_with_scaler, KNeighborsRegressor())
# results_dict["knn"] = mean_std_cross_val_scores(
#     knn, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

In [None]:
# svr = make_pipeline(preprocessor_with_scaler, LinearSVR())
# results_dict["SVR"] = mean_std_cross_val_scores(
#     svr, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results_dict).T

It appears that gradient boosted tree model is outperforming the other models, with short training time.

### Hyperparameter Optimization

In [None]:
TUNING = True

In [None]:
param_grid = {
    "xgbregressor__learning_rate": np.arange(0.1, 1, 0.05),
    "xgbregressor__max_depth": np.arange(6, 20, 1),
    "xgbregressor__max_leaves": np.arange(300, 2001, 50),
    "xgbregressor__n_estimators": np.arange(100, 2001, 50),
    # "xgbregressor__gamma": np.arange(0, 100, 0.5),
    "xgbregressor__lambda": np.arange(0, 100, 0.5),
    # "xgbregressor__alpha": np.arange(0, 100, 0.5)

}

In [None]:
if TUNING:
    random_search = RandomizedSearchCV(
    xgboost,
    param_distributions=param_grid,
    n_iter=500,
    n_jobs=-1,
    return_train_score=True,
    random_state=123
    )

    random_search.fit(X_train, y_train)



In [None]:
if TUNING:
    cv_result_df = pd.DataFrame(random_search.cv_results_)[
        [
            "mean_test_score",
            "param_xgbregressor__learning_rate",
            "param_xgbregressor__max_depth",
            "param_xgbregressor__max_leaves",
            "param_xgbregressor__n_estimators",
            # "param_xgbregressor__gamma",
            "param_xgbregressor__lambda",
            # "param_xgbregressor__alpha",
            "mean_fit_time",
            "rank_test_score",
        ]
    ].set_index("rank_test_score").sort_index().T

    cv_result_df.to_csv('model/xgboost_cv_result.csv')
    cv_result_df
else:
    cv_result_df = pd.read_csv('model/xgboost_cv_result.csv', index_col=0)

cv_result_df

rank_test_score,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
mean_test_score,0.66434,0.656683,0.654874,0.654582,0.65165,0.650637,0.650572,0.650073,0.649804,0.648693,...,-0.54216,-0.542305,-0.54237,-0.54237,-0.542381,-0.542381,-0.542381,-0.54246,-0.542467,-0.542473
param_xgbregressor__learning_rate,0.341,0.421,0.281,0.201,0.741,0.061,0.181,0.461,0.761,0.541,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
param_xgbregressor__max_depth,19.0,8.0,10.0,12.0,8.0,19.0,16.0,12.0,7.0,12.0,...,6.0,7.0,8.0,10.0,8.0,13.0,19.0,16.0,7.0,13.0
param_xgbregressor__max_leaves,500.0,1800.0,800.0,850.0,1900.0,1050.0,550.0,1050.0,1350.0,650.0,...,450.0,450.0,700.0,1850.0,1700.0,1500.0,1900.0,1450.0,2000.0,650.0
param_xgbregressor__n_estimators,1300.0,500.0,1700.0,600.0,1700.0,950.0,700.0,250.0,600.0,1550.0,...,1550.0,1150.0,900.0,900.0,850.0,850.0,850.0,400.0,350.0,300.0
param_xgbregressor__lambda,96.5,31.5,54.0,33.0,42.5,78.0,41.5,37.0,78.5,34.0,...,95.5,41.0,21.5,61.5,99.0,68.0,93.0,14.0,58.5,95.0
param_xgbregressor__alpha,1.5,0.5,1.5,0.0,1.5,2.5,2.0,2.5,1.0,3.5,...,19.0,33.5,45.5,77.0,4.5,71.0,80.5,60.5,78.5,97.0
mean_fit_time,36.110154,10.941848,28.999308,20.418665,27.076131,67.010197,28.059022,10.418494,12.43209,18.152969,...,9.395872,7.334755,5.831402,5.963985,5.626815,5.702136,5.492291,3.054985,2.822632,2.59081


In [None]:
import pickle
# Extract the best models
# `cv_results_` attribute provides a lot of information about the search
results = random_search.cv_results_
scores = results['mean_test_score']
indices = list(range(len(scores)))

# Sort the indices of the scores, in descending order of scores
sorted_indices = sorted(indices, key=lambda i: scores[i], reverse=True)

# Save the top 5 models
top_n = 5  # Number of top models to save
for rank, index in enumerate(sorted_indices[:top_n], start=1):
    model_params = results['params'][index]
    model_score = scores[index]
    model = random_search.estimator.set_params(**model_params)
    model.fit(X_train, y_train)  # Refit the model with the best parameters on the full training set
    
    # Save the model using pickle
    filename = f'model_rank_{rank}_score_{model_score:.4f}.pkl'
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Saved: {filename} - Score: {model_score:.4f}")

Saved: model_rank_1_score_0.6463.pkl - Score: 0.6463
Saved: model_rank_2_score_0.6459.pkl - Score: 0.6459
Saved: model_rank_3_score_0.6455.pkl - Score: 0.6455
Saved: model_rank_4_score_0.6415.pkl - Score: 0.6415
Saved: model_rank_5_score_0.6411.pkl - Score: 0.6411


In [None]:
import altair as alt

fig_hyperparam = alt.Chart(cv_result_df.T).mark_point(clip=True).encode(
    x = alt.X('mean_test_score').scale(domain=(0.58, 0.7)),
    y=alt.Y(alt.repeat("row"), type='quantitative')
).properties(
    width=900
).repeat(
    row=cv_result_df.T.columns.to_list()[1:-1]
)

fig_hyperparam

In [None]:
import pickle

if TUNING:

    # Save the entire pipeline model to a file using pickle
    with open('model/xgboost_opt.pkl', 'wb') as file:
        pickle.dump(random_search.best_estimator_.named_steps['xgbregressor'], file)

    # # Load the entire pipeline model from a file using pickle
    # with open('model/xgboost_opt.pkl', 'rb') as file:
    #     xgboost_test = pickle.load(file)


In [None]:
if TUNING:
    xgboost_opt = make_pipeline(random_search.best_estimator_)
else:
    with open('model/xgboost_opt.pkl', 'rb') as file:
        xgboost_opt = pickle.load(file)   
    xgboost_opt = make_pipeline(preprocessor_with_scaler, xgboost_opt)
                            
results_dict["xgboost optimized"] = mean_std_cross_val_scores(
    xgboost_opt, X_train, y_train, return_train_score=True
)
pd.DataFrame(results_dict).T



Unnamed: 0,fit_time,score_time,test_score,train_score
linear regression,1.126 (+/- 0.088),0.106 (+/- 0.024),0.436 (+/- 0.030),0.443 (+/- 0.007)
ridge,0.743 (+/- 0.024),0.085 (+/- 0.004),0.437 (+/- 0.030),0.443 (+/- 0.007)
xgboost,1.230 (+/- 0.009),0.143 (+/- 0.007),0.585 (+/- 0.052),0.680 (+/- 0.008)
xgboost optimized,19.824 (+/- 0.691),3.485 (+/- 0.271),0.664 (+/- 0.055),0.954 (+/- 0.003)


A more interpretable metric:

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
# xgboost_opt.fit(X_train, y_train)
print(f'{mean_absolute_percentage_error(y_train["price"], xgboost_opt.predict(X_train)):.3f}')

prediction = y_train.copy()
prediction['pred'] = xgboost_opt.predict(X_train)
prediction['residual'] = prediction['price'] - prediction['pred']
prediction['perc_error'] = prediction['residual'] / prediction['price']

# alt.Chart(prediction).mark_point().encode(
#     x=alt.X('price'),
#     y=alt.Y('perc_error')
# )
prediction.sort_values('perc_error').head(20)

0.064


Unnamed: 0,price,pred,residual,perc_error
50556,10452,50474.984375,-40022.984375,-3.829218
22528,2322,11154.339844,-8832.339844,-3.803764
60362,119,543.258362,-424.258362,-3.565196
649,407,1802.946533,-1395.946533,-3.429844
62023,26433,69235.328125,-42802.328125,-1.619276
6820,38208,99278.929688,-61070.929688,-1.598381
27242,52662,131113.328125,-78451.328125,-1.489714
59614,73707,181780.578125,-108073.578125,-1.466259
7281,15283,34957.632812,-19674.632812,-1.287354
20988,7371,16604.248047,-9233.248047,-1.252645


In [None]:
alt.Chart(prediction,
          title='Histogram of predicted price').mark_bar().encode(
    alt.X('pred:Q').bin(maxbins=60),
    y='count()'
)

### Prediction on Test Set

In [None]:
# Fit the optimized model
xgboost_opt.fit(X_train, y_train)

xgboost_opt.score(X_test, y_test)

0.676354307761087

In [None]:
xgboost_opt

In [None]:
test_pred = y_test.copy()
test_pred['prediction'] = xgboost_opt.predict(X_test)
test_pred['residual'] = test_pred['price'] - test_pred['prediction']
test_pred.head()

Unnamed: 0,price,prediction,residual
26441,87745,74007.976562,13737.023438
46268,16634,18416.087891,-1782.087891
18088,14459,16442.650391,-1983.650391
33935,60293,67959.023438,-7666.023438
45197,34911,35702.300781,-791.300781


In [None]:
prediction_plot = alt.Chart(test_pred, title='Actual Listing Price and Prediction').mark_point().encode(
    alt.X('price').title('Actual Price'),
    alt.Y('prediction').title('Predicted Price')
)

min_price = min(test_pred['price'].min(), test_pred['prediction'].min())
max_price = max(test_pred['price'].max(), test_pred['prediction'].max())

# Create a DataFrame for the 45-degree line
line_data = pd.DataFrame({
    'price': [min_price, max_price],
    'prediction': [min_price, max_price]
})

# Create the 45-degree line chart
line_chart = alt.Chart(line_data).mark_line(color='red').encode(
    x='price',
    y='prediction'
)

prediction_plot + line_chart

In [None]:
fig_pred = alt.Chart(test_pred).mark_point(clip=True).encode(
    x=alt.X('price').scale(domain=[0, 120000]),
    y=alt.Y('prediction').scale(domain=[0, 120000])
)
line_chart = alt.Chart(line_data).mark_line(color='red', clip=True).encode(
    x=alt.X('price').scale(domain=[0, 120000]),
    y=alt.Y('prediction').scale(domain=[0, 120000])
)

fig_pred 
(fig_pred + line_chart).configure_mark(
    opacity=0.3
)