In [34]:
import numpy as np
import pandas as pd
import missingno as msno
import sklearn.metrics as m
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
# cross validation
from sklearn.model_selection import cross_val_score
# grid search
from sklearn.model_selection import GridSearchCV
# outlier
from sklearn.neighbors import LocalOutlierFactor

In [35]:
# Source CSV for the diamond pricing regression dataset.
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/diamond_pricing.csv'

# Read the remote file into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head()

Unnamed: 0,Paleonium,Pressure,Price
0,17,6567,2810.280298
1,59,5253,1986.967089
2,123,9715,2083.132087
3,182,2073,2236.340285
4,133,6400,1903.323339


In [36]:
# Target is the 'Price' column; every remaining column is a feature.
y = df.loc[:, 'Price']
X = df.drop(columns=['Price'])

Remove outliers, if any are present

In [37]:
# Start from the full table in `df` rather than from `X`/`y` themselves.
# The original cell filtered `X` and `y` in place, so every re-run of the
# cell stripped additional rows; rebuilding from `df` makes it idempotent.
features_all = df.drop(columns=['Price'])
target_all = df['Price']

# Local Outlier Factor labels each row 1 (inlier) or -1 (outlier).
lop = LocalOutlierFactor(n_neighbors=20)
yhat = lop.fit_predict(features_all)

# Keep only the inliers, using one shared mask so X and y stay aligned.
inlier_mask = yhat == 1
X = features_all[inlier_mask].copy()
y = target_all[inlier_mask].copy()

In [38]:
# Baseline model: standardise the features, then fit a decision tree regressor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # Seed the tree so repeated runs (and the grid search that reuses this
    # pipeline) give the same scores; an unseeded tree may break ties between
    # equally good splits differently on each run.
    ('model', DecisionTreeRegressor(random_state=42)),
])

# 5-fold cross-validation. No `scoring` argument is given, so the estimator's
# own score method is used (R^2 for a regressor).
scores = cross_val_score(pipeline, X, y, cv=5)
actual_score = scores.mean()

# Build the labels from the scores actually returned, so the chart stays
# consistent if `cv` is changed.
fold_labels = [f'Fold {i}' for i in range(1, len(scores) + 1)]

# NOTE(review): a log y-axis cannot display non-positive scores; revisit
# `log_y=True` if a fold ever scores <= 0.
px.bar(x=fold_labels, y=scores,
       title=f'Cross Validation Score={actual_score*100:.2f}%', log_y=True)

Grid Search

In [39]:
# Search space for the tree step of the pipeline. The 'model__' prefix routes
# each setting to the step named 'model'.
# 6 depths x 3 criteria x 2 splitters = 36 candidates.
params = dict(
    model__max_depth=[2, 5, 10, 25, 100, 500],
    model__criterion=['squared_error', 'friedman_mse', 'absolute_error'],
    model__splitter=['best', 'random'],
)

# Exhaustive search with 5-fold CV per candidate, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [40]:
# Show the parameter combination the grid search ranked best.
grid.best_params_

{'model__criterion': 'absolute_error',
 'model__max_depth': 25,
 'model__splitter': 'best'}

In [41]:
# visualize the grid
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.sort_values(by='rank_test_score', inplace=True)
grid_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,0.027593,0.003706,0.002611,0.000495,absolute_error,25,best,"{'model__criterion': 'absolute_error', 'model_...",0.964777,0.96242,0.975186,0.964485,0.960476,0.965469,0.005101,1
34,0.025873,0.002042,0.002135,0.000196,absolute_error,500,best,"{'model__criterion': 'absolute_error', 'model_...",0.963054,0.959657,0.975004,0.964262,0.957267,0.963849,0.006101,2
32,0.031523,0.006425,0.002438,0.000387,absolute_error,100,best,"{'model__criterion': 'absolute_error', 'model_...",0.962836,0.959334,0.974282,0.964498,0.956607,0.963512,0.006043,3
22,0.007444,0.002346,0.002205,0.00051,friedman_mse,500,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.967374,0.963933,0.974458,0.957918,0.951161,0.962969,0.007966,4
20,0.006725,0.001169,0.002637,0.000583,friedman_mse,100,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.968,0.964112,0.974635,0.958458,0.947196,0.96248,0.009281,5


In [42]:
# Store each candidate's parameter dict as text so it can be shown on hover.
grid_df['setup'] = grid_df['params'].map(str)

# Mean CV score per candidate, ordered along the x-axis by rank.
px.bar(
    grid_df,
    x='rank_test_score',
    y='mean_test_score',
    hover_data=['setup'],
)