In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Raw-file location of the diamonds regression dataset (CSV).
url = (
    'https://raw.githubusercontent.com/digipodium/Datasets/main'
    '/regression/diamonds.csv'
)

In [None]:
# Read the CSV into a DataFrame, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer=url, index_col=0)
df

In [None]:
# Summary statistics of the loaded frame, shown as the cell output.
df.describe()

# Handling skewness

In [None]:
# Side-by-side view of the raw target: histogram on the left, box plot on the right.
fig, ax = plt.subplots(ncols=2, figsize=(15, 5))
hist_ax, box_ax = ax
sns.histplot(data=df, x='price', ax=hist_ax)
sns.boxplot(data=df, y='price', ax=box_ax)
plt.show()

In [None]:
# Features are every column except the target.
X = df.drop(columns='price')

# Log-transform the target: log1p compresses the long right tail of the
# price distribution, reducing its skew before modelling.
y = df['price'].pipe(np.log1p)

In [None]:
# Histogram and box plot of the log-transformed target (y holds log1p(price) here).
fig, ax = plt.subplots(ncols=2, figsize=(15,5))
sns.histplot(x = y, ax=ax[0])
sns.boxplot(y = y, ax=ax[1])
# y is still a Series named 'price', so seaborn would label the axes "price";
# set the labels explicitly so the figure shows that the values are transformed.
ax[0].set_xlabel('log1p(price)')
ax[1].set_ylabel('log1p(price)')
plt.show()

In [None]:
# Start again from the raw prices and map them through a quantile transform.
y = df['price']
# random_state pins the random subsampling that QuantileTransformer performs
# during fit when the input has more rows than its `subsample` setting, so
# the fitted mapping is the same on every run.
qt = QuantileTransformer(random_state=42)
# The transformer expects a 2-D column; flatten the result back to 1-D.
y_trans = qt.fit_transform(y.values.reshape(-1,1)).flatten()

In [None]:
# Histogram and box plot of the quantile-transformed target.
fig, ax = plt.subplots(ncols=2, figsize=(15,5))
sns.histplot(x = y_trans, ax=ax[0])
sns.boxplot(y = y_trans, ax=ax[1])
# y_trans is a plain array with no name, so the axes would otherwise be unlabelled.
ax[0].set_xlabel('quantile-transformed price')
ax[1].set_ylabel('quantile-transformed price')
plt.show()

Back to model pipeline creation

In [None]:
# Rebuild the feature matrix and the raw (untransformed) target from the frame.
X, y = df.drop(columns='price'), df['price']

In [None]:
# Split the column names by dtype; the target is removed from the numeric group
# so that only feature columns remain.
cat_cols = df.select_dtypes(exclude='number').columns
num_cols = df.select_dtypes(include='number').columns.drop('price')
print(cat_cols, num_cols, sep='\n')

In [None]:
# Per-group preprocessing: ordinal codes for the non-numeric columns,
# standard scaling for the numeric ones.
cat_pipe = Pipeline(steps=[('oe', OrdinalEncoder())])
num_pipe = Pipeline(steps=[('sc', StandardScaler())])

# Route each column group to its own sub-pipeline.
transformer = ColumnTransformer(
    transformers=[
        ('categorical', cat_pipe, cat_cols),
        ('numerical', num_pipe, num_cols),
    ]
)
transformer

In [None]:
# Full pipeline: column preprocessing followed by a random-forest regressor.
# random_state fixes the forest's internal randomness so that the fitted model,
# the reported metrics and the saved artifact are reproducible between runs.
model_pipe = Pipeline([
    ('transformer', transformer),
    ('model', RandomForestRegressor(random_state=42))
])
model_pipe

In [None]:
# Split on the raw prices FIRST, so the target transformer is never fitted on
# rows that end up in the test set (fitting it on all of y leaks test-set
# information into training).
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the quantile mapping from the training target only, then apply the
# same fitted mapping to the test target. random_state makes the fit repeatable.
qt = QuantileTransformer(random_state=42)
y_train = qt.fit_transform(y_train_raw.values.reshape(-1,1)).flatten()
y_test = qt.transform(y_test_raw.values.reshape(-1,1)).flatten()

model_pipe.fit(X_train, y_train)
y_pred = model_pipe.predict(X_test)

# metrics (in the quantile-transformed target space)
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))

# The same error expressed in the original price units, which is easier to interpret.
y_pred_price = qt.inverse_transform(y_pred.reshape(-1,1)).flatten()
print('MAE (price units):', mean_absolute_error(y_test_raw, y_pred_price))

In [None]:
# Coefficient of determination on the held-out set (transformed target space).
print("score", r2_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Predict for the first ten held-out rows as a spot check.
result = model_pipe.predict(X_test.iloc[:10])

In [None]:
# Map the sample predictions back through the fitted quantile transformer
# (it expects a single 2-D column).
qt.inverse_transform(result[:, np.newaxis])

In [None]:
# Same inverse mapping for the first ten true targets, for comparison with the predictions.
qt.inverse_transform(y_test[:10, np.newaxis])

In [None]:
from joblib import dump

In [44]:
# Persist the fitted pipeline together with the target transformer that is
# needed to convert its predictions back to prices.
artifact = dict(
    model=model_pipe,
    quantile=qt,
    description='diamond price prediction',
)
dump(artifact, 'diamond_price.joblib')


['diamond_price.joblib']

- Add an evaluation of train and test error
- Run a validation-curve analysis
- Use grid search for hyperparameter tuning