In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np 
import pandas as pd

In [5]:
# Load the raw diamonds table from the CSV file at this relative path.
data = pd.read_csv("../input/diamonds/diamonds.csv")

In [6]:
def data_prep(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the diamonds table.

    The ``"Unnamed: 0"`` column is removed when present (and silently
    skipped when absent, so the function can safely be re-applied to its
    own output). A row is kept only if it satisfies every condition below:

    * ``x``, ``y`` and ``z`` are all non-zero;
    * ``45 < depth < 75``;
    * ``40 < table < 80``;
    * ``x < 30`` and ``y < 30``;
    * ``2 < z < 30``.

    Rows are selected with a boolean mask rather than dropped by index
    label, so a frame with duplicate index labels only loses the rows that
    actually violate a condition.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing at least the columns ``depth``, ``table``, ``x``,
        ``y`` and ``z``. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new, independent frame holding the surviving rows (original index
        labels preserved).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # errors="ignore": tolerate input that no longer carries the column.
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")

    nonzero_dims = (data["x"] != 0) & (data["y"] != 0) & (data["z"] != 0)
    depth_ok = (data["depth"] > 45) & (data["depth"] < 75)
    table_ok = (data["table"] > 40) & (data["table"] < 80)
    dims_ok = (
        (data["x"] < 30)
        & (data["y"] < 30)
        & (data["z"] < 30)
        & (data["z"] > 2)
    )

    # .copy() hands callers an independent frame, so later column
    # assignments do not operate on a slice of the input.
    return data[nonzero_dims & depth_ok & table_ok & dims_ok].copy()

In [7]:
# Apply the cleaning filters from data_prep. This rebinds `data` to the
# cleaned frame, so the raw table is only recoverable by re-reading the CSV.
data = data_prep(data)

In [8]:
categorical_columns = ['cut', 'color', 'clarity']

# Replace each categorical column with integer codes.
# A separate fitted encoder is kept per column in `label_encoders`, so any
# column's codes can be mapped back with
# `label_encoders[col].inverse_transform(...)`; refitting one shared encoder
# would only retain the mapping of the last column.
# NOTE(review): the codes follow LabelEncoder's own class ordering, not a
# quality ranking of the grades - confirm that is acceptable for the models.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


Build pipeline

In [9]:
# Target is the price column; every remaining column is a feature.
y = data["price"]
X = data.drop(columns=["price"])

# Hold out 25% of the rows for final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=7,
)

In [10]:
# Each candidate regressor sits behind its own StandardScaler in a Pipeline.
#
# random_state=7 (the same value used for the train/test split) pins the
# estimators that accept a seed, so fitted models and cross-validation scores
# are reproducible across kernel restarts.
#
# NOTE(review): the step names are kept unchanged because they are part of the
# pipelines' interface (named_steps / get_params keys), but several are
# copy-paste leftovers: "rf_classifier" is also used for the KNeighbors and
# XGBoost steps, and all of the estimators are regressors, not classifiers.
pipeline_lr = Pipeline([("scalar1", StandardScaler()),
                        ("lr_classifier", LinearRegression())])

pipeline_dt = Pipeline([("scalar2", StandardScaler()),
                        ("dt_classifier", DecisionTreeRegressor(random_state=7))])

pipeline_rf = Pipeline([("scalar3", StandardScaler()),
                        ("rf_classifier", RandomForestRegressor(random_state=7))])

pipeline_kn = Pipeline([("scalar4", StandardScaler()),
                        ("rf_classifier", KNeighborsRegressor())])

pipeline_xgb = Pipeline([("scalar5", StandardScaler()),
                         ("rf_classifier", XGBRegressor(random_state=7))])

pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]

# Display names keyed by position in `pipelines`; the two must stay in the
# same order.
pipeline_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",3: "KNeighbors", 4: "XGBRegressor"}

# Fit every pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

Cross-validation scoring

In [22]:
# 10-fold cross-validation of every pipeline on the training split, scored
# with negative RMSE (values closer to zero are better). The per-fold score
# arrays are collected in `cv_results_rms`, in the same order as `pipelines`.
cv_results_rms = []
for idx in range(len(pipelines)):
    fold_scores = cross_val_score(
        pipelines[idx],
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    cv_results_rms.append(fold_scores)
    print("%s: %f " % (pipeline_dict[idx], fold_scores.mean()))

LinearRegression: -1348.811824 
DecisionTree: -751.188455 
RandomForest: -547.427264 
KNeighbors: -823.649220 
XGBRegressor: -547.645088 


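RandomForest and XGBRegressor give the best, nearly identical scores on negative root mean squared error (about -547). XGBRegressor is evaluated on the test set below.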

### Model evaluation

In [12]:
# Predict prices for the held-out test features with the XGBoost pipeline
# that was fitted on the training split.
pred = pipeline_xgb.predict(X_test)

In [17]:
# Test-set error of the predictions in `pred` against the true prices.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

# One metric per line; RMSE is derived from the MSE computed above.
print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {np.sqrt(mse)}",
    sep="\n",
)

MAE: 280.1733446465832
MSE: 307728.16599314264
RMSE: 554.7325175191577
