In [1]:
## fetching the dataset
import pandas as pd

df = pd.read_csv("data/gemstone.csv")

df.head() ## display the head of the dataset

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [2]:
df["volume"] = df["x"] * df["y"] * df["z"]

In [3]:
import numpy as np

df["log_price"] = np.log1p(df["price"]) ## using log transformation to normalize the data
df["log_carat"] = np.log1p(df["carat"])

In [4]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,volume,log_price,log_carat
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,242.465405,9.519295,0.924259
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,330.50836,9.502114,1.108563
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,114.11295,7.927685,0.530628
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,52.345818,6.50279,0.277632
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,277.692705,9.578726,0.993252


In [6]:
## removing unwanted columns
df.drop(["id","x","y","z","carat","price"],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,cut,color,clarity,depth,table,volume,log_price,log_carat
0,Premium,F,VS2,62.2,58.0,242.465405,9.519295,0.924259
1,Very Good,J,SI2,62.0,58.0,330.50836,9.502114,1.108563
2,Ideal,G,VS1,61.2,57.0,114.11295,7.927685,0.530628
3,Ideal,G,VS1,61.6,56.0,52.345818,6.50279,0.277632
4,Premium,G,VS2,62.6,59.0,277.692705,9.578726,0.993252


In [8]:
X = df.drop("log_price",axis=1)
y = df["log_price"]

In [9]:
numerical_columns = X.select_dtypes(exclude="O").columns
categorical_columns = X.select_dtypes(include="O").columns

In [10]:
numerical_columns

Index(['depth', 'table', 'volume', 'log_carat'], dtype='object')

In [11]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [12]:
## creating function to remove the outlier
def remove_outlier_iqr(data, column):
    """
    This function is used to remove the outlier for the dataframe and the column provided.
    arg1: DataFrame that need to be used.
    arg2: Column from that dataset from which you need to remove the outlier.
    """
    try:

        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)

        IQR = Q3 - Q1

        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)

        return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

    except Exception as e:
        print(f"Exception occured while trying to remove the outlier: {e}")

In [13]:
for col in numerical_columns:
    df = remove_outlier_iqr(df,col)

In [14]:
## checking the shape of the data
df.shape

(176106, 8)

In [23]:
## creating pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder

In [18]:
cut_cat = ["Fair","Good","Very Good","Premium","Ideal"]
color_cat = ["D","E","F","G","H","I","J"]
clarity_cat = ["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

In [24]:
num_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OrdinalEncoder(categories=[cut_cat,color_cat,clarity_cat]))
    ]
)

preprocessor = ColumnTransformer([
    ("num_pipeline",num_pipeline,numerical_columns),
    ("cat_pipeline",cat_pipeline,categorical_columns)
])

In [25]:
## Using train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [26]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [27]:
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [28]:
X_train.head()

Unnamed: 0,num_pipeline__depth,num_pipeline__table,num_pipeline__volume,num_pipeline__log_carat,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.129988,-0.641897,-0.809728,-0.861987,4.0,1.0,5.0
1,-1.777823,0.921902,0.98663,1.028434,2.0,4.0,4.0
2,0.165682,0.400636,1.964427,1.817121,3.0,4.0,3.0
3,-0.574701,-0.641897,-0.996745,-1.102865,4.0,2.0,6.0
4,0.25823,0.400636,-0.999603,-1.102865,2.0,5.0,2.0


In [30]:
y_train

11504     7.074963
95284     8.911800
184777    9.453757
5419      6.928538
45466     6.100319
            ...   
119879    7.252054
103694    9.620129
131932    8.883224
146867    6.705639
121958    6.013715
Name: log_price, Length: 135501, dtype: float64

In [31]:
## creating a func to evaluate the model
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np

def eval_model(true,predict):
    """
    This function is used to evaluate the model on specific metrics
    """
    try:
        mse = mean_squared_error(true,predict)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(true,predict)
        r_square = r2_score(true,predict)

        return mse,rmse,mae,r_square
    except Exception as e:
        print(f"Exception occured while trying to evaluate the model: {e}")

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

models = {
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "SVR":SVR()
}

param_grid = {
    "LinearRegression":{
        "fit_intercept":[True,False],
        "n_jobs":[None,-1]
    },
    "Ridge":{
        "alpha":[1.0,1.5,2.0,2.5],
        "solver":["sag","saga","lbfgs"],
        "max_iter":[None,2,5,8,10]
    },
    "Lasso":{
        "alpha":[1.0,1.5,2.0,2.5],
        "selection":["random","cyclic"],
        "max_iter":[1000,2000,3000,4000],

    },
    "ElasticNet":{
        "alpha":[1.0,1.5,2.0,2.5],
        "max_iter":[1000,2000,3000,4000],
        "selection":["random","cyclic"]
    },
    "DecisionTreeRegressor":{
        "criterion":["squared_error","absolute_error","friedman_mse","poisson"],
        "splitter":["best","random"],
        "max_depth":[None,10,20,30],
        "min_samples_split":[2,4,6,8],
        "min_samples_leaf":[1,2,3,4]

    },
    "RandomForestRegressor":{
        "n_estimators":[None,100,200],
        "criterion":["squared_error","absolute_error","friedman_mse"],
        "max_depth":[None,10,20,30],
        "min_samples_split":[2,4,6,8],
        "min_samples_leaf":[1,2,3,4]
    },
    "SVR":{
        "kernel":["linear","poly","rbf"],
        "C":[1.0,1.5,2.0,2.5],
        "epsilon":[0.1,0.2,0.3,0.4]
    }

}

best_score = -1
best_estimator = None

for model_name,model in models.items():
    ## fitting the model
    gridsearch = GridSearchCV(estimator=model,param_grid=param_grid[model_name],verbose=True,n_jobs=-1,cv=3)
    gridsearch.fit(X_train,y_train)

    ## making pred on the test dataset
    y_pred = gridsearch.predict(X_test)

    best_model_score = gridsearch.best_score_
    best_model_param = gridsearch.best_params_
    best_model_estimator = gridsearch.best_estimator_

    print(f"Model best score {best_model_score} and model param is : {best_model_param}")

    mse,rmse,mae,r_square = eval_model(y_test,y_pred)

    print(f"{model_name}_ mean_squared_error: {mse}")
    print(f"{model_name}_ root_mean_squared_error: {rmse}")
    print(f"{model_name}_ mean_absolute_error: {mae}")
    print(f"{model_name}_ r2_score: {r_square}")
    
    print("="*30)

    if best_model_score > best_score:
        best_score = best_model_score
        best_estimator = best_model_estimator


print(f"Best Model Found: {best_estimator} and best model score: {best_score}")



Fitting 3 folds for each of 4 candidates, totalling 12 fits
Model best score 0.9788992108322917 and model param is : {'fit_intercept': True, 'n_jobs': None}
LinearRegression_ mean_squared_error: 0.020900584063114444
LinearRegression_ root_mean_squared_error: 0.14457034295841745
LinearRegression_ mean_absolute_error: 0.10851569488065385
LinearRegression_ r2_score: 0.979800114794732
Fitting 3 folds for each of 60 candidates, totalling 180 fits


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/egglisten/Data Science/projects/gemstone_price_prediction/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/egglisten/Data Science/projects/gemstone_price_prediction/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/egglisten/Data Science/projects/gemstone_price_prediction/.venv/lib/python3.13/site-packages/sk

Model best score 0.9789301424672762 and model param is : {'alpha': 1.0, 'max_iter': 5, 'solver': 'saga'}
Ridge_ mean_squared_error: 0.020901424055563472
Ridge_ root_mean_squared_error: 0.14457324806326885
Ridge_ mean_absolute_error: 0.10851726547247974
Ridge_ r2_score: 0.9797993029633021
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Model best score -5.559029242115561e-06 and model param is : {'alpha': 1.0, 'max_iter': 1000, 'selection': 'random'}
Lasso_ mean_squared_error: 1.0347103138036824
Lasso_ root_mean_squared_error: 1.0172071145070125
Lasso_ mean_absolute_error: 0.8796599148600521
Lasso_ r2_score: -2.1315022849632825e-05
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Model best score 0.5717962244832473 and model param is : {'alpha': 1.0, 'max_iter': 1000, 'selection': 'random'}
ElasticNet_ mean_squared_error: 0.4437425423379648
ElasticNet_ root_mean_squared_error: 0.6661400320788151
ElasticNet_ mean_absolute_error: 0.5723053408814891
ElasticNet_ r2_



KeyboardInterrupt: 