In [1]:
import pandas as pd

In [2]:
# Load the gemstone dataset. A forward slash is used as the separator because
# "\g" in the previous literal is an invalid escape sequence and a backslash
# path only resolves on Windows. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
df1 = pd.read_csv("data/gemstone.csv")

In [3]:
# Preview the first rows of the raw frame (up to 11) to check columns and value formats.
df1.head(11)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
5,5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506
6,6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229
7,7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224
8,8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886
9,9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421


In [4]:
# Remove the "id" column from the frame.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
df1 = df1.drop(columns=["id"], errors="ignore")

Segregate the input (feature) and output (target) variables

In [5]:
# Feature matrix: every column except the target.
X = df1.drop(columns="price")
# Target vector: the price column on its own.
Y = df1["price"]

In [6]:
# Partition the feature names by dtype: object-typed columns are handled as
# categorical, every other column as numerical.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [7]:
# Show which feature columns were detected as object-typed (categorical).
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
# Count how many rows fall into each distinct "cut" value.
df1["cut"].value_counts()

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64

In [9]:
# Count rows per distinct "color" value, then order the counts from smallest to largest.
df1["color"].value_counts().sort_values()

color
J     6456
I    17514
D    24286
H    30799
F    34258
E    35869
G    44391
Name: count, dtype: int64

In [10]:
# Count rows per distinct "clarity" value, then order the counts from smallest to largest.
df1["clarity"].value_counts().sort_values()

clarity
I1        512
IF       4219
VVS1    10628
VVS2    15762
SI2     30484
VS1     30669
VS2     48027
SI1     53272
Name: count, dtype: int64

In [11]:
# Explicit level orderings for the three categorical features. The position of
# a level inside its list is the integer an ordinal encoder will assign to it.
# NOTE(review): the lists look like grade rankings; confirm the intended
# direction of each ordering (in particular "color", D through J).
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = list("DEFGHIJ")
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

Categories are created

Create the preprocessing pipeline

In [12]:
from sklearn.impute import SimpleImputer #Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling (Numerical datatypes)
from sklearn.preprocessing import OrdinalEncoder # To rank the categorical variables
#Pipeline 
from sklearn.pipeline import Pipeline # To club everything together 
from sklearn.compose import ColumnTransformer # Begin the work 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Numerical branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, map each
# level to its position in the predefined category lists, then standardize the
# resulting integer codes.
# The category lists are passed positionally, so their order must match the
# order of the columns handed to this branch (cut, color, clarity).
ordinal_levels = [cut_categories, color_categories, clarity_categories]
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("OrdinalEncoder", OrdinalEncoder(categories=ordinal_levels)),
    ("scaler", StandardScaler()),
])

# Route each group of columns through its own branch; the transformer names
# below become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)

Train Test Split

In [14]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=45,
)

In [15]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to the test features so no test statistics leak into the fit.
train_array = preprocessor.fit_transform(X_train)
test_array = preprocessor.transform(X_test)

# Wrap the transformed arrays back into DataFrames labelled with the
# preprocessor's output feature names.
# NOTE(review): X_train / X_test are rebound to the transformed frames here, so
# this cell presumably cannot be re-run without first re-running the split cell.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)

In [16]:

# Inspect the first rows of the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.19541,0.72199,-0.640704,-0.049355,-0.081201,0.008907,0.873256,-0.317345,-1.315751
1,-0.173766,-0.203669,0.401246,-0.004238,0.018691,-0.005603,-0.133146,2.145303,0.017304
2,1.579392,0.72199,0.922221,1.457564,1.444428,1.518017,-0.133146,0.913979,-1.315751
3,3.267619,-0.481366,-1.161679,2.567452,2.516001,2.446701,0.873256,2.145303,0.017304
4,-0.455137,-0.111103,-0.640704,-0.34713,-0.308229,-0.324838,0.873256,1.529641,2.683414


In [17]:
# Inspect the first rows of the transformed test features.
X_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.475552,-0.759064,-1.161679,0.708616,0.7361,0.632866,0.873256,-0.317345,0.017304
1,-1.191031,-0.666498,0.922221,-1.565299,-1.552344,-1.587267,-1.139547,-1.548669,1.350359
2,-0.996236,-0.018537,-1.161679,-1.168266,-1.143693,-1.151947,-1.139547,0.913979,1.350359
3,-0.498425,-0.759064,1.443196,-0.356154,-0.389959,-0.426413,-0.133146,-0.317345,0.017304
4,0.237469,-3.443474,0.922221,0.474006,0.445504,0.139503,-0.133146,-1.548669,-0.649224


Apply the models

In [18]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [19]:
# Baseline model: ordinary least-squares regression fitted on the preprocessed
# training features. The fit call is left as the last expression so the cell
# displays the fitted estimator.
regression = LinearRegression()
regression.fit(X_train, y_train)


In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [20]:
# Fitted coefficients, one per column of X_train (in column order).
regression.coef_

array([ 6410.30028966,  -133.18235413,   -69.85943892, -1758.49512798,
        -429.51055284,   -75.79135359,    70.97707844,  -465.44129638,
         650.80643035])

In [21]:
# Fitted intercept term of the linear model.
regression.intercept_

3964.713119666067

In [22]:
import numpy as np 
def model_evaluation(true, predicted): 
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, r2_square, rmse)``: mean absolute error, mean squared
        error, R^2 score, and root mean squared error, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    # RMSE is the square root of the MSE already computed above; reuse it
    # instead of evaluating mean_squared_error a second time.
    rmse = np.sqrt(mse)
    return mae, mse, r2_square, rmse 

Training multiple models

In [23]:
# Candidate regressors, all with default hyperparameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
}

# Parallel lists: model names and their R2 scores, in evaluation order.
model_list = []
r2_list = []

# Iterate over name/estimator pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) and indexing them on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = model.predict(X_test)

    mae, mse, r2_square, rmse = model_evaluation(y_test, y_pred)
    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the label below says "training", but the metrics are
    # computed from X_test / y_test. The text is kept unchanged so it still
    # matches previously recorded output; consider relabelling.
    print("Model training performance")
    print(f"The MAE score is {mae}")
    print(f"The MSE score is {mse}")
    print(f"The R2 score is {r2_square}")
    print(f"The RMSE score is {rmse}")

    r2_list.append(r2_square)
    print("-"*35)
    print("\n")

LinearRegression
Model training performance
The MAE score is 679.7054095585776
The MSE score is 1032239.2574918634
The R2 score is 0.9371745654465842
The RMSE score is 1015.9917605432947
-----------------------------------


Lasso
Model training performance
The MAE score is 680.8409748615863
The MSE score is 1032531.9195436214
The R2 score is 0.9371567530833693
The RMSE score is 1016.1357781042951
-----------------------------------


Ridge
Model training performance
The MAE score is 679.7333205816836
The MSE score is 1032245.6702053159
The R2 score is 0.937174175148011
The RMSE score is 1015.9949164269061
-----------------------------------


